summary | refs | log | tree | commit | diff
path: root/ggml/src/iqk/iqk_quantize.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'ggml/src/iqk/iqk_quantize.cpp')
-rw-r--r-- ggml/src/iqk/iqk_quantize.cpp 314
1 file changed, 313 insertions, 1 deletion
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 9095cda4..b38cc51f 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -1515,9 +1515,10 @@ void vec_dot_iq2_ks_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx
}
//
-// ============================================== iq3_k
+// ======================================== iq2_kl
//
namespace {
+
const int8_t iq3nl_index[111] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9,
9, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 10, 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 11, 11, 4, 4, 4, 4,
@@ -1531,6 +1532,317 @@ inline int best_index_iq3nl(const int8_t * values, float x) {
return ix < 8 ? ix : x - values[ix-8] < values[ix-7] - x ? ix-8 : ix-7;
}
+// Quantizes one row of n_per_row floats from x into the IQ2_KL layout at vy.
+//
+// Output layout, as written here: a single ggml_half row scale at the start of vy, followed
+// directly by n_per_row/QK_K block_iq2_kl super-blocks. Every super-block is split into
+// QK_K/kBlockSize blocks of kBlockSize (32) values. Values are quantized in pairs: each pair is
+// mapped to one of 32 grid points (5-bit index, low 4 bits in a nibble of qs, 5th bit in qh),
+// and every 32-value block gets a 6-bit scale stored with an offset of 32 (low 4 bits in
+// scales_l, high 2 bits in scales_h).
+//
+// Parameters:
+//   x             - input row; only whole multiples of QK_K values are processed.
+//   vy            - destination buffer (row scale + super-blocks, see above).
+//   n_per_row     - number of values in the row.
+//   quant_weights - optional per-value importance weights (n_per_row entries); may be nullptr.
+//   all_scales    - caller-provided scratch that receives one float scale per 32-value block,
+//                   i.e. at least (n_per_row/QK_K)*(QK_K/kBlockSize) floats.
+void quantize_row_iq2_kl_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, float * all_scales) {
+    constexpr int kBlockSize = 32;
+    constexpr float kSigmaFactor = 2.25f;
+    constexpr int ntry = 5;
+    // Indexed by 8*l1 + l2, where l1 and l2 are the 3-bit indices of the two values of a pair.
+    // A non-negative entry is the grid index of that pair. A negative entry -k means the pair is
+    // not on the grid and k_neighbours[k-1] lists the candidate grid points to choose from.
+    static const int k_index[64] = {-1, -2, 0, -3, -4, 1, -5, -6, 2, -7, -8, 3, -9, 4, -10, 5, -11, 6, 7, -12, 8, 9, 10, -13, 11, -14, -15, -16, 12, 13, -17,
+        14, -18, -19, 15, 16, 17, 18, 19, -20, -21, 20, 21, 22, 23, 24, -22, -23, 25, -24, 26, -25, 27, -26, 28, 29, -27, -28, 30, -29, -30, 31, -31, -32};
+    static const std::vector<std::vector<int>> k_neighbours = {
+        { 2, 0, 6, 11, 7, 3, 8, 15, },
+        { 0, 2, 3, 6, 7, 1, 8, 4, },
+        { 0, 1, 3, 4, 8, 7, 9, 6, },
+        { 1, 0, 3, 4, 8, 9, 7, 10, },
+        { 1, 4, 5, 10, 9, 3, 8, 0, },
+        { 5, 1, 4, 10, 9, 14, 8, 3, },
+        { 6, 2, 7, 0, 3, 11, 8, 15, },
+        { 3, 7, 0, 6, 8, 4, 12, 9, },
+        { 3, 4, 8, 9, 1, 7, 12, 10, },
+        { 4, 10, 5, 9, 1, 8, 13, 14, },
+        { 11, 2, 6, 7, 20, 15, 25, 21, },
+        { 8, 7, 3, 12, 9, 16, 17, 13, },
+        { 14, 5, 10, 19, 9, 13, 4, 18, },
+        { 6, 15, 7, 11, 20, 21, 16, 2, },
+        { 15, 7, 16, 6, 21, 12, 17, 22, },
+        { 12, 16, 17, 8, 15, 7, 13, 22, },
+        { 19, 10, 13, 18, 14, 9, 12, 24, },
+        { 11, 20, 25, 6, 15, 2, 21, 7, },
+        { 20, 15, 21, 6, 11, 7, 16, 26, },
+        { 14, 19, 29, 10, 28, 18, 13, 24, },
+        { 25, 11, 20, 21, 15, 6, 26, 30, },
+        { 19, 24, 28, 18, 29, 23, 13, 17, },
+        { 29, 19, 14, 28, 24, 18, 10, 13, },
+        { 20, 26, 21, 25, 30, 15, 22, 16, },
+        { 27, 26, 22, 23, 21, 30, 16, 24, },
+        { 27, 24, 28, 31, 23, 18, 22, 17, },
+        { 25, 30, 20, 26, 21, 11, 15, 22, },
+        { 30, 26, 25, 20, 21, 27, 22, 15, },
+        { 30, 27, 31, 26, 22, 23, 21, 24, },
+        { 31, 27, 30, 26, 28, 23, 22, 24, },
+        { 31, 28, 29, 27, 24, 23, 19, 18, },
+        { 29, 28, 31, 24, 19, 27, 14, 18, },
+    };
+    auto values = iq3nl_values;
+    // Build the 32-entry grid of value pairs from the on-grid entries of k_index.
+    std::pair<int8_t, int8_t> grid[32];
+    for (int j = 0; j < 64; ++j) {
+        if (int i = k_index[j]; i >= 0) {
+            int i1 = j/8, i2 = j%8;
+            grid[i] = {values[i1], values[i2]};
+        }
+    }
+
+    ggml_half * dptr = (ggml_half *)vy;
+    auto y = (block_iq2_kl *)(dptr + 1);
+
+    float weight[kBlockSize];
+
+    // Returns the grid index for the pair (x1, x2) scaled by the inverse scale id. If the
+    // independently quantized pair is not a grid point, the candidate from k_neighbours with
+    // the smallest weighted squared error is returned instead.
+    auto index = [&grid, values] (float id, float x1, float x2, float w1, float w2) {
+        float sx1 = id*x1;
+        float sx2 = id*x2;
+        int l1 = best_index_iq3nl(values, sx1);
+        int l2 = best_index_iq3nl(values, sx2);
+        int i = k_index[8*l1 + l2];
+        if (i >= 0) return i;
+        auto& neigh = k_neighbours[-i-1];
+        float best = std::numeric_limits<float>::max();
+        int ibest = -1;
+        for (auto& n : neigh) {
+            float diff1 = grid[n].first - sx1;
+            float diff2 = grid[n].second - sx2;
+            float score = w1*diff1*diff1 + w2*diff2*diff2;
+            if (score < best) {
+                best = score; ibest = n;
+            }
+        }
+        GGML_ASSERT(ibest >= 0);
+        return ibest;
+    };
+
+    float max_scale = 0, max_abs_scale = 0;
+
+    // Pass 1: find a float scale for every 32-value block and remember the scale with the
+    // largest magnitude across the whole row.
+    for (int ibl = 0; ibl < n_per_row/QK_K; ++ibl) {
+        std::memset(&y[ibl], 0, sizeof(block_iq2_kl));
+        auto scales = all_scales + ibl*(QK_K/kBlockSize);
+        auto xbl = x + ibl*QK_K;
+        // sigma2 = kSigmaFactor * mean(x^2) over the super-block; used in the weights below.
+        float sigma2 = 0;
+        for (int j = 0; j < QK_K; ++j) sigma2 += xbl[j]*xbl[j];
+        sigma2 *= kSigmaFactor/QK_K;
+        for (int ib = 0; ib < QK_K/kBlockSize; ++ib) {
+            auto xb = xbl + ib*kBlockSize;
+            if (quant_weights) {
+                auto qw = quant_weights + ibl*QK_K + ib*kBlockSize;
+                for (int j = 0; j < kBlockSize; ++j) weight[j] = qw[j]*sqrt(sigma2 + xb[j]*xb[j]);
+            } else {
+                for (int j = 0; j < kBlockSize; ++j) weight[j] = std::abs(xb[j]); //xb[j]*xb[j];
+            }
+            // amax: largest magnitude in the block; max: the signed value that attains it.
+            float amax = 0, max = 0;
+            for (int j = 0; j < kBlockSize; ++j) {
+                float ax = std::abs(xb[j]);
+                if (ax > amax) {
+                    amax = ax; max = xb[j];
+                }
+            }
+            if (!amax) {
+                scales[ib] = 0;
+                continue;
+            }
+            // ntry is a positive compile-time constant, so the first branch is always taken.
+            float d = ntry > 0 ? -max/values[0] : max/values[0];
+            float id = 1/d;
+            // Evaluate both signs of the initial scale (suffix _p for +id, _m for -id).
+            float sumqx_p = 0, sumq2_p = 0;
+            float sumqx_m = 0, sumq2_m = 0;
+            for (int j = 0; j < kBlockSize; j += 2) {
+                float w1 = weight[j+0];
+                float w2 = weight[j+1];
+                int idx = index(id, xb[j+0], xb[j+1], w1, w2);
+                float q1 = grid[idx].first ;
+                float q2 = grid[idx].second;
+                sumqx_p += w1*q1*xb[j] + w2*q2*xb[j+1];
+                sumq2_p += w1*q1*q1 + w2*q2*q2;
+                idx = index(-id, xb[j+0], xb[j+1], w1, w2);
+                q1 = grid[idx].first ;
+                q2 = grid[idx].second;
+                sumqx_m += w1*q1*xb[j] + w2*q2*xb[j+1];
+                sumq2_m += w1*q1*q1 + w2*q2*q2;
+            }
+            // Guard the division like the retry loop below does: if every weight in the block is
+            // zero (e.g. zero quant_weights entries) sumq2_p is 0 and an unguarded 0/0 would
+            // produce a NaN scale that defeats all later comparisons.
+            d = sumq2_p > 0 ? sumqx_p/sumq2_p : 0.f;
+            float best = d*sumqx_p;
+            if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) {
+                d = sumqx_m/sumq2_m; best = d*sumqx_m;
+            }
+            // Try 2*ntry+1 perturbed inverse scales, keeping the scale that maximizes
+            // sumqx^2/sumq2 (tracked in best).
+            for (int itry = -ntry; itry <= ntry; ++itry) {
+                id = (itry + values[0])/max;
+                sumqx_p = sumq2_p = 0;
+                sumqx_m = sumq2_m = 0;
+                for (int j = 0; j < kBlockSize; j += 2) {
+                    float w1 = weight[j+0];
+                    float w2 = weight[j+1];
+                    int idx = index(id, xb[j+0], xb[j+1], w1, w2);
+                    float q1 = grid[idx].first ;
+                    float q2 = grid[idx].second;
+                    sumqx_p += w1*q1*xb[j] + w2*q2*xb[j+1];
+                    sumq2_p += w1*q1*q1 + w2*q2*q2;
+                    idx = index(-id, xb[j+0], xb[j+1], w1, w2);
+                    q1 = grid[idx].first ;
+                    q2 = grid[idx].second;
+                    sumqx_m += w1*q1*xb[j] + w2*q2*xb[j+1];
+                    sumq2_m += w1*q1*q1 + w2*q2*q2;
+                }
+                if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) {
+                    d = sumqx_p/sumq2_p; best = d * sumqx_p;
+                }
+                if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) {
+                    d = sumqx_m/sumq2_m; best = d * sumqx_m;
+                }
+            }
+            scales[ib] = d;
+            float ad = std::abs(d);
+            if (ad > max_abs_scale) {
+                max_abs_scale = ad; max_scale = d;
+            }
+        }
+    }
+
+    // All block scales are zero: the super-blocks were zeroed above, only the row scale is left.
+    if (!max_abs_scale) {
+        dptr[0] = GGML_FP32_TO_FP16(0.f);
+        return;
+    }
+
+    // Row scale chosen so that the block scale with the largest magnitude maps to -32.
+    float d = -max_scale/32;
+    float id = 1/d;
+
+    // Pass 2: quantize the block scales to integers in [-32, 31], pick the best of the rounded
+    // value and its two neighbours, write the packed output, and accumulate the sums used to
+    // refit the row scale.
+    float sumqx = 0, sumq2 = 0;
+    for (int ibl = 0; ibl < n_per_row/QK_K; ++ibl) {
+        auto scales = all_scales + ibl*(QK_K/kBlockSize);
+        auto xbl = x + ibl*QK_K;
+        float sigma2 = 0;
+        for (int j = 0; j < QK_K; ++j) sigma2 += xbl[j]*xbl[j];
+        sigma2 *= kSigmaFactor/QK_K;
+        for (int ib = 0; ib < QK_K/kBlockSize; ++ib) {
+            auto xb = xbl + ib*kBlockSize;
+            if (quant_weights) {
+                auto qw = quant_weights + ibl*QK_K + ib*kBlockSize;
+                for (int j = 0; j < kBlockSize; ++j) weight[j] = qw[j]*sqrt(sigma2 + xb[j]*xb[j]);
+            } else {
+                for (int j = 0; j < kBlockSize; ++j) weight[j] = std::abs(xb[j]); //xb[j]*xb[j];
+            }
+            int ls = nearest_int(id*scales[ib]);
+            ls = std::max(-32, std::min(31, ls));
+            int lsmin = std::max(-32, ls-1);
+            int lsmax = std::min( 31, ls+1);
+            float best_score = std::numeric_limits<float>::max();
+            int best_ls = ls;
+            for (int ils = lsmin; ils <= lsmax; ++ils) {
+                float dl = d*ils;
+                float idl = dl ? 1/dl : 0.f;
+                float score = 0;
+                for (int j = 0; j < kBlockSize/2; ++j) {
+                    float w1 = weight[2*j+0];
+                    float w2 = weight[2*j+1];
+                    int idx = index(idl, xb[2*j+0], xb[2*j+1], w1, w2);
+                    float diff1 = dl*grid[idx].first - xb[2*j+0];
+                    float diff2 = dl*grid[idx].second - xb[2*j+1];
+                    score += w1*diff1*diff1 + w2*diff2*diff2;
+                }
+                if (score < best_score) {
+                    best_score = score;
+                    best_ls = ils;
+                }
+            }
+            ls = best_ls;
+            // Store the scale with an offset of 32: low 4 bits in scales_l, high 2 bits in scales_h.
+            int uls = ls + 32;
+            y[ibl].scales_l[ib%4] |= ((uls & 0xf) << 4*(ib/4));
+            y[ibl].scales_h |= ((uls >> 4) << 2*ib);
+            // A zero scale leaves the (already zeroed) quants of this block untouched.
+            if (ls == 0) continue;
+            float dl = d*ls;
+            float idl = 1/dl;
+            for (int j = 0; j < kBlockSize/2; ++j) {
+                float w1 = weight[2*j+0];
+                float w2 = weight[2*j+1];
+                int idx = index(idl, xb[2*j+0], xb[2*j+1], w1, w2);
+                // Low 4 bits of the grid index go into a nibble of qs, the 5th bit into bit ib of qh[j].
+                y[ibl].qs[16*(ib/2) + j] |= ((idx & 0xf) << 4*(ib%2));
+                y[ibl].qh[j] |= ((idx >> 4) << ib);
+                float q1 = ls*grid[idx].first ;
+                float q2 = ls*grid[idx].second;
+                sumqx += w1*q1*xb[2*j] + w2*q2*xb[2*j+1];
+                sumq2 += w1*q1*q1 + w2*q2*q2;
+            }
+        }
+    }
+    // Weighted least-squares refit of the row scale against the final integer quants.
+    if (sumq2 > 0) d = sumqx/sumq2;
+
+    // NOTE(review): the 1.025f factor is not explained in this file; presumably an empirical tweak.
+    dptr[0] = GGML_FP32_TO_FP16(1.025f * d);
+
+}
+}
+
+// Reference single-row entry point: quantizes k values from x into y by forwarding to
+// quantize_iq2_kl with one row and no importance matrix. k must be a multiple of QK_K.
+void quantize_row_iq2_kl_ref(const float * x, block_iq2_kl * y, int64_t k) {
+    assert(k % QK_K == 0);
+    constexpr int64_t kSingleRow = 1;
+    void * out = (void *)y;
+    quantize_iq2_kl(x, out, kSingleRow, k, nullptr);
+}
+
+// Type-erased variant of quantize_row_iq2_kl_ref: vy is reinterpreted as a block_iq2_kl
+// pointer and the call is forwarded unchanged. k must be a multiple of QK_K.
+void quantize_row_iq2_kl(const float * x, void * vy, int64_t k) {
+    assert(k % QK_K == 0);
+    quantize_row_iq2_kl_ref(x, (block_iq2_kl *)vy, k);
+}
+
+// Quantizes nrows rows of n_per_row floats each from src into dst using IQ2_KL.
+// Rows are laid out back to back in dst, ggml_row_size(GGML_TYPE_IQ2_KL, n_per_row) bytes
+// apart; the same imatrix pointer (may be nullptr) is passed for every row.
+// Returns the total number of bytes covered in dst (nrows * row size).
+size_t quantize_iq2_kl(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+    constexpr int kBlockSize = 32;
+    GGML_ASSERT(n_per_row%QK_K == 0);
+    const auto bytes_per_row = ggml_row_size(GGML_TYPE_IQ2_KL, n_per_row);
+    const int n_super = n_per_row/QK_K;
+    // Scratch for the per-block float scales; allocated once and reused by every row.
+    std::vector<float> block_scales(n_super*(QK_K/kBlockSize));
+    char * out = (char *)dst;
+    for (int64_t row = 0; row < nrows; ++row) {
+        const float * row_in  = src + row*n_per_row;
+        char *        row_out = out + row*bytes_per_row;
+        quantize_row_iq2_kl_impl(row_in, (void *)row_out, n_per_row, imatrix, block_scales.data());
+    }
+    return nrows * bytes_per_row;
+}
+
+// Expands k values (a whole number of QK_K super-blocks) from IQ2_KL back to floats in y.
+// The buffer at x begins with one ggml_half row scale; the block_iq2_kl array starts right
+// after it, which is why x is re-based before the loop.
+void dequantize_row_iq2_kl(const block_iq2_kl * x, float * y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+
+    const ggml_half * row_scale = (const ggml_half *)x;
+    const float d = GGML_FP16_TO_FP32(*row_scale);
+    x = (const block_iq2_kl *)(row_scale + 1);
+
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const auto & blk = x[ibl];
+        auto qh = blk.qh;
+
+        // Float scale of 32-value block ib: 6-bit integer (low 4 bits from scales_l, high 2 bits
+        // from scales_h) with the offset of 32 removed, times the row scale.
+        auto block_scale = [&blk, d] (int ib) -> float {
+            const int low  = (blk.scales_l[ib%4] >> 4*(ib/4)) & 0xf;
+            const int high = (blk.scales_h >> 2*ib) & 3;
+            return d * ((low | (high << 4)) - 32);
+        };
+
+        // Two 32-value blocks are handled per iteration; they share 16 bytes of qs
+        // (low nibble: block ib, high nibble: block ib+1).
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const float dl1 = block_scale(ib+0);
+            const float dl2 = block_scale(ib+1);
+            auto qs = blk.qs + 16*(ib/2);
+            for (int j = 0; j < 16; ++j) {
+                // 5-bit table index: 4 bits from qs, the 5th from bit ib (or ib+1) of qh[j].
+                const int idx1 = (qs[j] & 0xf) | (((qh[j] >> (ib+0)) & 1) << 4);
+                const int idx2 = (qs[j] >> 4) | (((qh[j] >> (ib+1)) & 1) << 4);
+                // Each iq2kl_values entry is read as a pair of int8 values.
+                const int8_t * pair1 = (const int8_t *)(iq2kl_values + idx1);
+                const int8_t * pair2 = (const int8_t *)(iq2kl_values + idx2);
+                y[2*j+ 0] = dl1 * pair1[0];
+                y[2*j+ 1] = dl1 * pair1[1];
+                y[2*j+32] = dl2 * pair2[0];
+                y[2*j+33] = dl2 * pair2[1];
+            }
+            y += 64;
+        }
+
+    }
+}
+
+// Dot product of one IQ2_KL row (vx) with one Q8_K row (vy) of n values; the result is
+// written to *s by iqk_mul_mat. Only nrc == 1 is supported, and bs, bx, by are ignored.
+// There is no scalar fallback in this function: if the IQK matrix multiplication is compiled
+// out or declines the request, the call aborts instead of returning with *s unwritten.
+void vec_dot_iq2_kl_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    GGML_UNUSED(nrc);
+    GGML_UNUSED(bx);
+    GGML_UNUSED(by);
+    GGML_UNUSED(bs);
+
+#if GGML_USE_IQK_MULMAT
+    if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ2_KL, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) {
+        return;
+    }
+#endif
+    // Reaching this point means nothing computed the result; silently returning would hand the
+    // caller whatever happened to be in *s.
+    GGML_ASSERT(false && "vec_dot_iq2_kl_q8_k: no scalar fallback, iqk_mul_mat support is required");
+}
+
+//
+// ============================================== iq3_k
+//
+namespace {
+
static void quantize_row_iq3_k_impl(const float * x, void * vy, int n_per_row, const float * quant_weights) {
constexpr int ntry = 3;