summaryrefslogtreecommitdiff
path: root/ggml/src/iqk/iqk_quantize.cpp
diff options
context:
space:
mode:
authorKawrakow <iwankawrakow@gmail.com>2025-05-15 16:02:39 +0300
committerGitHub <noreply@github.com>2025-05-15 16:02:39 +0300
commit3d92d7f802b332927669f01bfa51ebbb56e868ba (patch)
treec3913f67e36492c723cc47fe512078ee0dd19d59 /ggml/src/iqk/iqk_quantize.cpp
parent3f8c865b920df844ba0cb4ba53c1ccce8874b045 (diff)
Adding IQ5_KS - 5.25 bpw quants (#422)
* iq5_ks: basics * iq5_ks: quantize * iq5_ks: CUDA dequantize works * iq5_ks: dot product works on CUDA * iq5_ks: MMQ works * iq5_ks: Zen4 * iq5_ks: AVX2 But it is not quite right, just like iq4_k, iq5_k, iq6_k, iq4_ks. All these need fixing on AVX2. * iq5_ks: NEON * iq5_ks: Metal dequantize * iq5_ks: Metal dot product --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src/iqk/iqk_quantize.cpp')
-rw-r--r--ggml/src/iqk/iqk_quantize.cpp244
1 files changed, 244 insertions, 0 deletions
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index ca5e008a..78b25525 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -3419,6 +3419,250 @@ void vec_dot_iq4_ks_q8_k(int n, float * s, size_t bs, const void * vx, size_t b
}
namespace {
+// Quantizes one row of floats to the IQ5_KS format.
+//
+// Output written to `cy`: one float (the row scale) immediately followed by
+// n_per_row/super_block_size `block_iq5_ks` super-blocks. Within a super-block:
+//   - scales[ib]: bit 0 selects the value table used by block ib (0 -> values,
+//                 1 -> values + 32); the remaining bits hold the quantized block
+//                 scale as an even number in [0, 254] (decoded as (byte & 254) - 127).
+//   - qs:         low 4 bits of each 5-bit index; two consecutive blocks share
+//                 the low/high nibble of the same bytes.
+//   - qh[j]:      bit ib holds the 5th index bit of element j of block ib.
+//
+// Parameters:
+//   super_block_size, block_size  element counts of a super-block and of a block.
+//   n_per_row     number of floats in `x`; only whole super-blocks are processed.
+//   x             input row.
+//   cy            output buffer (row scale + super-blocks, see above).
+//   all_scales    scratch for one float per block of the row.
+//   weight        scratch for block_size floats.
+//   values        quantization value table; the alternative ("shifted") table
+//                 is taken at values + 32.
+//   quant_weights optional per-element importance weights (may be null), indexed
+//                 like `x`; when null, x[j]^2 is used as the weight.
+//   ntry          number of scale perturbations tried on each side of the
+//                 initial guess (itry runs over [-ntry, ntry]).
+//
+// A block whose weighted sums are all non-positive (e.g. all importance weights
+// are zero) gets block scale 0 instead of the NaN that an unguarded 0/0 would
+// produce.
+static void quantize_row_iq5_ks_impl(const int super_block_size, const int block_size,
+        int n_per_row, const float * x, char * cy,
+        float * all_scales, float * weight,
+        const int8_t * values,
+        const float * quant_weights,
+        const int ntry) {
+
+    float * dptr = (float *)cy;
+    dptr[0] = 0;
+    block_iq5_ks * y = (block_iq5_ks *)(dptr + 1);
+
+    const int8_t * shifted_values = values + 32;
+
+    const int n_super_blocks   = n_per_row/super_block_size;
+    const int blocks_per_super = super_block_size/block_size;
+
+    // Weighted sums for a candidate inverse scale: *_p for +id, *_m for -id.
+    struct Sums { float qx_p, q2_p, qx_m, q2_m; };
+
+    // Twice the mean square of a super-block; only needed for importance weighting.
+    auto sigma2_of = [super_block_size](const float * xbl) {
+        float sigma2 = 0;
+        for (int j = 0; j < super_block_size; ++j) sigma2 += xbl[j]*xbl[j];
+        sigma2 *= 2.f/super_block_size;
+        return sigma2;
+    };
+
+    // Fills weight[0..block_size) for the block starting at xb.
+    auto fill_weights = [&](const float * xb, const float * qw, float sigma2) {
+        if (qw) {
+            for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        } else {
+            for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j];
+        }
+    };
+
+    // Quantizes the block with inverse scale +id and -id against `vals` and
+    // returns the weighted sum(q*x) and sum(q*q) of both variants.
+    auto block_sums = [&](const int8_t * vals, float id, const float * xb) {
+        Sums s = {0.f, 0.f, 0.f, 0.f};
+        for (int j = 0; j < block_size; ++j) {
+            const float w  = weight[j];
+            const float al = id*xb[j];
+            float q = vals[best_index_iq5nl(vals, al)];
+            s.qx_p += w*q*xb[j];
+            s.q2_p += w*q*q;
+            q = vals[best_index_iq5nl(vals, -al)];
+            s.qx_m += w*q*xb[j];
+            s.q2_m += w*q*q;
+        }
+        return s;
+    };
+
+    // Pass 1: find the best float scale and value table for every block.
+    float amax_scale = 0;
+    for (int ibl = 0; ibl < n_super_blocks; ++ibl) {
+        memset(&y[ibl], 0, sizeof(block_iq5_ks));
+        const float * xbl = x + ibl*super_block_size;
+        float * scales = all_scales + ibl*blocks_per_super;
+        const float sigma2 = quant_weights ? sigma2_of(xbl) : 0.f;
+        for (int ib = 0; ib < blocks_per_super; ++ib) {
+            const float * xb = xbl + ib*block_size;
+            fill_weights(xb, quant_weights ? quant_weights + ibl*super_block_size + ib*block_size : nullptr, sigma2);
+            // Element with the largest magnitude (sign preserved in `max`).
+            float amax = 0, max = 0;
+            for (int j = 0; j < block_size; ++j) {
+                const float ax = fabsf(xb[j]);
+                if (ax > amax) {
+                    amax = ax; max = xb[j];
+                }
+            }
+            if (amax < 1e-15f) {
+                scales[ib] = 0;
+                continue;
+            }
+
+            float d = 0, best = 0;
+            bool is_shifted = false;
+            // Accepts a candidate when its figure of merit sumqx^2/sumq2 beats `best`.
+            auto consider = [&](float sumqx, float sumq2, bool shifted) {
+                if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
+                    d = sumqx/sumq2; best = d*sumqx; is_shifted = shifted;
+                }
+            };
+
+            // Initial guess: map the largest element onto values[0].
+            const float d0 = ntry > 0 ? -max/values[0] : max/values[0];
+            Sums s = block_sums(values, 1/d0, xb);
+            // Guarded like every other candidate so that an all-zero weight
+            // block cannot produce d = 0/0 = NaN.
+            if (s.q2_p > 0) {
+                d = s.qx_p/s.q2_p; best = d*s.qx_p;
+            }
+            consider(s.qx_m, s.q2_m, false);
+
+            // Perturb the scale around the initial guess, for both value tables.
+            for (int itry = -ntry; itry <= ntry; ++itry) {
+                s = block_sums(values, (itry + values[0])/max, xb);
+                consider(s.qx_p, s.q2_p, false);
+                consider(s.qx_m, s.q2_m, false);
+                s = block_sums(shifted_values, (itry + shifted_values[0])/max, xb);
+                consider(s.qx_p, s.q2_p, true);
+                consider(s.qx_m, s.q2_m, true);
+            }
+            if (is_shifted) y[ibl].scales[ib] = 0x01;
+            scales[ib] = d;
+            amax_scale = std::max(amax_scale, std::abs(d));
+        }
+    }
+
+    // Pass 2: quantize the block scales against the row scale, then the values.
+    const float d = amax_scale/127;
+    *dptr = d;
+    if (!d) return;
+    const float id = 1/d;
+    float sumqx = 0, sumq2 = 0;
+    for (int ibl = 0; ibl < n_super_blocks; ++ibl) {
+        const float * xbl = x + ibl*super_block_size;
+        const float sigma2 = quant_weights ? sigma2_of(xbl) : 0.f;
+        const float * scales = all_scales + blocks_per_super*ibl;
+        for (int ib = 0; ib < blocks_per_super; ++ib) {
+            const int8_t * block_values = y[ibl].scales[ib] & 0x01 ? shifted_values : values;
+            // Map the scale from [-127*d, 127*d] to a 7-bit code stored above the table bit.
+            int l = nearest_int(0.5f*(id*scales[ib]+127.f));
+            l = std::max(0, std::min(127, l)) << 1;
+            y[ibl].scales[ib] |= l;
+            l -= 127;
+            const float dl = d * l;
+            const float idl = dl ? 1/dl : 0.f;
+            const float * xb = xbl + ib*block_size;
+            fill_weights(xb, quant_weights ? quant_weights + ibl*super_block_size + ib*block_size : nullptr, sigma2);
+            for (int j = 0; j < block_size; ++j) {
+                const uint8_t idx = best_index_iq5nl(block_values, idl*xb[j]);
+                y[ibl].qs[block_size*(ib/2) + j] |= ((idx & 0xf) << 4*(ib%2));
+                y[ibl].qh[j] |= ((idx >> 4) << ib);
+                const float w = weight[j];
+                const float q = block_values[idx]*l;
+                sumqx += w*q*xb[j];
+                sumq2 += w*q*q;
+            }
+        }
+    }
+    // Refit the row scale to the values that were actually stored.
+    if (sumq2 > 0) *dptr = sumqx/sumq2;
+}
+}
+
+// Reference row quantizer: converts the k floats at x into IQ5_KS data at y,
+// treating them as a single row and using no importance matrix.
+void quantize_row_iq5_ks_ref(const float * x, block_iq5_ks * y, int64_t k) {
+    void * dst = y;
+    quantize_iq5_ks(x, dst, /*nrows=*/1, /*n_per_row=*/k, /*imatrix=*/nullptr);
+}
+
+// Quantizes the k floats at x into IQ5_KS data at y as a single row, without
+// an importance matrix.
+void quantize_row_iq5_ks(const float * x, void * y, int64_t k) {
+    const int64_t n_rows = 1;
+    quantize_iq5_ks(x, y, n_rows, k, /*imatrix=*/nullptr);
+}
+
+// Quantizes `nrows` rows of `n_per_row` floats each from `src` into IQ5_KS
+// data at `dst`, one row after another. `imatrix` (may be null) is handed
+// unchanged to every row as its importance weights. n_per_row must be a
+// multiple of QK_K. Returns the number of bytes written (nrows * row size).
+size_t quantize_iq5_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+    constexpr int kBlockSize = 32;
+    constexpr int kNumTries  = 5;
+    GGML_ASSERT(n_per_row%QK_K == 0);
+    const auto bytes_per_row = ggml_row_size(GGML_TYPE_IQ5_KS, n_per_row);
+
+    // Scratch buffers shared by all rows.
+    std::vector<float> block_scales(n_per_row/kBlockSize);
+    float block_weights[kBlockSize];
+
+    const float * in = src;
+    char * out = (char *)dst;
+    for (int64_t remaining = nrows; remaining > 0; --remaining) {
+        quantize_row_iq5_ks_impl(QK_K, kBlockSize, n_per_row, in, out,
+                block_scales.data(), block_weights, iq5nl_values, imatrix, kNumTries);
+        in  += n_per_row;
+        out += bytes_per_row;
+    }
+    return nrows * bytes_per_row;
+}
+
+// Expands one IQ5_KS row back to floats.
+//
+// `x` points at the start of the row: a float row scale followed by the
+// block_iq5_ks super-blocks. `k` is the number of output floats and must be a
+// multiple of QK_K. Blocks are decoded in pairs because two consecutive
+// 32-element blocks share the low and high nibbles of the same 32 `qs` bytes.
+void dequantize_row_iq5_ks(const block_iq5_ks * x, float * y, int64_t k) {
+    constexpr int kBlockSize = 32;
+    constexpr int kPairsPerSuperBlock = QK_K/(2*kBlockSize);
+    GGML_ASSERT(k%QK_K == 0);
+
+    const float * row_scale = (const float *)x;
+    const float d = row_scale[0];
+    const block_iq5_ks * blocks = (const block_iq5_ks *)(row_scale + 1);
+    const int n_super_blocks = k/QK_K;
+
+    float * out = y;
+    for (int ibl = 0; ibl < n_super_blocks; ++ibl) {
+        const auto & sb = blocks[ibl];
+        for (int pair = 0; pair < kPairsPerSuperBlock; ++pair) {
+            const int ib_lo = 2*pair;
+            const int ib_hi = ib_lo + 1;
+            const auto sc_lo = sb.scales[ib_lo];
+            const auto sc_hi = sb.scales[ib_hi];
+            // Scale byte: bit 0 picks the value table, the other bits hold scale + 127.
+            const float dl_lo = d * ((int)(sc_lo & 254) - 127);
+            const float dl_hi = d * ((int)(sc_hi & 254) - 127);
+            const int8_t * vals_lo = iq5nl_values + ((sc_lo & 1) << 5);
+            const int8_t * vals_hi = iq5nl_values + ((sc_hi & 1) << 5);
+            const auto nib = sb.qs + pair*kBlockSize;
+            for (int j = 0; j < kBlockSize; ++j) {
+                // The 5th index bit of block ib lives in bit ib of qh[j].
+                const int hbit_lo = (sb.qh[j] >> ib_lo) & 1;
+                const int hbit_hi = (sb.qh[j] >> ib_hi) & 1;
+                out[j             ] = dl_lo * vals_lo[(nib[j] & 0xf) | (hbit_lo << 4)];
+                out[j + kBlockSize] = dl_hi * vals_hi[(nib[j] >>  4) | (hbit_hi << 4)];
+            }
+            out += 2*kBlockSize;
+        }
+    }
+}
+
+// Dot product of one IQ5_KS row (`vx`) with one Q8_K row (`vy`); the result is
+// stored in *s.
+//
+// `n` is the number of elements and must be a multiple of QK_K; `nrc` must be 1.
+// `bs`, `bx` and `by` are unused. When GGML_USE_IQK_MULMAT is enabled and
+// iqk_mul_mat() reports success, its result is used and the scalar code below
+// is skipped.
+//
+// `vx` starts with a float row scale followed by the block_iq5_ks super-blocks.
+// Two consecutive 32-element blocks share the low/high nibbles of the same 32
+// `qs` bytes, so each inner step consumes 64 Q8_K quants and 32 `qs` bytes.
+void vec_dot_iq5_ks_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
+    constexpr int kBlockSize = 32;
+#if GGML_USE_IQK_MULMAT
+    if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ5_KS, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) {
+        return;
+    }
+#endif
+    GGML_ASSERT(n%QK_K == 0);
+    GGML_ASSERT(nrc == 1);
+    GGML_UNUSED(bs);
+    GGML_UNUSED(bx);
+    GGML_UNUSED(by);
+    const float * dptr = (const float *)vx;
+    const float d = *dptr;
+    const block_iq5_ks * x = (const block_iq5_ks *)(dptr + 1);
+    const block_q8_K * y = (const block_q8_K *)vy;
+    const int nblock = n/QK_K;
+    float sumf = 0;
+    for (int ibl = 0; ibl < nblock; ++ibl) {
+        auto qy = y[ibl].qs;
+        auto qs = x[ibl].qs;
+        auto qh = x[ibl].qh;
+        const float db = d * y[ibl].d;
+        for (int ib64 = 0; ib64 < QK_K/(2*kBlockSize); ++ib64) {
+            // Scale byte: bit 0 picks the value table, the other bits hold scale + 127.
+            const float dl1 = db * ((int)(x[ibl].scales[2*ib64+0] & 254) - 127);
+            const float dl2 = db * ((int)(x[ibl].scales[2*ib64+1] & 254) - 127);
+            const int8_t * values1 = iq5nl_values + ((x[ibl].scales[2*ib64+0] & 1) << 5);
+            const int8_t * values2 = iq5nl_values + ((x[ibl].scales[2*ib64+1] & 1) << 5);
+            int suml1 = 0;
+            int suml2 = 0;
+            for (int j = 0; j < kBlockSize; ++j) {
+                suml1 += qy[j           ] * values1[(qs[j] & 0xf) | (((qh[j] >> (2*ib64+0)) & 1) << 4)];
+                suml2 += qy[j+kBlockSize] * values2[(qs[j] >>  4) | (((qh[j] >> (2*ib64+1)) & 1) << 4)];
+            }
+            sumf += dl1*suml1 + dl2*suml2;
+            // Advance the Q8_K quant cursor, not the block pointer `y`: moving
+            // `y` would skip 64 whole blocks and leave `qy` stuck on the first
+            // 64 quants of the super-block.
+            qy += 2*kBlockSize;
+            qs += kBlockSize;
+        }
+    }
+    *s = sumf;
+}
+
+namespace {
const uint16_t * scramble_table() {
static std::mutex mutex;
static std::vector<uint16_t> table;