IQ2_KS: 2.1875 bpw non-linear quantization (#85)

* Experimenting * iq2k: Try make_qx_quants for the scale Slightly better for LLaMA-3.1, Gemma-2, slightly worse for Qwen2.5 * iq2k with make_qx_quants: adjust scale * iq2ks: basics * iq2_ks: CUDA works * iq2_ks: WIP * iq2_ks: WIP * iq2_ks: Zen4 * iq2_ks: AVX2 * iq2_ks: scalar dot product * iq2_ks: ARM_NEON * iq2_ks: Metal * iq2_ks: faster Metal LLaMA-3.1-8B: PP-512 = 475.22 ± 0.37 t/s TG-128 = 45.32 ± 0.03 t/s --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
author: Kawrakow <iwankawrakow@gmail.com> 2024-10-13 13:34:30 +0300
committer: GitHub <noreply@github.com> 2024-10-13 13:34:30 +0300
commit: 910a13409463f7aedb0a92be013a1b9bb50f4859 (patch)
tree: 16e13e1fd3010549877408a0a62706b2bc5d5f0c /ggml/src/ggml-quants.c
parent: c15de3654e0002537c8052fd6d52d879e778e88c (diff)
1 files changed, 40 insertions, 10 deletions
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 40978ac0..a845eaf5 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -12873,7 +12873,6 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
     const int      * kmap_q2xs       = iq2_data[gindex].map;
     const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
 
-    GGML_ASSERT(quant_weights   && "missing quantization weights");
     GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
@@ -12908,8 +12907,12 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
 
         for (int ib = 0; ib < QK_K/32; ++ib) {
             const float * xb = xbl + 32*ib;
-            const float * qw = quant_weights + QK_K*ibl + 32*ib;
-            for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*ibl + 32*ib;
+                for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            } else {
+                for (int i = 0; i < 32; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
+            }
             for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
             for (int k = 0; k < 4; ++k) {
                 int nflip = 0;
@@ -13046,7 +13049,6 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
     const int      * kmap_q2xs       = iq2_data[gindex].map;
     const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
 
-    GGML_ASSERT(quant_weights   && "missing quantization weights");
     GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
@@ -13084,8 +13086,12 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
 
         for (int ib = 0; ib < QK_K/16; ++ib) {
             const float * xb = xbl + 16*ib;
-            const float * qw = quant_weights + QK_K*ibl + 16*ib;
-            for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*ibl + 16*ib;
+                for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            } else {
+                for (int i = 0; i < 16; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
+            }
             for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
             for (int k = 0; k < 2; ++k) {
                 int nflip = 0;
@@ -13230,6 +13236,17 @@ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t
     return nrow * nblock * sizeof(block_iq2_xxs);
 }
 
+void quantize_row_iq2_xxs(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_iq2_xxs * restrict y = vy;
+    quantize_row_iq2_xxs_ref(x, y, k);
+}
+
+void quantize_row_iq2_xxs_ref(const float * restrict x, block_iq2_xxs * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    quantize_iq2_xxs(x, y, 1, k, NULL);
+}
+
 size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     GGML_ASSERT(n_per_row%QK_K == 0);
     int64_t nblock = n_per_row/QK_K;
@@ -13242,6 +13259,17 @@ size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t
     return nrow * nblock * sizeof(block_iq2_xs);
 }
 
+void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_iq2_xs * restrict y = vy;
+    quantize_row_iq2_xs_ref(x, y, k);
+}
+
+void quantize_row_iq2_xs_ref(const float * restrict x, block_iq2_xs * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    quantize_iq2_xs(x, y, 1, k, NULL);
+}
+
 //
 // ============================================= 3-bit using D4 lattice
 //
@@ -14947,10 +14975,11 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
         return false;
     }
 
-    if (type != GGML_TYPE_IQ2_TN && type != GGML_TYPE_IQ1_TN && type != GGML_TYPE_IQ4_KS && nbytes % ggml_type_size(type) != 0) {
-        fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type));
-        return false;
-    }
+    // Who needs this?
+    //if (type != GGML_TYPE_IQ2_TN && type != GGML_TYPE_IQ1_TN && type != GGML_TYPE_IQ4_KS && nbytes % ggml_type_size(type) != 0) {
+    //    fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type));
+    //    return false;
+    //}
 
     const size_t nb = nbytes/ggml_type_size(type);
 
@@ -15160,6 +15189,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
             } break;
         case GGML_TYPE_Q6_0: break;
         case GGML_TYPE_IQ2_K: break;
+        case GGML_TYPE_IQ2_KS: break;
         case GGML_TYPE_IQ3_K: break;
         case GGML_TYPE_IQ4_K: break;
         case GGML_TYPE_IQ5_K: break;
author	Kawrakow <iwankawrakow@gmail.com>	2024-10-13 13:34:30 +0300
committer	GitHub <noreply@github.com>	2024-10-13 13:34:30 +0300
commit	910a13409463f7aedb0a92be013a1b9bb50f4859 (patch)
tree	16e13e1fd3010549877408a0a62706b2bc5d5f0c /ggml/src/ggml-quants.c
parent	c15de3654e0002537c8052fd6d52d879e778e88c (diff)