author:     Kawrakow <iwankawrakow@gmail.com>   2024-10-02 15:22:13 +0300
committer:  GitHub <noreply@github.com>         2024-10-02 15:22:13 +0300
commit:     cce49832c1b81b4e535e78ff308417ef3a386b18 (patch)
tree:       33b10f9344f4656d58cd3ea068233ba75888498d /ggml/src/ggml-quants.c
parent:     d6909ed6f00f91f20c9ef628085a1a1a6a55c453 (diff)
Adding Q6_0 (#77)
* Adding q6_0 - basics + AVX2/Zen4 working
* Adding q6_0: CUDA dequantize works, but not mmvq
* Adding q6_0: CUDA mmvq works
* Adding q6_0: CUDA cpy, so Q6_0 can be used for KV-cache
* Add q6_0 to CPU flash attention
  Disappointing result: for Llama-3.2-1B, a q6_0 K- and V-cache gives about
  the same PPL as a q8_0 K-cache combined with a q4_0 V-cache, while needing
  exactly the same RAM. I.e., what was the point?
* q6_0: slightly better kv-cache result
  Better than q8_0 + q4_0, but not as good as q8_0 + iq4_nl.
* q6_0: works on ARM_NEON
* q6_0: dequantize works on Metal, but not vector dot product
* q6_0: it now works on Metal
  Outperforms q5_0 by a significant margin, e.g.:
| model | size | params | backend | ngl | threads | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ------: | ------------: | ---------------: |
| llama 8B Q6_0 | 6.08 GiB | 8.03 B | Metal | 100 | 4 | tg128 | 44.02 ± 0.08 |
| llama 8B Q5_0 | 5.21 GiB | 8.03 B | Metal | 100 | 4 | tg128 | 40.13 ± 0.12 |
| llama 8B Q6_0 | 6.08 GiB | 8.03 B | Metal | 100 | 4 | pp512 | 500.55 ± 0.32 |
| llama 8B Q5_0 | 5.21 GiB | 8.03 B | Metal | 100 | 4 | pp512 | 448.02 ± 0.27 |
* q6_0: can now be used for kv-cache on Metal
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src/ggml-quants.c')
-rw-r--r--  ggml/src/ggml-quants.c | 139
1 file changed, 139 insertions(+), 0 deletions(-)
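For reading the diff below: the new functions address a `block_q6_0` type whose definition lives in the headers, not in this file. Judging from the `memset(y[i].qh, 0, qk/4)` calls and the `qs`/`qh` indexing, the layout presumably follows the `q5_0` pattern, roughly as sketched here (an assumption inferred from usage, not the committed header):

```c
#define QK6_0 32
typedef struct {
    ggml_half d;           // fp16 scale (delta)
    uint8_t   qh[QK6_0/4]; // top 2 bits of each quant, 4 quants per byte
    uint8_t   qs[QK6_0/2]; // low 4 bits of each quant, 2 quants per byte
} block_q6_0;
```

That works out to 2 + 8 + 16 = 26 bytes per 32 weights, i.e. 6.5 bits per weight, consistent with the 6.08 GiB reported above for the 8.03B-parameter model.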
```diff
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index bef2f73e..f5fff22e 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -848,6 +848,59 @@ void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
     quantize_row_q5_1_ref(x, y, k);
 }
 
+void quantize_row_q6_0_ref(const float * restrict x, block_q6_0 * restrict y, int64_t k) {
+    static const int qk = QK6_0;
+
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+        float max  = 0.0f;
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i*qk + j];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+                max  = v;
+            }
+        }
+
+        const float d  = max / -32;
+        const float id = d ? 1.0f/d : 0.0f;
+
+        //y[i].d = GGML_FP32_TO_FP16(d);
+        memset(y[i].qh, 0, qk/4);
+
+        float sumqx = 0, sumq2 = 0;
+        for (int j = 0; j < qk/2; ++j) {
+            const float x0 = x[i*qk + 0    + j]*id;
+            const float x1 = x[i*qk + qk/2 + j]*id;
+            const float w0 = x0*x0;
+            const float w1 = x1*x1;
+
+            const uint8_t xi0 = MIN(63, (int8_t)(x0 + 32.5f));
+            const uint8_t xi1 = MIN(63, (int8_t)(x1 + 32.5f));
+
+            y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
+
+            const uint8_t h = (xi0 >> 4) | ((xi1 >> 4) << 2);
+            y[i].qh[j%(qk/4)] |= (h << 4*(j/(qk/4)));
+
+            const float q0 = (float)xi0 - 32.f;
+            const float q1 = (float)xi1 - 32.f;
+            sumqx += w0*x[i*qk + j]*q0 + w1*x[i*qk + qk/2 + j]*q1;
+            sumq2 += w0*q0*q0 + w1*q1*q1;
+        }
+        y[i].d = sumq2 > 0 ? GGML_FP32_TO_FP16(sumqx/sumq2) : GGML_FP32_TO_FP16(d);
+    }
+}
+
+void quantize_row_q6_0(const float * restrict x, void * restrict y, int64_t k) {
+    quantize_row_q6_0_ref(x, y, k);
+}
+
 // reference implementation for deterministic creation of model files
 void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
     assert(k % QK8_0 == 0);
@@ -1691,6 +1744,28 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int6
     }
 }
 
+void dequantize_row_q6_0(const block_q6_0 * restrict x, float * restrict y, int64_t k) {
+    static const int qk = QK6_0;
+
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+
+        for (int j = 0; j < qk/2; ++j) {
+            const uint8_t h = x[i].qh[j%(qk/4)] >> 4*(j/(qk/4));
+
+            const int32_t x0 = ((x[i].qs[j] & 0x0F) | ((h << 4) & 0x30)) - 32;
+            const int32_t x1 = ((x[i].qs[j] >>   4) | ((h << 2) & 0x30)) - 32;
+
+            y[i*qk + j + 0   ] = x0*d;
+            y[i*qk + j + qk/2] = x1*d;
+        }
+    }
+}
+
 void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
     static const int qk = QK8_0;
 
@@ -3429,6 +3504,54 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nr
     return nrow * row_size;
 }
 
+static void quantize_row_q6_0_impl(const float * restrict x, block_q6_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
+    static_assert(QK6_0 == 32, "QK6_0 must be 32");
+
+    float weight[QK6_0];
+    int8_t L[QK6_0];
+
+    float sigma2 = 0;
+    if (quant_weights) {
+        float sum_x2 = 0;
+        for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
+        sigma2 = sum_x2/n_per_row;
+    }
+
+    const int64_t nb = n_per_row/QK6_0;
+    for (int ib = 0; ib < nb; ++ib) {
+        const float * xb = x + QK6_0 * ib;
+        if (quant_weights) {
+            const float * qw = quant_weights + QK6_0 * ib;
+            for (int j = 0; j < QK6_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        } else {
+            for (int j = 0; j < QK6_0; ++j) weight[j] = xb[j]*xb[j];
+        }
+        float d = make_qx_quants(QK6_0, 32, xb, L, 1, weight);
+        y[ib].d = GGML_FP32_TO_FP16(d);
+
+        memset(y[ib].qh, 0, QK6_0/4);
+
+        for (int j = 0; j < 16; ++j) {
+            const uint8_t xi0 = L[j];
+            const uint8_t xi1 = L[j+16];
+            y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
+            const uint8_t h = (xi0 >> 4) | ((xi1 >> 4) << 2);
+            y[ib].qh[j%8] |= (h << 4*(j/8));
+        }
+    }
+}
+
+size_t quantize_q6_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    size_t row_size = ggml_row_size(GGML_TYPE_Q6_0, n_per_row);
+    char * qrow = (char *)dst;
+    for (int64_t row = 0; row < nrow; ++row) {
+        quantize_row_q6_0_impl(src, (block_q6_0*)qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrow * row_size;
+}
+
 size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     (void)quant_weights; // not used
     const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
@@ -5383,6 +5506,21 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     *s = sumf;
 }
 
+void ggml_vec_dot_q6_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+#if GGML_USE_IQK_MULMAT
+#ifdef __AVX2__
+    const enum ggml_type vec_dot_type = GGML_TYPE_Q8_1;
+#else
+    const enum ggml_type vec_dot_type = GGML_TYPE_Q8_0;
+#endif
+    if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q6_0, vx, bx, vec_dot_type, vy, by, s, bs, 0, 1)) {
+        return;
+    }
+#endif
+    // TODO
+    *s = 0;
+}
+
 void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 #if GGML_USE_IQK_MULMAT
     if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q8_0, vx, bx, GGML_TYPE_Q8_0, vy, by, s, bs, 0, 1)) {
@@ -15020,6 +15158,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
         {
             VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
         } break;
+        case GGML_TYPE_Q6_0: break;
         case GGML_TYPE_IQ2_K: break;
         case GGML_TYPE_IQ3_K: break;
         case GGML_TYPE_IQ4_K: break;
```
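The least obvious part of the new code is the bit layout: each 6-bit quant is split into a low nibble stored in `qs` and a top-two-bit field stored in `qh`; elements `j` and `j+16` of a block share `qs[j]`, and the high bits of four quants share each `qh` byte. Below is a minimal, self-contained sketch of just this packing and its inverse, mirroring the index math of `quantize_row_q6_0_ref` and `dequantize_row_q6_0` (the `pack_q6_0`/`unpack_q6_0` names are illustrative, not from the commit, and the fp16 scale handling is omitted):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define QK6_0 32

// Pack 32 unsigned 6-bit values (0..63) as the diff does: low nibbles of
// elements j and j+16 share qs[j]; their top two bits go into qh, where
// each qh byte collects the high bits of four quants.
static void pack_q6_0(const uint8_t v[QK6_0], uint8_t qs[QK6_0/2], uint8_t qh[QK6_0/4]) {
    memset(qh, 0, QK6_0/4);
    for (int j = 0; j < QK6_0/2; ++j) {
        const uint8_t xi0 = v[j];            // first half of the block
        const uint8_t xi1 = v[j + QK6_0/2];  // second half of the block
        qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
        const uint8_t h = (xi0 >> 4) | ((xi1 >> 4) << 2); // 2+2 high bits
        qh[j % (QK6_0/4)] |= (h << 4*(j/(QK6_0/4)));      // low or high nibble of qh[j%8]
    }
}

// Inverse, mirroring dequantize_row_q6_0.
static void unpack_q6_0(const uint8_t qs[QK6_0/2], const uint8_t qh[QK6_0/4], uint8_t v[QK6_0]) {
    for (int j = 0; j < QK6_0/2; ++j) {
        const uint8_t h = qh[j % (QK6_0/4)] >> 4*(j/(QK6_0/4));
        v[j]           = (qs[j] & 0x0F) | ((h << 4) & 0x30);
        v[j + QK6_0/2] = (qs[j] >>   4) | ((h << 2) & 0x30);
    }
}

int main(void) {
    uint8_t v[QK6_0], qs[QK6_0/2], qh[QK6_0/4], out[QK6_0];
    for (int j = 0; j < QK6_0; ++j) v[j] = (uint8_t)((j*7 + 3) & 63); // arbitrary 6-bit data
    pack_q6_0(v, qs, qh);
    unpack_q6_0(qs, qh, out);
    for (int j = 0; j < QK6_0; ++j) assert(out[j] == v[j]);
    printf("round-trip OK: 32 six-bit values in %zu packed bytes\n", sizeof qs + sizeof qh);
    return 0;
}
```

Two other details of the reference quantizer are worth noting: the initial scale is `d = max / -32`, so the signed range -32..31 maps onto 0..63 after the +32 offset; and `d` is then refined to the weighted least-squares optimum d = sum(w_i * x_i * q_i) / sum(w_i * q_i^2) over the block, with w_i = (x_i/d)^2, before being stored as fp16.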