author | Kawrakow <48489457+ikawrakow@users.noreply.github.com> | 2024-08-07 07:56:09 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-08-07 07:56:09 +0200 |
commit | a9f302ebe2373321c12b01d8760904901aa064a4 (patch) | |
tree | 7953bbff2ebd6bf9130cea52d17995aea3cd65d5 /ggml/src/iqk/iqk_quantize.cpp | |
parent | b409c153636d27473970abd3a9c9400b6287d400 (diff) | |
Adding IQ2_TN for use with ternary models (#13)
* iq2_tn: TriLM specific 2.0625 bpw quantization
Quantize/dequantize/scalar dot product.
I get 46 t/s for the TriLM-3.9B without any SIMD!
Finally, a compiler doing a decent job auto-vectorizing the
scalar implementation. (For where the 2.0625 bpw figure comes
from, see the sketch after the commit message.)
* iq2_tn: AVX512
Just reusing the k-quants template gets us to PP-512 = 376 t/s,
TG-128 = 47.6 t/s for TriLM-3.9B.
* iq2_tn: AVX512
With this tweak we get to PP-512 = 431 t/s.
* iq2_tn: AVX512
With this tweak we get TG-128 = 19.58 / 35.18 t/s for 1 / 2 threads.
At 4 threads we saturate at 48.41 t/s, and performance then slowly
degrades as the number of threads increases.
* iq2_tn: AVX2
PP-512 = 440 t/s on the Ryzen-5975WX.
We should be able to do better.
* iq2_tn: initial NEON version
* iq2_tn: NEON
For TriLM-3.9B running on the M2-Max we get PP-512 = 193.5 t/s,
TG-128 = 75.5 t/s. This is in line with what we have for
iq2_bn and the 3.3B Bitnet.
* iq2_tn: Metal
For TriLM-3.9B on a 30-core M2-Max we get PP-512 = 890 t/s,
TG-128 = 98.5 t/s.
* iq2_tn: CUDA
For TriLM-3.9B running on RTX-4080 we get PP-512 = 9936 t/s,
TG-128 = 299.2 t/s.
* iq2_tn: AVX2 PP improvement
We now get PP-512 = 490.73 t/s for TriLM-3.9B on the Ryzen-5975WX.
We have PP-512 = 636.61 t/s for Bitnet-3B quantized with iq2_bn.
Bitnet-3B is actually 3.4B and TriLM-3.9B is 3.99B, so we would
expect 3.43/3.99 * 636 = 546 t/s; it seems something in iq2_tn
is still not quite optimal.
* iq2_tn: small NEON improvement
For TriLM-3.9B we now get PP-512 = 206.6 t/s and TG-128 = 76.4 t/s.
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
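
The 2.0625 bpw figure quoted in the first commit follows directly from the block layout used by the code in the diff below: each super-block of QK_K = 256 weights stores one fp16 scale `d` plus QK_K/4 bytes of packed 2-bit ternary codes. A minimal sketch of that arithmetic (the constants are taken from the diff; the program itself is only illustrative):

```cpp
// Back-of-the-envelope check of the IQ2_TN storage cost, assuming the block
// layout visible in the diff below: a 2-byte fp16 scale (d) plus QK_K/4 bytes
// of packed 2-bit ternary codes per super-block of QK_K = 256 weights.
#include <cstdio>

int main() {
    const int QK_K        = 256;        // weights per super-block
    const int scale_bytes = 2;          // one fp16 scale per block
    const int code_bytes  = QK_K / 4;   // four 2-bit codes packed per byte
    const double bpw = 8.0 * (scale_bytes + code_bytes) / QK_K;
    printf("IQ2_TN: %.4f bits per weight\n", bpw);   // prints 2.0625
    return 0;
}
```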
Diffstat (limited to 'ggml/src/iqk/iqk_quantize.cpp')
-rw-r--r-- | ggml/src/iqk/iqk_quantize.cpp | 107 |
1 file changed, 107 insertions, 0 deletions
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index c840fabf..1cba1532 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -1514,3 +1514,110 @@ size_t quantize_iq5_k(const float * src, void * dst, int64_t nrows, int64_t n_pe
     }
     return nrows * nblock * sizeof(block_iq5_k);
 }
+
+//
+// ========================== IQ2_TN
+//
+
+void quantize_row_iq2_tn_ref(const float * x, block_iq2_tn * y, int64_t k) {
+    GGML_ASSERT(k%QK_K == 0);
+
+    int nb = k/QK_K;
+
+    auto quantize = [] (float xmax, float x) {
+        return x < -0.5f*xmax ? 0 : x < 0.5f*xmax ? 1 : 2;
+    };
+
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        auto xb = x + QK_K*ibl;
+        float max = xb[0];
+        for (int j = 0; j < QK_K; ++j) {
+            float ax = fabsf(xb[j]);
+            max = std::max(ax, max);
+        }
+        y[ibl].d = GGML_FP32_TO_FP16(max);
+        auto qs = y[ibl].qs;
+        for (int l = 0; l < QK_K/128; ++l) {
+            for (int j = 0; j < 32; ++j) {
+                qs[j] = quantize(max, xb[j]) | (quantize(max, xb[j+32]) << 2) | (quantize(max, xb[j+64]) << 4) | (quantize(max, xb[j+96]) << 6);
+            }
+            xb += 128;
+            qs += 32;
+        }
+    }
+}
+
+void quantize_row_iq2_tn(const float * x, void * y, int64_t k) {
+    quantize_row_iq2_tn_ref(x, (block_iq2_tn *)y, k);
+}
+
+size_t quantize_iq2_tn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * /*imatrix*/) {
+    auto row_size = ggml_row_size(GGML_TYPE_IQ2_TN, n_per_row);
+    char * qrow = (char *)dst;
+    for (int row = 0; row < nrows; ++row) {
+        quantize_row_iq2_tn_ref(src, (block_iq2_tn *)qrow, n_per_row);
+        qrow += row_size;
+        src += n_per_row;
+    }
+    return row_size*nrows;
+}
+
+void dequantize_row_iq2_tn(const block_iq2_tn * x, float * y, int64_t k) {
+    GGML_ASSERT(k%QK_K == 0);
+    int nb = k/QK_K;
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        float d = GGML_FP16_TO_FP32(x[ibl].d);
+        auto qs = x[ibl].qs;
+        for (int l = 0; l < QK_K/128; ++l) {
+            for (int j = 0; j < 32; ++j) {
+                y[j+ 0] = d*((qs[j] >> 0) & 3) - d;
+                y[j+32] = d*((qs[j] >> 2) & 3) - d;
+                y[j+64] = d*((qs[j] >> 4) & 3) - d;
+                y[j+96] = d*((qs[j] >> 6) & 3) - d;
+            }
+            y += 128;
+            qs += 32;
+        }
+    }
+}
+
+void vec_dot_iq2_tn_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    GGML_UNUSED(nrc);
+    GGML_UNUSED(bx);
+    GGML_UNUSED(by);
+    GGML_UNUSED(bs);
+
+    if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ2_TN, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) {
+        return;
+    }
+
+    const int nb = n / QK_K;
+
+    const block_iq2_tn * x = (const block_iq2_tn *)vx;
+    const block_q8_K * y = (const block_q8_K *)vy;
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; i++) {
+        float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        auto qs = x[i].qs;
+        auto q8 = y[i].qs;
+        int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
+        for (int j = 0; j < QK_K/16; ++j) sumi1 -= y[i].bsums[j];
+        for (int l = 0; l < QK_K/128; ++l) {
+            for (int j = 0; j < 32; ++j) {
+                sumi1 += q8[j+ 0] * (qs[j] & 0x03);
+                sumi2 += q8[j+32] * (qs[j] & 0x0c);
+                sumi3 += q8[j+64] * (qs[j] & 0x30);
+                sumi4 += q8[j+96] * (qs[j] & 0xc0);
+            }
+            q8 += 128;
+            qs += 32;
+        }
+        sumf += d * (sumi1 + 0.25f*sumi2 + 0.0625f*sumi3 + 0.015625f*sumi4);
+    }
+    *s = sumf;
+}
+
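
For readers who want to poke at the scheme without building ggml, here is a self-contained sketch that mirrors the scalar path above: it packs one block of ternary codes the way quantize_row_iq2_tn_ref does, then reproduces the trick used in vec_dot_iq2_tn_q8_k, where the four 2-bit fields are only masked (never shifted down) and the factors 0.25f / 0.0625f / 0.015625f compensate for the implicit ×4 / ×16 / ×64, while an up-front subtraction of the activation sum stands in for the bsums term and absorbs the implicit -1 offset of the ternary values. The helper names, kBlock, and the plain arrays are invented for this sketch; the real code operates on block_iq2_tn / block_q8_K.

```cpp
// Hypothetical standalone sketch of the IQ2_TN pack + dot-product scheme.
// Not the ggml implementation: plain arrays replace block_iq2_tn/block_q8_K,
// and a direct sum over q8 replaces the per-16 bsums of block_q8_K.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr int kBlock = 256;  // plays the role of QK_K

// Pack kBlock ternary codes (0,1,2 meaning -1,0,+1) into kBlock/4 bytes,
// using the j / j+32 / j+64 / j+96 interleave of quantize_row_iq2_tn_ref.
static void pack_ternary(const uint8_t *codes, uint8_t *qs) {
    for (int l = 0; l < kBlock/128; ++l) {
        for (int j = 0; j < 32; ++j) {
            qs[j] = codes[j] | (codes[j+32] << 2) | (codes[j+64] << 4) | (codes[j+96] << 6);
        }
        codes += 128;
        qs    += 32;
    }
}

// Reference dot product: sum_i (codes[i] - 1) * q8[i], scaled by d.
static float dot_reference(float d, const uint8_t *codes, const int8_t *q8) {
    int sum = 0;
    for (int i = 0; i < kBlock; ++i) sum += (int(codes[i]) - 1) * q8[i];
    return d * sum;
}

// Packed dot product, mirroring the scalar path of vec_dot_iq2_tn_q8_k:
// mask each 2-bit field in place and rescale the upper fields afterwards.
static float dot_packed(float d, const uint8_t *qs, const int8_t *q8) {
    int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
    for (int i = 0; i < kBlock; ++i) sumi1 -= q8[i];  // stands in for the bsums subtraction
    for (int l = 0; l < kBlock/128; ++l) {
        for (int j = 0; j < 32; ++j) {
            sumi1 += q8[j+ 0] * (qs[j] & 0x03);
            sumi2 += q8[j+32] * (qs[j] & 0x0c);  // value is 4x too large
            sumi3 += q8[j+64] * (qs[j] & 0x30);  // value is 16x too large
            sumi4 += q8[j+96] * (qs[j] & 0xc0);  // value is 64x too large
        }
        q8 += 128;
        qs += 32;
    }
    return d * (sumi1 + 0.25f*sumi2 + 0.0625f*sumi3 + 0.015625f*sumi4);
}

int main() {
    std::vector<uint8_t> codes(kBlock);
    std::vector<int8_t>  q8(kBlock);
    for (int i = 0; i < kBlock; ++i) {
        codes[i] = uint8_t(i % 3);                // some ternary pattern
        q8[i]    = int8_t((i * 37) % 255 - 127);  // some activations in [-127, 127]
    }
    std::vector<uint8_t> qs(kBlock/4);
    pack_ternary(codes.data(), qs.data());

    const float d = 0.0625f;  // per-block scale (an fp16 d in the real block_iq2_tn)
    float ref = dot_reference(d, codes.data(), q8.data());
    float got = dot_packed(d, qs.data(), q8.data());
    printf("reference = %g, packed = %g\n", ref, got);
    assert(ref == got);  // both paths accumulate the same integer before scaling
    return 0;
}
```

Both paths accumulate in integers before the single multiply by d, so the sketch checks for exact equality; the point is only to make the 0.25f/0.0625f/0.015625f factors and the bsums subtraction in the patch easier to follow.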