1 files changed, 71 insertions, 3 deletions
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 0968becf..9b39a490 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -119,6 +119,54 @@ void quantize_row_iq1_bn(const float * x, void * y, int64_t k) {
     quantize_iq1_bn(x, y, 1, k, nullptr);
 }
 
+void quantize_row_iq1_tn_ref(const float * x, block_iq1_tn  * y, int64_t k) {
+    quantize_iq1_tn(x, (void *)y, 1, k, nullptr);
+}
+
+void quantize_row_iq1_tn(const float * x, void * y, int64_t k) {
+    quantize_iq1_tn(x, y, 1, k, nullptr);
+}
+
+size_t quantize_iq1_tn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+    GGML_ASSERT(n_per_row >= 2*QK_K); // so we have space for the scale
+    int nblock = n_per_row/QK_IQ1BN;
+    float tmp[QK_IQ1BN];
+    char * qrow = (char *)dst;
+    auto row_size = ggml_row_size(GGML_TYPE_IQ1_TN, n_per_row);
+    IQ1BNQuantizer iq1bn;
+    for (int row = 0; row < nrows; ++row) {
+        float max = fabsf(src[0]);
+        for (int j = 1; j < n_per_row; ++j) max = std::max(max, fabsf(src[j]));
+        if (!(max > 0)) printf("%s: found max = %g?\n", __func__, max);
+        //GGML_ASSERT(max > 0);
+        *(ggml_half *)qrow = GGML_FP32_TO_FP16(max);
+        block_iq1_bn * y = (block_iq1_bn *)(qrow + sizeof(ggml_half));
+        const float * xb = src;
+        for (int ib = 0; ib < nblock; ++ib) {
+            for (int j = 0; j < QK_IQ1BN; ++j) tmp[j] = xb[j] < -0.5f*max ? -1 : xb[j] <= 0.5f*max ? 0 : 1;
+            iq1bn.quantize_one_row_1bn(tmp, y, QK_IQ1BN, imatrix);
+            ++y;
+            xb += QK_IQ1BN;
+        }
+        src  += n_per_row;
+        qrow += row_size;
+    }
+    return nrows*row_size;
+}
+
+void dequantize_row_iq1_tn(const block_iq1_tn * x, float * y, int64_t k) {
+    float scale = GGML_FP16_TO_FP32(*(const ggml_half *)x);
+    const block_iq1_bn * iq1bn = (const block_iq1_bn *)((const char *)x + sizeof(ggml_half));
+    dequantize_row_iq1_bn(iq1bn, y, k);
+    for (int j = 0; j < int(k); ++j) y[j] *= scale;
+}
+
+void vec_dot_iq1_tn_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
+    float scale = GGML_FP16_TO_FP32(*(const ggml_half *)vx);
+    ggml_vec_dot_iq1_bn_q8_K64(n, s, bs, (const void *)((const char *)vx + sizeof(ggml_half)), bx, vy, by, nrc);
+    *s *= scale;
+}
+
 void dequantize_row_iq1_bn(const block_iq1_bn * x, float * y, int64_t k) {
     assert(k%QK_IQ1BN == 0);
     int nblock = k / QK_IQ1BN;
@@ -331,8 +379,10 @@ void ggml_vec_dot_iq2_bn_q8_K64(int n, float * s, size_t bs, const void * vx, si
 
 void quantize_row_q8_K64_ref(const float * x, block_q8_K64 * y, int64_t k) {
 
+    GGML_ASSERT(k >= 8*QK_IQ1BN);
+
     float * dptr = (float *)y;
-    auto qs = (int8_t *)(dptr + 4);
+    auto qs = (int8_t *)(dptr + 8);
 #ifdef __ARM_NEON
     static const uint8_t k_shuffle[16] = {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60};
     auto shuffle = vld1q_u8(k_shuffle);
@@ -351,16 +401,22 @@ void quantize_row_q8_K64_ref(const float * x, block_q8_K64 * y, int64_t k) {
         vid[i] = vdupq_n_f32(id);
     }
     int8x16x4_t q;
+    int32x4_t qsum = {};
+    const int8x16_t m1 = vdupq_n_s8(1);
     for (int j = 0; j < k; j += 16) {
         for (int i = 0; i < 4; ++i) {
             auto val = vld1q_f32(x + j + 4*i);
             val = vmulq_f32(vid[i], val);
-            q.val[i] = vreinterpretq_s8_s32(vcvtnq_s32_f32(val));
+            auto ival = vcvtnq_s32_f32(val);
+            q.val[i] = vreinterpretq_s8_s32(ival);
         }
         auto qi = vqtbl4q_s8(q, shuffle);
+        qsum = ggml_vdotq_s32(qsum, qi, m1);
         vst1q_s8(qs, qi);
         qs += 16;
     }
+    auto sumf = vmulq_f32(vld1q_f32(dptr), vcvtq_f32_s32(qsum));
+    vst1q_f32(dptr + 4, sumf);
 #elif defined __AVX__
     __m128 max[4] = {};
     __m128 sign_bit = _mm_set1_ps(-0.f);
@@ -381,6 +437,9 @@ void quantize_row_q8_K64_ref(const float * x, block_q8_K64 * y, int64_t k) {
         vid[i] = _mm_set1_ps(id);
     }
     __m128i q[4];
+    __m128i sums = _mm_setzero_si128();
+    __m128i m1_8 = _mm_set1_epi8(1);
+    __m128i m1_16 = _mm_set1_epi16(1);
     for (int j = 0; j < k; j += 16) {
         for (int i = 0; i < 4; ++i) {
             auto val = _mm_loadu_ps(x + j + 4*i);
@@ -390,9 +449,13 @@ void quantize_row_q8_K64_ref(const float * x, block_q8_K64 * y, int64_t k) {
         auto q1 = _mm_packs_epi32(q[0], q[1]);
         auto q2 = _mm_packs_epi32(q[2], q[3]);
         auto qi = _mm_packs_epi16(q1, q2);
+        auto aux = _mm_maddubs_epi16(m1_8, qi);
+        sums = _mm_add_epi32(sums, _mm_madd_epi16(m1_16, aux));
         _mm_storeu_si128((__m128i *)qs, qi);
         qs += 16;
     }
+    auto minus = _mm_mul_ps(_mm_loadu_ps(dptr), _mm_cvtepi32_ps(sums));
+    _mm_storeu_ps(dptr + 4, minus);
 #else
     float aux[4] = {0.f, 0.f, 0.f, 0.f};
     for (int j = 0; j < k; j += 16) {
@@ -407,11 +470,16 @@ void quantize_row_q8_K64_ref(const float * x, block_q8_K64 * y, int64_t k) {
         dptr[i] = aux[i]/127;
         aux[i] = dptr[i] > 0 ? 1/dptr[i] : 0.f;
     }
+    int32_t sum[4] = {};
     for (int j = 0; j < k; j += 16) {
         for (int i = 0; i < 4; ++i) {
-            for (int l = 0; l < 4; ++l) qs[j+4*i+l] = nearest_int(aux[i]*x[j+4*i+l]);
+            for (int l = 0; l < 4; ++l) {
+                qs[j+4*i+l] = nearest_int(aux[i]*x[j+4*i+l]);
+                sum[i] += qs[j+4*i+l];
+            }
         }
     }
+    for (int i = 0; i < 4; ++i) dptr[4+i] = dptr[i]*sum[i];
 #endif
 }