diff options
author | Iwan Kawrakow <iwan.kawrakow@gmail.com> | 2024-05-27 09:51:08 +0200 |
---|---|---|
committer | Iwan Kawrakow <iwan.kawrakow@gmail.com> | 2024-06-22 12:02:49 +0300 |
commit | 19c578b413fb53f432319a99fbd658ed30faf966 (patch) | |
tree | 68aa0ec1ecaf75ebd1340b930f2e5e2ec1951a31 /ggml-quants.c | |
parent | c5a8d4b749352645afd4c024f85d6eca2ca72c6d (diff) |
iqk_mul_mat for llama.cpp
Diffstat (limited to 'ggml-quants.c')
-rw-r--r-- | ggml-quants.c | 54 |
1 file changed, 43 insertions, 11 deletions
diff --git a/ggml-quants.c b/ggml-quants.c index 0eb52e48..e540fe4d 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -871,7 +871,10 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) block_q8_0 * restrict y = vy; #if defined(__ARM_NEON) + block_q8_0_x4 * y4 = (block_q8_0_x4 *)vy; + int nb4 = 4*(nb/4); for (int i = 0; i < nb; i++) { + int i4 = i/4, ir = i%4; float32x4_t srcv [8]; float32x4_t asrcv[8]; float32x4_t amaxv[8]; @@ -888,16 +891,27 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); + if (i < nb4) { + y4[i4].d[ir] = GGML_FP32_TO_FP16(d); + } else { + y[i].d = GGML_FP32_TO_FP16(d); + } for (int j = 0; j < 8; j++) { const float32x4_t v = vmulq_n_f32(srcv[j], id); const int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + if (i < nb4) { + y4[i4].qs[32*ir + 4*j + 0] = vgetq_lane_s32(vi, 0); + y4[i4].qs[32*ir + 4*j + 1] = vgetq_lane_s32(vi, 1); + y4[i4].qs[32*ir + 4*j + 2] = vgetq_lane_s32(vi, 2); + y4[i4].qs[32*ir + 4*j + 3] = vgetq_lane_s32(vi, 3); + } else { + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + } } } #elif defined(__wasm_simd128__) @@ -1191,7 +1205,10 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) block_q8_1 * restrict y = vy; #if defined(__ARM_NEON) + block_q8_1_x4 * restrict y4 = vy; + int nb4 = 4*(nb/4); for (int i = 0; i < nb; i++) { + int i4 = i/4, ir = i%4; float32x4_t srcv [8]; float32x4_t asrcv[8]; float32x4_t amaxv[8]; @@ -1208,7 +1225,11 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) const float d = amax / ((1 << 
7) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); + if (i < nb4) { + y4[i4].d[ir] = GGML_FP32_TO_FP16(d); + } else { + y[i].d = GGML_FP32_TO_FP16(d); + } int32x4_t accv = vdupq_n_s32(0); @@ -1216,15 +1237,26 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) const float32x4_t v = vmulq_n_f32(srcv[j], id); const int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + if (i < nb4) { + y4[i4].qs[QK8_1*ir + 4*j + 0] = vgetq_lane_s32(vi, 0); + y4[i4].qs[QK8_1*ir + 4*j + 1] = vgetq_lane_s32(vi, 1); + y4[i4].qs[QK8_1*ir + 4*j + 2] = vgetq_lane_s32(vi, 2); + y4[i4].qs[QK8_1*ir + 4*j + 3] = vgetq_lane_s32(vi, 3); + } else { + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + } accv = vaddq_s32(accv, vi); } - y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv)); + if (i < nb4) { + y4[i4].d[ir+4] = GGML_FP32_TO_FP16(d * vaddvq_s32(accv)); + } else { + y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv)); + } } #elif defined(__wasm_simd128__) for (int i = 0; i < nb; i++) { |