Diffstat (limited to 'ggml-quants.c')
-rw-r--r--  ggml-quants.c  20
1 file changed, 18 insertions(+), 2 deletions(-)
diff --git a/ggml-quants.c b/ggml-quants.c
index e540fe4d..0971d696 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -948,7 +948,15 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
         }
     }
 #elif defined(__AVX2__) || defined(__AVX__)
+    block_q8_0_x4 * y4 = (block_q8_0_x4 *)vy;
+    int nb4 = 4*(nb/4);
+#ifdef __AVX2__
+    const bool pack = true;
+#else
+    const bool pack = false;
+#endif
     for (int i = 0; i < nb; i++) {
+        int i4 = i/4, ir = i%4;
         // Load elements into 4 AVX vectors
         __m256 v0 = _mm256_loadu_ps( x );
         __m256 v1 = _mm256_loadu_ps( x + 8 );
@@ -970,7 +978,11 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
 
         // Quantize these floats
         const float d = maxScalar / 127.f;
-        y[i].d = GGML_FP32_TO_FP16(d);
+        if (pack && i < nb4) {
+            y4[i4].d[ir] = GGML_FP32_TO_FP16(d);
+        } else {
+            y[i].d = GGML_FP32_TO_FP16(d);
+        }
         const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
         const __m256 mul = _mm256_set1_ps( id );
 
@@ -1005,7 +1017,11 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
         const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
         i0 = _mm256_permutevar8x32_epi32( i0, perm );
 
-        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
+        if (i < nb4) {
+            _mm256_storeu_si256((__m256i *)y4[i4].qs + ir, i0);
+        } else {
+            _mm256_storeu_si256((__m256i *)y[i].qs, i0);
+        }
 #else
         // Since we don't have in AVX some necessary functions,
         // we split the registers in half and call AVX2 analogs from SSE
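
The change above writes each quantized q8_0 block either into the plain block_q8_0 stream or, on the AVX2 path, into a packed block_q8_0_x4 layout in which four consecutive blocks share one struct, using i4 = i/4 to pick the packed block and ir = i%4 to pick the slot inside it. The following is a minimal scalar sketch of that layout and indexing; the struct definitions are assumptions written out for illustration (the real ones live in the ggml headers), and only the index arithmetic mirrors the diff.

/*
 * Sketch of the packed q8_0 layout targeted by the diff. Struct definitions
 * here are illustrative assumptions; only the i4/ir indexing follows the
 * quantization loop above.
 */
#include <stdint.h>
#include <string.h>

#define QK8_0 32            /* elements per q8_0 block, as in ggml          */
typedef uint16_t ggml_half; /* fp16 kept as raw bits in this sketch         */

typedef struct {
    ggml_half d;            /* per-block scale                              */
    int8_t    qs[QK8_0];    /* 32 signed 8-bit quants                       */
} block_q8_0;

typedef struct {
    ggml_half d[4];         /* scales of 4 consecutive blocks               */
    int8_t    qs[4*QK8_0];  /* their quants, stored back to back            */
} block_q8_0_x4;

/* Repack a multiple-of-4 run of plain q8_0 blocks into the x4 layout,
 * using the same i4/ir arithmetic as the AVX2 branch in the diff. */
static void pack_q8_0_x4(const block_q8_0 *src, block_q8_0_x4 *dst, int nb) {
    for (int i = 0; i < nb; i++) {
        int i4 = i/4, ir = i%4;
        dst[i4].d[ir] = src[i].d;
        memcpy(dst[i4].qs + ir*QK8_0, src[i].qs, QK8_0);
    }
}

In the diff itself the packed store is done with _mm256_storeu_si256 at offset ir within y4[i4].qs, which writes the same 32 bytes per block as the memcpy above; blocks past nb4 (the largest multiple of 4 not exceeding nb) fall back to the unpacked block_q8_0 path.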