| author | Iwan Kawrakow <iwan.kawrakow@gmail.com> | 2024-06-08 09:02:23 +0300 |
|---|---|---|
| committer | Iwan Kawrakow <iwan.kawrakow@gmail.com> | 2024-06-22 12:02:50 +0300 |
| commit | cd3d8ae0e719b47fb0ef63b0f7b9e1dacbab7de1 (patch) | |
| tree | 5ea228ac8f54d743889a2df699af65b396d9048e | |
| parent | 299c7f6e89d2d8c4162be06463b82a07540d5691 (diff) | |
iqk_mul_mat: use block_q8_1_x4 also for AVX2
Here the performance gain is more significant. E.g., for q4_1,
PP-512 goes up to 168 t/s from 137 t/s.
The performance gap to q4_0 is now significant enough that I
wonder if I should switch to using Q8_1 also for the
qX_0 legacy quants.
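For orientation, the interleaved layout this change relies on looks roughly like the sketch below. The struct body is an assumption inferred from the indexing in the diff (`y4[i4].d[ir]` for the delta, `y4[i4].d[ir+4]` for the sum, 32-byte quant stores at `(__m256i *)y4[i4].qs + ir`); the authoritative definition lives in the repo's headers.

```c
#include <stdint.h>

typedef uint16_t ggml_half;   // fp16 stored as uint16_t in ggml
#define QK8_1 32              // values per q8_1 block

// Assumed layout of four q8_1 blocks packed together: the eight fp16
// scalars (four deltas, then four d*sum(qs) values) sit contiguously,
// so one _mm_loadu_si128 + _mm256_cvtph_ps yields all scales of the group.
typedef struct {
    ggml_half d[8];           // d[0..3] = deltas, d[4..7] = d*sum(qs)
    int8_t    qs[4*QK8_1];    // quants of the four sub-blocks, 32 bytes each
} block_q8_1_x4;
```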
| -rw-r--r-- | ggml-quants.c | 26 |
| -rw-r--r-- | iqk_mul_mat.cpp | 57 |
2 files changed, 57 insertions, 26 deletions
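As a reading aid for the first hunk of the diff: per 32-value block, q8_1 quantization computes a delta d = max|x|/127, rounds x/d to int8 quants, and stores s = d·sum(quants) next to d. The patch does not change this math, only where the results are written. A minimal scalar sketch (hypothetical helper name, not code from the repo):

```c
#include <math.h>
#include <stdint.h>

// Scalar reference for one q8_1 block (sketch). The AVX2 path in the diff
// computes the same d, s, and quants, but when packing writes them into
// block_q8_1_x4 groups: d -> y4[i/4].d[i%4], s -> y4[i/4].d[i%4 + 4].
static void quantize_block_q8_1_ref(const float *x, int8_t *qs,
                                    float *d_out, float *s_out) {
    float amax = 0.0f;
    for (int j = 0; j < 32; ++j) amax = fmaxf(amax, fabsf(x[j]));
    const float d  = amax / 127.0f;                  // block delta
    const float id = d != 0.0f ? 1.0f/d : 0.0f;      // inverse delta
    int sum = 0;
    for (int j = 0; j < 32; ++j) {
        const int q = (int)roundf(x[j] * id);        // round-to-nearest
        qs[j] = (int8_t)q;
        sum  += q;
    }
    *d_out = d;
    *s_out = d * (float)sum;                         // the "s" scalar
}
```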
```diff
diff --git a/ggml-quants.c b/ggml-quants.c
index 0971d696..061edddc 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -1318,7 +1318,15 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
             wasm_i32x4_extract_lane(accv, 3)));
     }
 #elif defined(__AVX2__) || defined(__AVX__)
+    block_q8_1_x4 * restrict y4 = vy;
+    int nb4 = 4*(nb/4);
+#ifdef __AVX2__
+    const bool pack = true;
+#else
+    const bool pack = false;
+#endif
     for (int i = 0; i < nb; i++) {
+        int i4 = i/4, ir = i%4;
         // Load elements into 4 AVX vectors
         __m256 v0 = _mm256_loadu_ps( x );
         __m256 v1 = _mm256_loadu_ps( x + 8 );
@@ -1340,7 +1348,11 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
         // Quantize these floats
         const float d = max_scalar / 127.f;
-        y[i].d = GGML_FP32_TO_FP16(d);
+        if (pack && i < nb4) {
+            y4[i4].d[ir] = GGML_FP32_TO_FP16(d);
+        } else {
+            y[i].d = GGML_FP32_TO_FP16(d);
+        }
         const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
         const __m256 mul = _mm256_set1_ps( id );
@@ -1364,7 +1376,11 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
 #if defined(__AVX2__)
         // Compute the sum of the quants and set y[i].s
-        y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
+        if (i < nb4) {
+            y4[i4].d[ir+4] = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
+        } else {
+            y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
+        }

         // Convert int32 to int16
         i0 = _mm256_packs_epi32( i0, i1 );  // 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
@@ -1378,7 +1394,11 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
         const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
         i0 = _mm256_permutevar8x32_epi32( i0, perm );

-        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
+        if (i < nb4) {
+            _mm256_storeu_si256((__m256i *)y4[i4].qs + ir, i0);
+        } else {
+            _mm256_storeu_si256((__m256i *)y[i].qs, i0);
+        }
 #else
         // Since we don't have in AVX some necessary functions,
         // we split the registers in half and call AVX2 analogs from SSE
diff --git a/iqk_mul_mat.cpp b/iqk_mul_mat.cpp
index 48bfe0e0..4d33e2b4 100644
--- a/iqk_mul_mat.cpp
+++ b/iqk_mul_mat.cpp
@@ -1746,27 +1746,17 @@ struct UnsignedDot {
         return helper.dot(x, y);
     }
 };
-template <typename Q8, typename Dot> struct Sum4 {
+template <typename Q8, typename Q8x4, typename Dot> struct Sum4 {
     Dot dot;
     inline __m256i compute(const __m256i * qx, const Q8 * y) const {
-        if constexpr (std::is_same_v<Q8, block_q8_0>) {
-            const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y;
-            const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y4->qs+0));
-            const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y4->qs+1));
-            const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y4->qs+2));
-            const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y4->qs+3));
-            const __m256i p01 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p0, p1));  // 0,0, 1,1, 0,0, 1,1
-            const __m256i p23 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p2, p3));  // 2,2, 3,3, 2,2, 3,3
-            return _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p01, p23)); // 0,1,2,3, 0,1,2,3
-        } else {
-            const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs));
-            const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs));
-            const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs));
-            const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs));
-            const __m256i p01 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p0, p1));  // 0,0, 1,1, 0,0, 1,1
-            const __m256i p23 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p2, p3));  // 2,2, 3,3, 2,2, 3,3
-            return _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p01, p23)); // 0,1,2,3, 0,1,2,3
-        }
+        const Q8x4 * y4 = (const Q8x4 *)y;
+        const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y4->qs+0));
+        const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y4->qs+1));
+        const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y4->qs+2));
+        const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y4->qs+3));
+        const __m256i p01 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p0, p1));  // 0,0, 1,1, 0,0, 1,1
+        const __m256i p23 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p2, p3));  // 2,2, 3,3, 2,2, 3,3
+        return _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p01, p23)); // 0,1,2,3, 0,1,2,3
     }
 };
@@ -1797,6 +1787,27 @@ struct ScaleHelperQ_0 {
     template <typename Q> inline float prepare1(float d, const Q * y) const { return d*prepare1(y); }
 };

+struct ScaleHelperQ8_1 {
+    template <typename Q>
+    inline __m256 prepare4(const Q * y) {
+        const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y;
+        return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)y4->d));
+    }
+    template <typename Q>
+    inline __m256 prepare4(__m256 other_scales, const Q * y) {
+        return _mm256_mul_ps(other_scales, prepare4<Q>(y));
+    }
+    template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
+        return std::make_pair(GGML_FP16_TO_FP32(y->d), GGML_FP16_TO_FP32(y->m));
+    }
+    template <typename Q> inline std::pair<float, float> prepare1(const std::pair<float, float>& dm, const Q * y) const {
+        return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->m));
+    }
+    std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
+        return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
+    }
+};
+
 struct ScaleHelperQ_1 {
     uint32_t scales8[4];
     const __m128i shuffle = _mm_set_epi16(0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100);
@@ -1895,8 +1906,8 @@ using AccumType0 = AccumT<MinusType0, nrc_y, is_multiple_of_4>;
 template <int nrc_y, bool is_multiple_of_4>
 using AccumType1 = AccumT<MinusType1<nrc_y>, nrc_y, is_multiple_of_4>;

-using Sum4Type0 = Sum4<block_q8_0, SignedDot>;
-using Sum4Type1 = Sum4<block_q8_1, UnsignedDot>;
+using Sum4Type0 = Sum4<block_q8_0, block_q8_0_x4, SignedDot>;
+using Sum4Type1 = Sum4<block_q8_1, block_q8_1_x4, UnsignedDot>;

 template <typename Unpacker, typename Sum4Type, typename AccumType, typename Scales, typename Q8, int nrc_y>
 void mul_mat_qX_q8_Helper(int nb, const void * vx, size_t bx, const DataInfo& info, const Q8 ** y, int nrc_x) {
@@ -1932,11 +1943,11 @@ void mul_mat_qX_1_q8_1_T(int n, const void * vx, size_t bx, const DataInfo& info
     Q8<nrc_y, block_q8_1> q8(info);
     int nb = n/Unpacker::block_size();
     if (nb%4 == 0) {
-        mul_mat_qX_q8_Helper<Unpacker, Sum4Type1, AccumType1<nrc_y, true>, ScaleHelperQ_1, block_q8_1, nrc_y>(
+        mul_mat_qX_q8_Helper<Unpacker, Sum4Type1, AccumType1<nrc_y, true>, ScaleHelperQ8_1, block_q8_1, nrc_y>(
                 nb, vx, bx, info, q8.y, nrc_x
         );
     } else {
-        mul_mat_qX_q8_Helper<Unpacker, Sum4Type1, AccumType1<nrc_y, false>, ScaleHelperQ_1, block_q8_1, nrc_y>(
+        mul_mat_qX_q8_Helper<Unpacker, Sum4Type1, AccumType1<nrc_y, false>, ScaleHelperQ8_1, block_q8_1, nrc_y>(
                 nb, vx, bx, info, q8.y, nrc_x
         );
     }
```
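The consumer-side payoff is what the new `ScaleHelperQ8_1::prepare4` exploits: with `block_q8_1_x4`, the eight fp16 scale values of a four-block group come from one contiguous 128-bit load instead of being gathered from four separate blocks. A sketch of the contrast (the `block_q8_1` definition and the scattered variant are assumptions for illustration, standing in for what the old `ScaleHelperQ_1` path assembles with shuffles):

```c
#include <immintrin.h>
#include <stdint.h>

typedef uint16_t ggml_half;

typedef struct {                  // classic q8_1 block, as used in the diff
    ggml_half d, s;               // delta and d*sum(qs)
    int8_t    qs[32];
} block_q8_1;

// Packed: one 128-bit load of d[0..7] followed by a single fp16->fp32
// conversion (requires F16C, which this AVX2 code path already uses).
static inline __m256 scales_packed(const ggml_half *d8) {
    return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)d8));
}

// Scattered: the same eight halves pulled from four separate blocks,
// illustrating the extra gathering work the packed layout removes.
static inline __m256 scales_scattered(const block_q8_1 *y) {
    ggml_half tmp[8];
    for (int k = 0; k < 4; ++k) { tmp[k] = y[k].d; tmp[k + 4] = y[k].s; }
    return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)tmp));
}
```

Both return the same eight floats; the packed form simply trades four strided touches for a single 16-byte load, which adds up in a loop that runs once per group of four blocks.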