From cd3d8ae0e719b47fb0ef63b0f7b9e1dacbab7de1 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Sat, 8 Jun 2024 09:02:23 +0300
Subject: iqk_mul_mat: use block_q8_1_x4 also for AVX2

Here the performance gain is more significant. E.g., for q4_1,
PP-512 becomes 168 t/s up from 137 t/s.

Now the performance gap to q4_0 is so significant that I wonder
if I should change to using Q8_1 also for the qX_0 legacy quants.
---
 iqk_mul_mat.cpp | 57 ++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 34 insertions(+), 23 deletions(-)

(limited to 'iqk_mul_mat.cpp')

diff --git a/iqk_mul_mat.cpp b/iqk_mul_mat.cpp
index 48bfe0e0..4d33e2b4 100644
--- a/iqk_mul_mat.cpp
+++ b/iqk_mul_mat.cpp
@@ -1746,27 +1746,17 @@ struct UnsignedDot {
         return helper.dot(x, y);
     }
 };
-template <typename Q8, typename Dot> struct Sum4 {
+template <typename Q8, typename Q8x4, typename Dot> struct Sum4 {
     Dot dot;
     inline __m256i compute(const __m256i * qx, const Q8 * y) const {
-        if constexpr (std::is_same_v<Q8, block_q8_0>) {
-            const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y;
-            const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y4->qs+0));
-            const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y4->qs+1));
-            const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y4->qs+2));
-            const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y4->qs+3));
-            const __m256i p01 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p0, p1));  // 0,0, 1,1, 0,0, 1,1
-            const __m256i p23 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p2, p3));  // 2,2, 3,3, 2,2, 3,3
-            return _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p01, p23));             // 0,1,2,3, 0,1,2,3
-        } else {
-            const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs));
-            const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs));
-            const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs));
-            const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs));
-            const __m256i p01 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p0, p1));  // 0,0, 1,1, 0,0, 1,1
-            const __m256i p23 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p2, p3));  // 2,2, 3,3, 2,2, 3,3
-            return _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p01, p23));             // 0,1,2,3, 0,1,2,3
-        }
+        const Q8x4 * y4 = (const Q8x4 *)y;
+        const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y4->qs+0));
+        const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y4->qs+1));
+        const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y4->qs+2));
+        const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y4->qs+3));
+        const __m256i p01 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p0, p1));  // 0,0, 1,1, 0,0, 1,1
+        const __m256i p23 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p2, p3));  // 2,2, 3,3, 2,2, 3,3
+        return _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p01, p23));             // 0,1,2,3, 0,1,2,3
     }
 };
 
@@ -1797,6 +1787,27 @@ struct ScaleHelperQ_0 {
     template <typename Q> inline float prepare1(float d, const Q * y) const { return d*prepare1(y); }
 };
 
+struct ScaleHelperQ8_1 {
+    template <typename Q>
+    inline __m256 prepare4(const Q * y) {
+        const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y;
+        return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)y4->d));
+    }
+    template <typename Q>
+    inline __m256 prepare4(__m256 other_scales, const Q * y) {
+        return _mm256_mul_ps(other_scales, prepare4(y));
+    }
+    template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
+        return std::make_pair(GGML_FP16_TO_FP32(y->d), GGML_FP16_TO_FP32(y->m));
+    }
+    template <typename Q> inline std::pair<float, float> prepare1(const std::pair<float, float>& dm, const Q * y) const {
+        return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->m));
+    }
+    std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
+        return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
+    }
+};
+
 struct ScaleHelperQ_1 {
     uint32_t scales8[4];
     const __m128i shuffle = _mm_set_epi16(0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100);
@@ -1895,8 +1906,8 @@ using AccumType0 = AccumT<MinusType0, nrc_y, is_multiple_of_4>;
 template <int nrc_y, bool is_multiple_of_4>
 using AccumType1 = AccumT<MinusType1<nrc_y>, nrc_y, is_multiple_of_4>;
 
-using Sum4Type0 = Sum4<block_q8_0, SignedDot>;
-using Sum4Type1 = Sum4<block_q8_1, UnsignedDot>;
+using Sum4Type0 = Sum4<block_q8_0, block_q8_0_x4, SignedDot>;
+using Sum4Type1 = Sum4<block_q8_1, block_q8_1_x4, UnsignedDot>;
 
 template <typename Unpacker, typename AccumType, typename Scales, typename Q8, int nrc_y>
 void mul_mat_qX_q8_Helper(int nb, const void * vx, size_t bx, const DataInfo& info, const Q8 ** y, int nrc_x) {
@@ -1932,11 +1943,11 @@ void mul_mat_qX_1_q8_1_T(int n, const void * vx, size_t bx, const DataInfo& info
     Q8<nrc_y, block_q8_1> q8(info);
     int nb = n/Unpacker::block_size();
     if (nb%4 == 0) {
-        mul_mat_qX_q8_Helper<Unpacker, AccumType1<nrc_y, true>, ScaleHelperQ_1, block_q8_1, nrc_y>(
+        mul_mat_qX_q8_Helper<Unpacker, AccumType1<nrc_y, true>, ScaleHelperQ8_1, block_q8_1, nrc_y>(
             nb, vx, bx, info, q8.y, nrc_x
         );
     } else {
-        mul_mat_qX_q8_Helper<Unpacker, AccumType1<nrc_y, false>, ScaleHelperQ_1, block_q8_1, nrc_y>(
+        mul_mat_qX_q8_Helper<Unpacker, AccumType1<nrc_y, false>, ScaleHelperQ8_1, block_q8_1, nrc_y>(
             nb, vx, bx, info, q8.y, nrc_x
        );
    }
--
cgit v1.2.3
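
The gain comes mainly from the new ScaleHelperQ8_1::prepare4: with four q8_1 blocks packed into one block_q8_1_x4, all eight fp16 scale values (four d's and four m's) can be fetched with a single 128-bit load and one _mm256_cvtph_ps, instead of converting them one at a time per block. The standalone sketch below illustrates only that load pattern; the struct layout, the name block_q8_1_x4_sketch, and the QK8_1 value are assumptions made for illustration, not the definitions used in iqk_mul_mat.cpp.

// Minimal sketch of the "x4" scale load (build with: g++ -O2 -mavx2 -mf16c sketch.cpp).
// block_q8_1_x4_sketch is a hypothetical stand-in for block_q8_1_x4; the real layout may differ.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

constexpr int QK8_1 = 32;                 // assumed q8_1 block size, as in ggml

struct block_q8_1_x4_sketch {
    uint16_t d[8];                        // fp16 scales: d[0..3] followed by m[0..3] (assumed order)
    int8_t   qs[4 * QK8_1];               // quants of 4 consecutive blocks, stored back to back
};

int main() {
    block_q8_1_x4_sketch blk{};

    // Fill the eight fp16 scale slots with recognizable values 1..8.
    float vals[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    __m128i packed = _mm256_cvtps_ph(_mm256_loadu_ps(vals), _MM_FROUND_TO_NEAREST_INT);
    _mm_storeu_si128((__m128i *)blk.d, packed);

    // One load + one conversion yields all four d's and all four m's as floats,
    // replacing eight separate fp16-to-fp32 conversions per group of four blocks.
    __m256 scales = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)blk.d));

    float out[8];
    _mm256_storeu_ps(out, scales);
    for (int i = 0; i < 8; ++i) printf("%g ", out[i]);
    printf("\n");                         // expected output: 1 2 3 4 5 6 7 8
    return 0;
}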