From ba392802ef41d7e77092a0f7102fdacf73aaeacf Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Wed, 2 Oct 2024 18:09:47 +0300
Subject: q6_0: Slightly faster Zen4/AVX2 (#78)

* Faster q6_0 on AVX2

PP-512 goes up by 3.4%.

* q6_0: this is slightly better

---------

Co-authored-by: Iwan Kawrakow
---
 ggml/src/iqk/iqk_mul_mat.cpp | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'ggml/src')

diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp
index 0c1c1625..72f1c85b 100644
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -3228,14 +3228,15 @@ struct Q5_1_Dequantizer {
         return _mm256_or_si256(b4.dequant(x->qs), vqh);
     }
 };
-struct Q6_1_Dequantizer {
+struct Q6_0_1_Dequantizer {
     Dequantizer4bit b4;
     const __m256i mh = _mm256_set1_epi8(0x30);
+    const __m256i shift1 = _mm256_set_epi64x(0, 2, 0, 4);
+    const __m256i shift2 = _mm256_set_epi64x(2, 0, 0, 0);
     inline __m256i dequant(const block_q6_0 * x) const {
         uint64_t aux64; std::memcpy(&aux64, x->qh, 8);
-        auto h128 = _mm_set_epi64x(aux64, aux64 << 4);
-        auto h256 = MM256_SET_M128I(_mm_srli_epi16(h128, 2), h128);
-        return _mm256_or_si256(b4.dequant(x->qs), _mm256_and_si256(h256, mh));
+        auto h256 = _mm256_sllv_epi64(_mm256_set1_epi64x(aux64), shift1);
+        return _mm256_or_si256(b4.dequant(x->qs), _mm256_and_si256(_mm256_srlv_epi64(h256, shift2), mh));
     }
 };

@@ -3342,10 +3343,10 @@ struct Q5_1_Unpacker final : public Q_Unpacker, Q6_1_Dequantizer> {
+struct Q6_0_1_Unpacker final : public Q_Unpacker, Q6_0_1_Dequantizer> {
     Q6_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
     using Sum4T = Sum4TypeQ81;
-    inline static int block_size() { return QK5_0; }
+    inline static int block_size() { return QK6_0; }
 };

 // float matrices - we handle f16, bf16 (if native bf16 support is available) and f32, but only to f32 result
--
cgit v1.2.3