diff options
author | Iwan Kawrakow <iwan.kawrakow@gmail.com> | 2024-06-05 08:58:36 +0300 |
---|---|---|
committer | Iwan Kawrakow <iwan.kawrakow@gmail.com> | 2024-06-22 12:02:49 +0300 |
commit | cb063a2a20b4c8372cd161a41e3709b519f03caa (patch) | |
tree | c1bccb2f44a1afefbbcc09cc34ec99e7a49785d8 | |
parent | 61b8cc1ff6105c6e82e2e71b68a3206bdf84bdfc (diff) |
iqk_mul_mat: experimenting with zen4 (iq3_s and iq2_m)
-rw-r--r-- | iqk_mul_mat.cpp | 313 |
1 file changed, 247 insertions, 66 deletions
diff --git a/iqk_mul_mat.cpp b/iqk_mul_mat.cpp index 75221048..1974249c 100644 --- a/iqk_mul_mat.cpp +++ b/iqk_mul_mat.cpp @@ -430,6 +430,58 @@ inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, } } +struct SignHelper { + inline __m256i make_signs(uint32_t sign_bits) const { + auto aux256 = _mm256_set1_epi32(sign_bits); + aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256, mask1), mask2); + return _mm256_or_si256(_mm256_cmpeq_epi8(aux256, mask2), mone); + } +// inline __m256i make_signs(const uint16_t * sign_bits) const { +//#ifdef HAVE_FANCY_SIMD +//#else +// return make_signs(sign_bits[0] | (sign_bits[1] << 16)); +//#endif +// } + inline __m256i sign_value(const uint16_t * sign_bits, const __m256i& value) const { +#ifdef HAVE_FANCY_SIMD + const __mmask32 * mask = (const __mmask32 *)sign_bits; + return _mm256_mask_sub_epi8(value, mask[0], _mm256_setzero_si256(), value); +#else + return _mm256_sign_epi8(value, make_signs(sign_bits[0] | (sign_bits[1] << 16))); +#endif + } + inline void sign_4_values(const uint16_t * sign_bits, __m256i * values) const { +#ifdef HAVE_FANCY_SIMD + const __mmask32 * mask = (const __mmask32 *)sign_bits; + values[0] = _mm256_mask_sub_epi8(values[0], mask[0], _mm256_setzero_si256(), values[0]); + values[1] = _mm256_mask_sub_epi8(values[1], mask[1], _mm256_setzero_si256(), values[1]); + values[2] = _mm256_mask_sub_epi8(values[2], mask[2], _mm256_setzero_si256(), values[2]); + values[3] = _mm256_mask_sub_epi8(values[3], mask[3], _mm256_setzero_si256(), values[3]); +#else + auto s128 = _mm_loadu_si128((const __m128i *)sign_bits); + auto s256 = MM256_SET_M128I(s128, s128); + __m256i aux256; + auto shuffle = mask1; + auto step = _mm256_set1_epi8(4); + aux256 = _mm256_and_si256(_mm256_shuffle_epi8(s256, shuffle), mask2); shuffle = _mm256_add_epi8(shuffle, step); + values[0] = _mm256_sign_epi8(values[0], _mm256_or_si256(_mm256_cmpeq_epi8(aux256, mask2), mone)); + aux256 = _mm256_and_si256(_mm256_shuffle_epi8(s256, 
shuffle), mask2); shuffle = _mm256_add_epi8(shuffle, step); + values[1] = _mm256_sign_epi8(values[1], _mm256_or_si256(_mm256_cmpeq_epi8(aux256, mask2), mone)); + aux256 = _mm256_and_si256(_mm256_shuffle_epi8(s256, shuffle), mask2); shuffle = _mm256_add_epi8(shuffle, step); + values[2] = _mm256_sign_epi8(values[2], _mm256_or_si256(_mm256_cmpeq_epi8(aux256, mask2), mone)); + aux256 = _mm256_and_si256(_mm256_shuffle_epi8(s256, shuffle), mask2); shuffle = _mm256_add_epi8(shuffle, step); + values[3] = _mm256_sign_epi8(values[3], _mm256_or_si256(_mm256_cmpeq_epi8(aux256, mask2), mone)); +#endif + } + const __m256i mask1 = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000); + const __m256i mask2 = _mm256_set1_epi64x(0x8040201008040201ull); + const __m256i mone = _mm256_set1_epi8(1); +}; + +struct SimpleBits { + __m256i values[4]; +}; + #ifdef HAVE_FANCY_SIMD //====================================== Zen4 ================================================== @@ -666,6 +718,125 @@ struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> { }; +//struct SimpleBitsAVX512 { +// __m512i values[4]; +//}; +// +//struct SignHelperAVX512 { +// inline void sign_2_values(const uint16_t * sign_bits, __m512i * values) const { +// const __mmask64 * mask = (const __mmask64 *)sign_bits; +// values[0] = _mm512_mask_sub_epi8(values[0], mask[0], _mm512_setzero_si512(), values[0]); +// values[1] = _mm512_mask_sub_epi8(values[1], mask[1], _mm512_setzero_si512(), values[1]); +// //auto minus = _mm512_set1_epi8(-1); +// //auto neg_value = _mm512_sub_epi8(_mm512_xor_si512(values[0], minus), minus); +// //values[0] = _mm512_mask_blend_epi8(mask[0], values[0], neg_value); +// //neg_value = _mm512_sub_epi8(_mm512_xor_si512(values[1], minus), minus); +// //values[1] = _mm512_mask_blend_epi8(mask[1], values[1], neg_value); +// } +//}; +// +//struct DequantizerIQ3S final : public BaseDequantizer<block_iq3_s> { +// DequantizerIQ3S(const void * vx, size_t 
bx) : BaseDequantizer(vx, bx) {} +// +// constexpr static int num_blocks = 8; +// +// inline __m128i make_scales(int i, float& dd) const { +// dd = GGML_FP16_TO_FP32(x[i].d); +// uint32_t aux32[2]; +// std::memcpy(aux32, x[i].scales, 4); +// aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f; +// aux32[0] &= 0x0f0f0f0f; +// auto scales8 = _mm_shuffle_epi8(_mm_loadl_epi64((const __m128i *)aux32), _mm_set1_epi64x(0x0703060205010400)); +// auto scales16 = _mm256_castsi256_si128(_mm256_cvtepi8_epi16(scales8)); +// return _mm_or_si128(_mm_slli_epi16(scales16, 1), _mm_set1_epi16(1)); +// } +// template <typename Q8> +// inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) { +// prepare(i); +// auto scales16 = make_scales(i, d); +// scb.accum_mins(scales16, q8, i, -minv*d, accd); +// auto scales256 = MM256_SET_M128I(scales16, scales16); +// auto all_scales = _mm512_inserti32x8(_mm512_castsi256_si512(scales256), scales256, 1); +// scales[0] = _mm512_shuffle_epi8(all_scales, shuffles512[0]); +// scales[1] = _mm512_shuffle_epi8(all_scales, shuffles512[1]); +// } +// +// union index_t { +// __m512i vec; +// uint32_t val[16]; +// }; +// +// inline static __m512i make1(const uint8_t * qs, const uint8_t * qh, const __m512i& idx_shift, const __m512i& idx_mask) { +// auto idx_l = _mm512_cvtepu8_epi32(_mm_loadu_si128((const __m128i *)qs)); +// auto idx_h = _mm512_inserti32x8(_mm512_castsi256_si512(_mm256_set1_epi32(qh[0])), _mm256_set1_epi32(qh[1]), 1); +// idx_h = _mm512_and_si512(_mm512_sllv_epi32(idx_h, idx_shift), idx_mask); +// index_t idx; idx.vec = _mm512_or_si512(idx_l, idx_h); +// return _mm512_set_epi32(iq3s_grid[idx.val[15]], iq3s_grid[idx.val[14]], iq3s_grid[idx.val[13]], iq3s_grid[idx.val[12]], +// iq3s_grid[idx.val[11]], iq3s_grid[idx.val[10]], iq3s_grid[idx.val[ 9]], iq3s_grid[idx.val[ 8]], +// iq3s_grid[idx.val[ 7]], iq3s_grid[idx.val[ 6]], iq3s_grid[idx.val[ 5]], iq3s_grid[idx.val[ 4]], +// iq3s_grid[idx.val[ 3]], iq3s_grid[idx.val[ 2]], 
iq3s_grid[idx.val[ 1]], iq3s_grid[idx.val[ 0]]); +// ////index_t idx1, idx2; +// ////auto idx_l = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)qs)); +// ////auto idx_h = _mm256_and_si256(_mm256_sllv_epi32(_mm256_set1_epi32(qh[0]), idx_shift), idx_mask); +// ////idx1.vec = _mm256_or_si256(idx_h, idx_l); +// ////idx_l = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)(qs + 8))); +// ////idx_h = _mm256_and_si256(_mm256_sllv_epi32(_mm256_set1_epi32(qh[1]), idx_shift), idx_mask); +// ////idx2.vec = _mm256_or_si256(idx_h, idx_l); +// ////return _mm512_set_epi32(iq3s_grid[idx2.val[7]], iq3s_grid[idx2.val[6]], iq3s_grid[idx2.val[5]], iq3s_grid[idx2.val[4]], +// //// iq3s_grid[idx2.val[3]], iq3s_grid[idx2.val[2]], iq3s_grid[idx2.val[1]], iq3s_grid[idx2.val[0]], +// //// iq3s_grid[idx1.val[7]], iq3s_grid[idx1.val[6]], iq3s_grid[idx1.val[5]], iq3s_grid[idx1.val[4]], +// //// iq3s_grid[idx1.val[3]], iq3s_grid[idx1.val[2]], iq3s_grid[idx1.val[1]], iq3s_grid[idx1.val[0]]); +// //////return _mm512_inserti32x8(value, val, 1); +// //index_t idx; +// //auto idx_l = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)qs)); +// //auto idx_h = _mm256_and_si256(_mm256_sllv_epi32(_mm256_set1_epi32(qh[0]), idx_shift), idx_mask); +// //idx.vec = _mm256_or_si256(idx_h, idx_l); +// //auto val = _mm256_set_epi32(iq3s_grid[idx.val[7]], iq3s_grid[idx.val[6]], iq3s_grid[idx.val[5]], iq3s_grid[idx.val[4]], +// // iq3s_grid[idx.val[3]], iq3s_grid[idx.val[2]], iq3s_grid[idx.val[1]], iq3s_grid[idx.val[0]]); +// //auto value = _mm512_inserti32x8(_mm512_setzero_si512(), val, 0); +// //idx_l = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)(qs + 8))); +// //idx_h = _mm256_and_si256(_mm256_sllv_epi32(_mm256_set1_epi32(qh[1]), idx_shift), idx_mask); +// //idx.vec = _mm256_or_si256(idx_h, idx_l); +// //val = _mm256_set_epi32(iq3s_grid[idx.val[7]], iq3s_grid[idx.val[6]], iq3s_grid[idx.val[5]], iq3s_grid[idx.val[4]], +// // iq3s_grid[idx.val[3]], iq3s_grid[idx.val[2]], 
iq3s_grid[idx.val[1]], iq3s_grid[idx.val[0]]); +// //return _mm512_inserti32x8(value, val, 1); +// } +// +// inline void prepare(int i) { +// prepare_unsigned(i); +// auto signs = (const uint16_t *)x[i].signs; +// sh.sign_2_values(signs+0, bits.values+0); +// sh.sign_2_values(signs+8, bits.values+2); +// auto min_value = _mm512_set1_epi8(minv); +// for (int k = 0; k < 4; ++k) bits.values[k] = _mm512_add_epi8(bits.values[k], min_value); +// } +// +// inline void prepare_unsigned(int i) { +// auto qs = x[i].qs; +// auto qh = x[i].qh; +// bits.values[0] = make1(qs+ 0, qh+0, idx_shift, idx_mask); +// bits.values[1] = make1(qs+16, qh+2, idx_shift, idx_mask); +// bits.values[2] = make1(qs+32, qh+4, idx_shift, idx_mask); +// bits.values[3] = make1(qs+48, qh+6, idx_shift, idx_mask); +// } +// +// constexpr static int minv = 16; +// +// SimpleBitsAVX512 bits; +// SignHelperAVX512 sh; +// Scales8KBase scb; +// const __m512i idx_shift = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); +// const __m512i idx_mask = _mm512_set1_epi32(256); +// //const __m256i min_value = _mm256_set1_epi8(minv); +// const __m512i shuffles512[2] = { +// _mm512_set_epi64(0x0706070607060706, 0x0302030203020302, 0x0706070607060706, 0x0302030203020302, +// 0x0504050405040504, 0x0100010001000100, 0x0504050405040504, 0x0100010001000100), +// _mm512_set_epi64(0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a, 0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a, +// 0x0d0c0d0c0d0c0d0c, 0x0908090809080908, 0x0d0c0d0c0d0c0d0c, 0x0908090809080908) +// }; +// +//}; + template <typename Dequantizer, int nrc_y> static void mul_mat_qX_K_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { assert(n % QK_K == 0); @@ -1011,6 +1182,7 @@ static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInf } } + #endif // Zen4 or vanilla AVX2 template <typename Bits> @@ -1129,41 +1301,49 @@ static void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataIn } } -struct 
SimpleBits { - __m256i values[4]; -}; - -struct SignHelper { - inline __m256i make_signs(uint32_t sign_bits) const { - auto aux256 = _mm256_set1_epi32(sign_bits); - aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256, mask1), mask2); - return _mm256_or_si256(_mm256_cmpeq_epi8(aux256, mask2), mone); - } - inline __m256i make_signs(const uint16_t * sign_bits) const { - return make_signs(sign_bits[0] | (sign_bits[1] << 16)); - //auto aux256 = _mm256_set1_epi32(sign_bits[0] | (sign_bits[1] << 16)); - //aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256, mask1), mask2); - //return _mm256_or_si256(_mm256_cmpeq_epi8(aux256, mask2), mone); - } - inline void sign_4_values(const uint16_t * sign_bits, __m256i * values) const { - auto s128 = _mm_loadu_si128((const __m128i *)sign_bits); - auto s256 = MM256_SET_M128I(s128, s128); - __m256i aux256; - auto shuffle = mask1; - auto step = _mm256_set1_epi8(4); - aux256 = _mm256_and_si256(_mm256_shuffle_epi8(s256, shuffle), mask2); shuffle = _mm256_add_epi8(shuffle, step); - values[0] = _mm256_sign_epi8(values[0], _mm256_or_si256(_mm256_cmpeq_epi8(aux256, mask2), mone)); - aux256 = _mm256_and_si256(_mm256_shuffle_epi8(s256, shuffle), mask2); shuffle = _mm256_add_epi8(shuffle, step); - values[1] = _mm256_sign_epi8(values[1], _mm256_or_si256(_mm256_cmpeq_epi8(aux256, mask2), mone)); - aux256 = _mm256_and_si256(_mm256_shuffle_epi8(s256, shuffle), mask2); shuffle = _mm256_add_epi8(shuffle, step); - values[2] = _mm256_sign_epi8(values[2], _mm256_or_si256(_mm256_cmpeq_epi8(aux256, mask2), mone)); - aux256 = _mm256_and_si256(_mm256_shuffle_epi8(s256, shuffle), mask2); shuffle = _mm256_add_epi8(shuffle, step); - values[3] = _mm256_sign_epi8(values[3], _mm256_or_si256(_mm256_cmpeq_epi8(aux256, mask2), mone)); +//#ifdef HAVE_FANCY_SIMD +// Strangely enough, the following implementation makes PP ~6% slower and TG ~6% faster +// compared to the vanilla AVX2 version below. 
+//struct IndexHelperIQ3S { +// union index_t { +// __m256i vec; +// uint16_t val[16]; +// }; +// inline void make2(const uint8_t * qs, const uint8_t * qh, __m256i * values) const { +// auto idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); +// const __mmask16 * m16 = (const __mmask16 *)qh; +// index_t idx; +// idx.vec = _mm256_mask_add_epi16(idx_l, m16[0], idx_l, offset); +// values[0] = _mm256_set_epi32(iq3s_grid[idx.val[ 7]], iq3s_grid[idx.val[ 6]], iq3s_grid[idx.val[ 5]], iq3s_grid[idx.val[ 4]], +// iq3s_grid[idx.val[ 3]], iq3s_grid[idx.val[ 2]], iq3s_grid[idx.val[ 1]], iq3s_grid[idx.val[ 0]]); +// values[1] = _mm256_set_epi32(iq3s_grid[idx.val[15]], iq3s_grid[idx.val[14]], iq3s_grid[idx.val[13]], iq3s_grid[idx.val[12]], +// iq3s_grid[idx.val[11]], iq3s_grid[idx.val[10]], iq3s_grid[idx.val[ 9]], iq3s_grid[idx.val[ 8]]); +// } +// const __m256i offset = _mm256_set1_epi16(256); +//}; +//#else +struct IndexHelperIQ3S { + union index_t { + __m256i vec; + uint32_t val[8]; + }; + inline void make2(const uint8_t * qs, const uint8_t * qh, __m256i * values) const { + index_t idx; + auto idx_l = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)qs)); + auto idx_h = _mm256_and_si256(_mm256_sllv_epi32(_mm256_set1_epi32(qh[0]), idx_shift), idx_mask); + idx.vec = _mm256_or_si256(idx_h, idx_l); + values[0] = _mm256_set_epi32(iq3s_grid[idx.val[7]], iq3s_grid[idx.val[6]], iq3s_grid[idx.val[5]], iq3s_grid[idx.val[4]], + iq3s_grid[idx.val[3]], iq3s_grid[idx.val[2]], iq3s_grid[idx.val[1]], iq3s_grid[idx.val[0]]); + idx_l = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)(qs+8))); + idx_h = _mm256_and_si256(_mm256_sllv_epi32(_mm256_set1_epi32(qh[1]), idx_shift), idx_mask); + idx.vec = _mm256_or_si256(idx_h, idx_l); + values[1] = _mm256_set_epi32(iq3s_grid[idx.val[7]], iq3s_grid[idx.val[6]], iq3s_grid[idx.val[5]], iq3s_grid[idx.val[4]], + iq3s_grid[idx.val[3]], iq3s_grid[idx.val[2]], iq3s_grid[idx.val[1]], iq3s_grid[idx.val[0]]); } - const __m256i mask1 = 
_mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000); - const __m256i mask2 = _mm256_set1_epi64x(0x8040201008040201ull); - const __m256i mone = _mm256_set1_epi8(1); + const __m256i idx_mask = _mm256_set1_epi32(256); + const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); }; +//#endif struct DequantizerIQ3S final : public BaseDequantizer<block_iq3_s> { DequantizerIQ3S(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} @@ -1191,25 +1371,6 @@ struct DequantizerIQ3S final : public BaseDequantizer<block_iq3_s> { scales[0] = MM256_SET_M128I(scales16, scales16); } - union index_t { - __m256i vec; - uint32_t val[8]; - }; - - inline static void make2(const uint8_t * qs, const uint8_t * qh, __m256i * values, const __m256i& idx_shift, const __m256i& idx_mask) { - index_t idx; - auto idx_l = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)qs)); - auto idx_h = _mm256_and_si256(_mm256_sllv_epi32(_mm256_set1_epi32(qh[0]), idx_shift), idx_mask); - idx.vec = _mm256_or_si256(idx_h, idx_l); - values[0] = _mm256_set_epi32(iq3s_grid[idx.val[7]], iq3s_grid[idx.val[6]], iq3s_grid[idx.val[5]], iq3s_grid[idx.val[4]], - iq3s_grid[idx.val[3]], iq3s_grid[idx.val[2]], iq3s_grid[idx.val[1]], iq3s_grid[idx.val[0]]); - idx_l = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)(qs + 8))); - idx_h = _mm256_and_si256(_mm256_sllv_epi32(_mm256_set1_epi32(qh[1]), idx_shift), idx_mask); - idx.vec = _mm256_or_si256(idx_h, idx_l); - values[1] = _mm256_set_epi32(iq3s_grid[idx.val[7]], iq3s_grid[idx.val[6]], iq3s_grid[idx.val[5]], iq3s_grid[idx.val[4]], - iq3s_grid[idx.val[3]], iq3s_grid[idx.val[2]], iq3s_grid[idx.val[1]], iq3s_grid[idx.val[0]]); - } - inline void prepare(int i, int j) { prepare_unsigned(i, j); sh.sign_4_values((const uint16_t *)x[i].signs + 8*j, bits.values); @@ -1225,8 +1386,8 @@ struct DequantizerIQ3S final : public BaseDequantizer<block_iq3_s> { inline void prepare_unsigned(int i, int j) { auto qs = x[i].qs + 
32*j; auto qh = x[i].qh + 4*j; - make2(qs+ 0, qh+0, bits.values+0, idx_shift, idx_mask); - make2(qs+16, qh+2, bits.values+2, idx_shift, idx_mask); + helper.make2(qs+ 0, qh+0, bits.values+0); + helper.make2(qs+16, qh+2, bits.values+2); } constexpr static int minv = 16; @@ -1234,8 +1395,7 @@ struct DequantizerIQ3S final : public BaseDequantizer<block_iq3_s> { SimpleBits bits; SignHelper sh; Scales8KBase scb; - const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - const __m256i idx_mask = _mm256_set1_epi32(256); + IndexHelperIQ3S helper; const __m256i min_value = _mm256_set1_epi8(minv); }; @@ -1366,8 +1526,8 @@ struct DequantizerIQ2S final : public BaseDequantizer<block_iq2_s> { inline static void make2_signed(const SignHelper& sh, const uint8_t * qs, const uint8_t * qh, const uint16_t * sidx, const __m256i& idx_shift, const __m256i& idx_mask, const __m256i& min_value, __m256i * values) { make2(qs, qh, idx_shift, idx_mask, values); - values[0] = _mm256_add_epi8(_mm256_sign_epi8(values[0], sh.make_signs(sidx[0] | (sidx[1] << 16))), min_value); - values[1] = _mm256_add_epi8(_mm256_sign_epi8(values[1], sh.make_signs(sidx[2] | (sidx[3] << 16))), min_value); + values[0] = _mm256_add_epi8(sh.sign_value(sidx+0, values[0]), min_value); + values[1] = _mm256_add_epi8(sh.sign_value(sidx+2, values[1]), min_value); } inline void prepare(int i, int j) { @@ -1962,17 +2122,38 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) { m.funcs[6] = mul_mat_qX_1_q8_1_T<Dequantizer, 7>; m.funcs[7] = mul_mat_qX_1_q8_1_T<Dequantizer, 8>; } +// else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ3S>) { +//#ifdef HAVE_FANCY_SIMD +// m.funcs[0] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 1>; +// m.funcs[1] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 2>; +// m.funcs[2] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 3>; +// m.funcs[3] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 4>; +// m.funcs[4] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 5>; +// m.funcs[5] = 
mul_mat_qX_K_q8_K_AVX512<Dequantizer, 6>; +// m.funcs[6] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 7>; +// m.funcs[7] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 8>; +//#else +// m.funcs[0] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 1>; +// m.funcs[1] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 2>; +// m.funcs[2] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 3>; +// m.funcs[3] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 4>; +// m.funcs[4] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 5>; +// m.funcs[5] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 6>; +// m.funcs[6] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 7>; +// m.funcs[7] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 8>; +//#endif +// } else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ3S> || std::is_same_v<Dequantizer, DequantizerIQ3XXS> || std::is_same_v<Dequantizer, DequantizerIQ2S> || std::is_same_v<Dequantizer, DequantizerIQ2XS> || std::is_same_v<Dequantizer, DequantizerIQ2XXS>) { - m.funcs[0] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 1>; - m.funcs[1] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 2>; - m.funcs[2] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 3>; - m.funcs[3] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 4>; - m.funcs[4] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 5>; - m.funcs[5] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 6>; - m.funcs[6] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 7>; - m.funcs[7] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 8>; + m.funcs[0] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 1>; + m.funcs[1] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 2>; + m.funcs[2] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 3>; + m.funcs[3] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 4>; + m.funcs[4] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 5>; + m.funcs[5] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 6>; + m.funcs[6] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 7>; + m.funcs[7] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 8>; } else { #ifdef HAVE_FANCY_SIMD |