diff options
author | Iwan Kawrakow <iwan.kawrakow@gmail.com> | 2024-06-05 09:38:29 +0300 |
---|---|---|
committer | Iwan Kawrakow <iwan.kawrakow@gmail.com> | 2024-06-22 12:02:49 +0300 |
commit | dc96d5484fc0d7626181a6f06dd4f6c0e912cb68 (patch) | |
tree | 17e45c739e325ed782829f5b7a6285b54cc7988d | |
parent | cb063a2a20b4c8372cd161a41e3709b519f03caa (diff) |
iqk_mul_mat: experimenting with zen4 (iq2_xs)
-rw-r--r-- | iqk_mul_mat.cpp | 30 |
1 files changed, 22 insertions, 8 deletions
diff --git a/iqk_mul_mat.cpp b/iqk_mul_mat.cpp index 1974249c..456ed6d9 100644 --- a/iqk_mul_mat.cpp +++ b/iqk_mul_mat.cpp @@ -1628,7 +1628,18 @@ struct DequantizerIQ2XS final : public BaseDequantizer<block_iq2_xs> { signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, mask), mask); value = _mm256_sign_epi8(value, _mm256_or_si256(signs, mone)); } - inline static void sign_values(const __m256i& data, const Helper& helper, __m256i * values) { + inline void sign_values(const __m256i& data, __m256i * values) const { +#ifdef HAVE_FANCY_SIMD + auto partial_bits = _mm256_cvtepi16_epi8(_mm256_srli_epi16(data, 9)); + auto pcnt = _mm_popcnt_epi8(partial_bits); + auto full_bits = _mm_or_si128(partial_bits, _mm_slli_epi16(_mm_and_si128(pcnt, _mm_set1_epi8(1)), 7)); + const __mmask32 * m32 = (const __mmask32 *)&full_bits; + auto zero = _mm256_setzero_si256(); + values[0] = _mm256_mask_sub_epi8(values[0], m32[0], zero, values[0]); + values[1] = _mm256_mask_sub_epi8(values[1], m32[1], zero, values[1]); + values[2] = _mm256_mask_sub_epi8(values[2], m32[2], zero, values[2]); + values[3] = _mm256_mask_sub_epi8(values[3], m32[3], zero, values[3]); +#else auto psb1 = _mm256_srli_epi16(data, 9); auto psb2 = _mm256_srli_epi16(data, 13); auto psbc = _mm256_xor_si256(psb1, psb2); @@ -1642,33 +1653,36 @@ struct DequantizerIQ2XS final : public BaseDequantizer<block_iq2_xs> { sign_value(full_1, helper.shuff2, helper.mask, helper.mone, values[1]); sign_value(full_2, helper.shuff1, helper.mask, helper.mone, values[2]); sign_value(full_2, helper.shuff2, helper.mask, helper.mone, values[3]); +#endif } - inline static void make4_signed(const Helper& helper, const uint16_t * qs, const __m256i& m511, - const __m256i& min_value, __m256i * values) { + inline void make4_signed(const uint16_t * qs, const __m256i& m511, + const __m256i& min_value, __m256i * values) const { auto q2 = _mm256_loadu_si256((const __m256i *)qs); make4(q2, m511, values); - sign_values(q2, helper, values); + sign_values(q2, values); for (int k = 0; k < 4; ++k) values[k] = _mm256_add_epi8(values[k], min_value); } - inline static void make4(const Helper& helper, const uint16_t * qs, const __m256i& m511, __m256i * values, __m256i * q8) { + inline void make4(const uint16_t * qs, const __m256i& m511, __m256i * values, __m256i * q8) const { auto q2 = _mm256_loadu_si256((const __m256i *)qs); make4(q2, m511, values); - sign_values(q2, helper, q8); + sign_values(q2, q8); } inline void prepare(int i, int j) { - make4_signed(helper, x[i].qs + 16*j, idx_mask, min_value, bits.values); + make4_signed(x[i].qs + 16*j, idx_mask, min_value, bits.values); } template <typename Q8> inline void prepare(int i, int j, const Q8& q8, __m256i * q8_quants) { for (int k = 0; k < 4; ++k) q8_quants[k] = q8.load_quants(0, i, 4*j+k); - make4(helper, x[i].qs + 16*j, idx_mask, bits.values, q8_quants); + make4(x[i].qs + 16*j, idx_mask, bits.values, q8_quants); } constexpr static int minv = 43; SimpleBits bits; +#ifndef HAVE_FANCY_SIMD Helper helper; +#endif const __m256i idx_mask = _mm256_set1_epi16(511); const __m256i min_value = _mm256_set1_epi8(minv); |