author     Kawrakow <iwankawrakow@gmail.com>        2025-03-21 07:23:36 +0100
committer  GitHub <noreply@github.com>              2025-03-21 07:23:36 +0100
commit     b8d1fac97b756968b86b470d44bb1026ded7157a (patch)
tree       5a5893796293475185e833a787648830a7189450 /ggml/src/iqk/iqk_mul_mat.cpp
parent     127c6ee6493a3084995d754d987f0240ffdffe6a (diff)
Convert models to row-interleaved quants using the quantize tool (#272)
* Repack a model with the quantize tool
* WIP
* Fixed various issues
As we don't have a way to tell whether a repacked quant has already been modified,
I had to remove the modification, at the expense of a slight decrease in
performance (see the sketch after the commit message for what that modification
does). This affects q8_0_r8, q8_KV_r8, q8_k_r8 on Zen4, and q4_0_r8 on ARM.
* Create wk_b and wv_b as Q8_0_R8 if the wkv_b type is interleaved
* Fix GCC 13.3 compilation error
* Another one
* Add missing include
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
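
Editor's note on the "remove the modification" item above, for readers going through the diff below: the modification in question is a +127 offset that shifts the signed 8-bit quants into unsigned bytes on the HAVE_FANCY_SIMD path. Before this commit the offset was baked into the row-interleaved data when repacking (the HelperQ80R8/HelperQ8KVR8 hunks, where those adds are now commented out); after it, the offset is applied inside the dot-product kernels instead, so repacked tensors stay bit-identical to the plain quant and no marker is needed to record whether they were already shifted. Below is a minimal scalar sketch of why the two placements give the same result; the names are illustrative, not taken from iqk_mul_mat.cpp, and it assumes the quants stay in [-127, 127] so that x + 127 fits in an unsigned byte.

// Scalar sketch (illustrative, not code from iqk_mul_mat.cpp) of the +127 offset trick:
//   sum_i (x[i] + 127) * y[i]  -  127 * sum_i y[i]  ==  sum_i x[i] * y[i]
// so the offset can be baked into the repacked x (old behaviour) or applied while
// multiplying (new behaviour), as long as the 127*sum(y) correction is subtracted.
#include <cstdint>
#include <cstdio>

static int dot_signed(const int8_t * x, const int8_t * y, int n) {
    int s = 0;
    for (int i = 0; i < n; ++i) s += x[i] * y[i];
    return s;
}

// Offset applied at multiply time. Assumes x[i] in [-127, 127], so x[i] + 127
// fits in an unsigned byte without wrap-around. The real kernels take the
// sum(y) term from precomputed block sums; here it is simply recomputed inline.
static int dot_offset_at_multiply(const int8_t * x, const int8_t * y, int n) {
    int s = 0, sum_y = 0;
    for (int i = 0; i < n; ++i) {
        s     += (uint8_t)(x[i] + 127) * y[i];
        sum_y += y[i];
    }
    return s - 127 * sum_y;
}

int main() {
    const int8_t x[8] = { -127, -3, 0, 5, 17, -42, 127, 9 };
    const int8_t y[8] = {    1, -2, 3, 4, -5,   6,  -7, 8 };
    // Both print -1255.
    printf("%d %d\n", dot_signed(x, y, 8), dot_offset_at_multiply(x, y, 8));
    return 0;
}

In the actual kernels the add is one _mm256_add_epi8/_mm512_add_epi8 per 32- or 64-byte group (visible in the diff below), and the 127*sum(y) correction presumably comes from the block sums the q8_1/q8_K activation blocks already carry, which fits the "slight decrease in performance" noted above.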
Diffstat (limited to 'ggml/src/iqk/iqk_mul_mat.cpp')
-rw-r--r--   ggml/src/iqk/iqk_mul_mat.cpp   71
1 file changed, 43 insertions, 28 deletions
diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp
index 14cc64db..8b6d6b1c 100644
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -4434,14 +4434,17 @@ inline __m256i qx_r8_q8_dot_product(const __m256i * qx, const int8_t * y) {
     return sumi;
 }
 inline __m256i q8_0_r8_dot_product(const uint8_t * x, const int8_t * y, __m256i * qx) {
-    qx[0] = _mm256_loadu_si256((const __m256i *)x+0);
-    qx[1] = _mm256_loadu_si256((const __m256i *)x+1);
-    qx[2] = _mm256_loadu_si256((const __m256i *)x+2);
-    qx[3] = _mm256_loadu_si256((const __m256i *)x+3);
-    qx[4] = _mm256_loadu_si256((const __m256i *)x+4);
-    qx[5] = _mm256_loadu_si256((const __m256i *)x+5);
-    qx[6] = _mm256_loadu_si256((const __m256i *)x+6);
-    qx[7] = _mm256_loadu_si256((const __m256i *)x+7);
+    for (int i = 0; i < 8; ++i) {
+        qx[i] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)x+i), _mm256_set1_epi8(127));
+    }
+    //qx[0] = _mm256_loadu_si256((const __m256i *)x+0);
+    //qx[1] = _mm256_loadu_si256((const __m256i *)x+1);
+    //qx[2] = _mm256_loadu_si256((const __m256i *)x+2);
+    //qx[3] = _mm256_loadu_si256((const __m256i *)x+3);
+    //qx[4] = _mm256_loadu_si256((const __m256i *)x+4);
+    //qx[5] = _mm256_loadu_si256((const __m256i *)x+5);
+    //qx[6] = _mm256_loadu_si256((const __m256i *)x+6);
+    //qx[7] = _mm256_loadu_si256((const __m256i *)x+7);
     return qx_r8_q8_dot_product(qx, y);
 }
 template <int nrc_y>
@@ -4496,6 +4499,7 @@ static void mul_mat_q8_0_r8_q8_1(int n, const void * vx, size_t bx, const DataIn
                 for (int j = 0; j < 8; ++j) {
                     qx[j] = _mm512_inserti32x8(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)q8l[4*ib4+k].qs+j)),
                                                                       _mm256_loadu_si256((const __m256i *)q8h[4*ib4+k].qs+j), 1);
+                    qx[j] = _mm512_add_epi8(qx[j], _mm512_set1_epi8(127));
                 }
                 for (int iy = 0; iy < nrc_y; ++iy) {
                     auto sumi = qx_r8_q8_dot_product(qx, q8.y[iy][ib4].qs+32*k);
@@ -4512,6 +4516,7 @@ static void mul_mat_q8_0_r8_q8_1(int n, const void * vx, size_t bx, const DataIn
             for (int j = 0; j < 8; ++j) {
                 qx[j] = _mm512_inserti32x8(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)q8l[ib].qs+j)),
                                                                   _mm256_loadu_si256((const __m256i *)q8h[ib].qs+j), 1);
+                qx[j] = _mm512_add_epi8(qx[j], _mm512_set1_epi8(127));
             }
             for (int iy = 0; iy < nrc_y; ++iy) {
                 auto qy = (const block_q8_1 *)q8.y[iy];
@@ -6347,6 +6352,11 @@ static void mul_mat_q8_k_r8_q8_k(int n, const void * vx, size_t bx, const DataIn
             auto s1 = _mm256_sign_epi8(qx[1], qx[1]);
             auto s2 = _mm256_sign_epi8(qx[2], qx[2]);
             auto s3 = _mm256_sign_epi8(qx[3], qx[3]);
+#else
+            qx[0] = _mm256_add_epi8(qx[0], _mm256_set1_epi8(127));
+            qx[1] = _mm256_add_epi8(qx[1], _mm256_set1_epi8(127));
+            qx[2] = _mm256_add_epi8(qx[2], _mm256_set1_epi8(127));
+            qx[3] = _mm256_add_epi8(qx[3], _mm256_set1_epi8(127));
 #endif
             for (int iy = 0; iy < nrc_y; ++iy) {
                 auto y128 = _mm_loadu_si128((const __m128i*)q8.y[iy][ibl].qs+ib);
@@ -6425,6 +6435,11 @@ static void mul_mat_q8_KV_r8_q8_KV(int n, const void * vx, size_t bx, const Data
         auto s1 = _mm256_sign_epi8(qx[1], qx[1]);
         auto s2 = _mm256_sign_epi8(qx[2], qx[2]);
         auto s3 = _mm256_sign_epi8(qx[3], qx[3]);
+#else
+        qx[0] = _mm256_add_epi8(qx[0], _mm256_set1_epi8(127));
+        qx[1] = _mm256_add_epi8(qx[1], _mm256_set1_epi8(127));
+        qx[2] = _mm256_add_epi8(qx[2], _mm256_set1_epi8(127));
+        qx[3] = _mm256_add_epi8(qx[3], _mm256_set1_epi8(127));
 #endif
         for (int iy = 0; iy < nrc_y; ++iy) {
             auto y128 = _mm_loadu_si128((const __m128i*)q8y[iy]+ib);
@@ -14305,8 +14320,8 @@ struct Q4_0_R8_Dequantizer {
             float32x4x2_t scales = { vcvt_f32_f16(vget_low_f16(scales16)), vcvt_f32_f16(vget_high_f16(scales16)) };
             for (int j = 0; j < 4; ++j) {
                 auto bits = vld1q_u8_x2(iq4[4*ib4+k].qs + 32*j);
-                //bits.val[0] = veorq_u8(m88, bits.val[0]);
-                //bits.val[1] = veorq_u8(m88, bits.val[1]);
+                bits.val[0] = veorq_u8(m88, bits.val[0]);
+                bits.val[1] = veorq_u8(m88, bits.val[1]);
                 qx[2*j+0] = vshlq_n_u8(bits.val[0], 4);
                 qx[2*j+1] = vandq_u8(bits.val[0], m4);
                 qx[2*j+8] = vshlq_n_u8(bits.val[1], 4);
@@ -15305,12 +15320,12 @@ struct HelperQ80R8 : public BaseHelper<step> {
                 m1 = _mm256_unpackhi_epi64(t0, t1);
                 m2 = _mm256_unpacklo_epi64(t2, t3);
                 m3 = _mm256_unpackhi_epi64(t2, t3);
-#ifdef HAVE_FANCY_SIMD
-                m0 = _mm256_add_epi8(m0, _mm256_set1_epi8(127));
-                m1 = _mm256_add_epi8(m1, _mm256_set1_epi8(127));
-                m2 = _mm256_add_epi8(m2, _mm256_set1_epi8(127));
-                m3 = _mm256_add_epi8(m3, _mm256_set1_epi8(127));
-#endif
+//#ifdef HAVE_FANCY_SIMD
+//                m0 = _mm256_add_epi8(m0, _mm256_set1_epi8(127));
+//                m1 = _mm256_add_epi8(m1, _mm256_set1_epi8(127));
+//                m2 = _mm256_add_epi8(m2, _mm256_set1_epi8(127));
+//                m3 = _mm256_add_epi8(m3, _mm256_set1_epi8(127));
+//#endif
                 _mm256_storeu_si256((__m256i *)y[ib].qs + 0, m0);
                 _mm256_storeu_si256((__m256i *)y[ib].qs + 1, m1);
                 _mm256_storeu_si256((__m256i *)y[ib].qs + 2, m2);
@@ -15327,12 +15342,12 @@ struct HelperQ80R8 : public BaseHelper<step> {
                 m1 = _mm256_unpackhi_epi64(t0, t1);
                 m2 = _mm256_unpacklo_epi64(t2, t3);
                 m3 = _mm256_unpackhi_epi64(t2, t3);
-#ifdef HAVE_FANCY_SIMD
-                m0 = _mm256_add_epi8(m0, _mm256_set1_epi8(127));
-                m1 = _mm256_add_epi8(m1, _mm256_set1_epi8(127));
-                m2 = _mm256_add_epi8(m2, _mm256_set1_epi8(127));
-                m3 = _mm256_add_epi8(m3, _mm256_set1_epi8(127));
-#endif
+//#ifdef HAVE_FANCY_SIMD
+//                m0 = _mm256_add_epi8(m0, _mm256_set1_epi8(127));
+//                m1 = _mm256_add_epi8(m1, _mm256_set1_epi8(127));
+//                m2 = _mm256_add_epi8(m2, _mm256_set1_epi8(127));
+//                m3 = _mm256_add_epi8(m3, _mm256_set1_epi8(127));
+//#endif
                 _mm256_storeu_si256((__m256i *)y[ib].qs + 4, m0);
                 _mm256_storeu_si256((__m256i *)y[ib].qs + 5, m1);
                 _mm256_storeu_si256((__m256i *)y[ib].qs + 6, m2);
@@ -15424,12 +15439,12 @@ struct HelperQ8KVR8 : public BaseHelper<step> {
                 m1 = _mm256_unpackhi_epi64(t0, t1);
                 m2 = _mm256_unpacklo_epi64(t2, t3);
                 m3 = _mm256_unpackhi_epi64(t2, t3);
-#ifdef HAVE_FANCY_SIMD
-                m0 = _mm256_add_epi8(m0, _mm256_set1_epi8(127));
-                m1 = _mm256_add_epi8(m1, _mm256_set1_epi8(127));
-                m2 = _mm256_add_epi8(m2, _mm256_set1_epi8(127));
-                m3 = _mm256_add_epi8(m3, _mm256_set1_epi8(127));
-#endif
+//#ifdef HAVE_FANCY_SIMD
+//                m0 = _mm256_add_epi8(m0, _mm256_set1_epi8(127));
+//                m1 = _mm256_add_epi8(m1, _mm256_set1_epi8(127));
+//                m2 = _mm256_add_epi8(m2, _mm256_set1_epi8(127));
+//                m3 = _mm256_add_epi8(m3, _mm256_set1_epi8(127));
+//#endif
                 _mm256_storeu_si256((__m256i *)y[ix].qs + 4*ib+0, m0);
                 _mm256_storeu_si256((__m256i *)y[ix].qs + 4*ib+1, m1);
                 _mm256_storeu_si256((__m256i *)y[ix].qs + 4*ib+2, m2);
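
Editor's side note on the two branches visible in the mul_mat_q8_k_r8_q8_k and mul_mat_q8_KV_r8_q8_KV hunks above: the pre-existing branch with _mm256_sign_epi8 serves the plain-AVX2 multiply-add (vpmaddubsw takes an unsigned left operand) by feeding it |x| together with y carrying x's sign, while the newly added branch applies the +127 offset for the other path, matching the offset that the HelperQ80R8/HelperQ8KVR8 hunks stop baking into the repacked data. A scalar sketch of the sign trick, again with illustrative names only and assuming values stay in [-127, 127]:

// Scalar sketch (illustrative, not code from iqk_mul_mat.cpp) of the sign trick used
// on the non-VNNI path: a signed*signed dot product computed with an unsigned*signed
// multiply by moving x's sign onto y, since |x| * (sign(x)*y) == x*y.
#include <cstdint>
#include <cstdio>
#include <cstdlib>

static int dot_sign_trick(const int8_t * x, const int8_t * y, int n) {
    int s = 0;
    for (int i = 0; i < n; ++i) {
        // |x|: the role _mm256_sign_epi8(x, x) plays in the kernel (unsigned bytes).
        uint8_t ax = (uint8_t)abs(x[i]);
        // y with x's sign: the role _mm256_sign_epi8(y, x) plays (the intrinsic also
        // zeroes this when x[i] == 0, but ax is 0 then, so the product is 0 either way).
        int sy = x[i] < 0 ? -y[i] : y[i];
        s += ax * sy;   // unsigned * signed, the operand shape vpmaddubsw expects
    }
    return s;
}

int main() {
    const int8_t x[8] = { -127, -3, 0, 5, 17, -42, 127, 9 };
    const int8_t y[8] = {    1, -2, 3, 4, -5,   6,  -7, 8 };
    int ref = 0;
    for (int i = 0; i < 8; ++i) ref += x[i] * y[i];
    // Both print -1255.
    printf("%d %d\n", ref, dot_sign_trick(x, y, 8));
    return 0;
}

Both tricks exist for the same reason: the byte-wise multiply-add instructions want one unsigned operand, and the two code paths just recover the signed result in different ways, the sign trick exactly and the +127 offset via the 127*sum(y) correction.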