author | Kawrakow <iwankawrakow@gmail.com> | 2025-01-27 16:50:07 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-01-27 16:50:07 +0200 |
commit | d9c4ea48d1e41d8f7215ff1c094d75e7229b65e2 (patch) | |
tree | ef32b816f715ae7217da01217e506c7ed31b537e /ggml/src/iqk/iqk_quantize.cpp | |
parent | 814d3e054cc5ac483ddb9933aa13ba385e979b68 (diff) |
Interleave 8 rows (Q8_0, IQ4_XS) (#178)
* Try interleaving 8 rows for iq4_xs
On Zen4, PP-512 goes up from ~260 t/s to 288 t/s for L3-8B.
TG-128 reaches maximum performance at 2 threads and is slightly
higher than with 4 interleaved rows (14.48 t/s vs 13.11 t/s @ 2 threads
and 14.28 t/s @ 4 threads).
* Try interleaving 8 iq4_xs rows
It is also faster on AVX2.
This is the NEON implementation. It is a tiny bit faster than
4 interleaved rows (~0.5%).
So, this looks like a winner given the Zen4/AVX2 improvement
without an associated NEON regression.
* Cleanup
* 8-rows interleaved q8_0 (AVX2)
* 8-rows interleaved q8_0 (Zen4)
* 8-rows interleaved q8_0 (Zen4) - slightly better
PP-512 is now 284 t/s compared to 257 t/s for 4 interleaved rows.
TG-128 reaches a peak of 8.16 t/s at just 2 threads compared
to 7.95 t/s @ 4 threads before.
* 8-rows interleaved q8_0 (NEON)
PP-512 is slightly better (138 t/s vs 132.5 t/s); TG-128 is about the
same.
* FA: repack Q8_0 to Q8_0_R8
* Remove special purpose mul_mat_q8_0_r4_q8_1_128 (Zen4)
* FA: repack Q8_0 to Q8_0_R8 (NEON)
Very slightly faster than the general-purpose gemm, slightly
slower than the D = 128 special-case gemm mul_mat_q8_0_r4_q8_0_128.
Still removing mul_mat_q8_0_r4_q8_0_128, as we simply don't have
enough vector registers to hold 8 interleaved rows, so there is
no point in keeping the special-purpose implementation.
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
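
For orientation, here is a condensed sketch of what "8 interleaved rows" means for Q8_0: the same 32-element block is taken from 8 consecutive rows and packed into one structure, with the 8 fp16 scales up front and the quants interleaved in 4-byte groups per row, so a kernel walking the packed quants produces 8 output rows per pass. The names below (BlockQ8_0, BlockQ8_0x8, repack8, kBlock) are illustrative stand-ins, not the ggml types; the real code is repack_q8_0 in the diff further down.

```cpp
#include <cstdint>
#include <cstring>

constexpr int kBlock = 32;        // QK8_0: 32 quants per block

struct BlockQ8_0 {                // one block of one row: fp16 scale + 32 int8 quants
    uint16_t d;                   // scale, kept as opaque fp16 bits here
    int8_t   qs[kBlock];
};

struct BlockQ8_0x8 {              // one block of 8 interleaved rows
    uint16_t d[8];                // one scale per row
    int8_t   qs[8 * kBlock];      // 256 quants, interleaved in 4-byte groups per row
};

// Pack nblock blocks from 8 rows (rows[k] = blocks of row k) into the interleaved layout.
// The index pattern mirrors repack_q8_0 in the diff: row k contributes 4 bytes at offset
// 32*l + 4*k for the first half of its block (values 4*l..4*l+3), and the same offset
// plus 128 for the second half (values 16+4*l..19+4*l).
void repack8(const BlockQ8_0* rows[8], BlockQ8_0x8* out, int nblock) {
    for (int ib = 0; ib < nblock; ++ib) {
        for (int k = 0; k < 8; ++k) out[ib].d[k] = rows[k][ib].d;
        for (int l = 0; l < 4; ++l) {
            for (int k = 0; k < 8; ++k) {
                std::memcpy(out[ib].qs + 32*l + 4*k,       rows[k][ib].qs + 4*l,      4);
                std::memcpy(out[ib].qs + 32*l + 4*k + 128, rows[k][ib].qs + 4*l + 16, 4);
            }
        }
    }
}
```

With this layout, a single 32-byte load at offset 32*l yields 4 consecutive quants from each of the 8 rows, which is presumably what the Zen4/AVX2 gemm kernels exploit to update 8 output rows at once.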
Diffstat (limited to 'ggml/src/iqk/iqk_quantize.cpp')
-rw-r--r-- | ggml/src/iqk/iqk_quantize.cpp | 150 |
1 file changed, 67 insertions, 83 deletions
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 221bc48c..59a36c5c 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -3709,63 +3709,63 @@ void vec_dot_q4_0_r4_q8_0(int n, float * s, size_t bs, const void * vx, size_t b
 //
 // ========================================= q8_0_r4
 //
-void quantize_row_q8_0_r4_ref(const float * x, block_q8_0_x4 * y, int64_t k) {
+void quantize_row_q8_0_r4_ref(const float * x, block_q8_0_r8 * y, int64_t k) {
     // we assume we are called with 4 rows
-    quantize_q8_0_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_q8_0_r4(x, (void *)y, 8, k/8, nullptr);
 }
 
 void quantize_row_q8_0_r4(const float * x, void * y, int64_t k) {
     // we assume we are called with 4 rows
-    quantize_q8_0_r4(x, y, 4, k/4, nullptr);
+    quantize_q8_0_r4(x, y, 8, k/8, nullptr);
 }
 
-static void repack_q8_0(int nrows, int n_per_row, const block_q8_0 * x, block_q8_0_x4 * y) {
-    GGML_ASSERT(nrows%4 == 0);
+static void repack_q8_0(int nrows, int n_per_row, const block_q8_0 * x, block_q8_0_r8 * y) {
+    GGML_ASSERT(nrows%8 == 0);
     GGML_ASSERT(n_per_row%QK8_0 == 0);
     int nblock = n_per_row/QK8_0;
-    const block_q8_0 * x4[4];
-    for (int row = 0; row < nrows; row += 4) {
-        for (int k = 0; k < 4; ++k) x4[k] = x + nblock*k;
+    const block_q8_0 * x8[8];
+    for (int row = 0; row < nrows; row += 8) {
+        for (int k = 0; k < 8; ++k) x8[k] = x + nblock*k;
         for (int ib = 0; ib < nblock; ++ib) {
-            for (int k = 0; k < 4; ++k) y[ib].d[k] = x4[k][ib].d;
+            for (int k = 0; k < 8; ++k) y[ib].d[k] = x8[k][ib].d;
             for (int l = 0; l < 4; ++l) {
-                for (int k = 0; k < 4; ++k) for (int i = 0; i < 4; ++i) {
-                    y[ib].qs[32*l+4*k+i+ 0] = x4[k][ib].qs[i+4*l+ 0];
-                    y[ib].qs[32*l+4*k+i+16] = x4[k][ib].qs[i+4*l+16];
+                for (int k = 0; k < 8; ++k) for (int i = 0; i < 4; ++i) {
+                    y[ib].qs[32*l+4*k+i+ 0] = x8[k][ib].qs[i+4*l+ 0];
+                    y[ib].qs[32*l+4*k+i+128] = x8[k][ib].qs[i+4*l+16];
                 }
             }
         }
-        x += 4*nblock;
+        x += 8*nblock;
         y += nblock;
     }
 }
 
 size_t quantize_q8_0_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
-    GGML_ASSERT(nrows%4 == 0);
+    GGML_ASSERT(nrows%8 == 0);
     auto row_size_0 = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
-    std::vector<char> qtmp(4*row_size_0);
+    std::vector<char> qtmp(8*row_size_0);
     char * qrow = (char *)dst;
-    for (int row = 0; row < nrows; row += 4) {
-        quantize_q8_0(src, qtmp.data(), 4, n_per_row, imatrix);
-        repack_q8_0(4, n_per_row, (const block_q8_0 *)qtmp.data(), (block_q8_0_x4 *)qrow);
-        src += 4*n_per_row;
-        qrow += 4*row_size_0;
+    for (int row = 0; row < nrows; row += 8) {
+        quantize_q8_0(src, qtmp.data(), 8, n_per_row, imatrix);
+        repack_q8_0(8, n_per_row, (const block_q8_0 *)qtmp.data(), (block_q8_0_r8 *)qrow);
+        src += 8*n_per_row;
+        qrow += 8*row_size_0;
     }
     return nrows*row_size_0;
 }
 
-void dequantize_row_q8_0_r4(const block_q8_0_x4 * x, float * y, int64_t k) {
+void dequantize_row_q8_0_r4(const block_q8_0_r8 * x, float * y, int64_t k) {
     // we assume we are called with 4 rows
-    int n_per_row = k/4;
+    int n_per_row = k/8;
     int nb = n_per_row/QK8_0;
-    float * yk[4];
-    for (int k = 0; k < 4; ++k) yk[k] = y + k*n_per_row;
+    float * yk[8];
+    for (int k = 0; k < 8; ++k) yk[k] = y + k*n_per_row;
     for (int ib = 0; ib < nb; ++ib) {
-        for (int k = 0; k < 4; ++k) {
+        for (int k = 0; k < 8; ++k) {
             float scale = GGML_FP16_TO_FP32(x[ib].d[k]);
             for (int l = 0; l < 4; ++l) for (int i = 0; i < 4; ++i) {
-                yk[k][QK8_0*ib+4*l+i+ 0] = scale * x[ib].qs[QK8_0*l+4*k+i+ 0];
-                yk[k][QK8_0*ib+4*l+i+16] = scale * x[ib].qs[QK8_0*l+4*k+i+16];
+                yk[k][QK8_0*ib+4*l+i+ 0] = scale * x[ib].qs[32*l+4*k+i+ 0];
+                yk[k][QK8_0*ib+4*l+i+16] = scale * x[ib].qs[32*l+4*k+i+128];
             }
         }
    }
@@ -3987,93 +3987,77 @@ void vec_dot_q6_0_r4_q8_0(int n, float * s, size_t bs, const void * vx, size_t b
 //
 void quantize_row_iq4_xs_r4_ref(const float * x, block_iq4_xs_r4 * y, int64_t k) {
-    quantize_iq4_xs_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_iq4_xs_r4(x, (void *)y, 8, k/8, nullptr);
 }
 
 void quantize_row_iq4_xs_r4(const float * x, void * y, int64_t k) {
-    quantize_iq4_xs_r4(x, y, 4, k/4, nullptr);
+    quantize_iq4_xs_r4(x, y, 8, k/8, nullptr);
 }
 
 static void repack_iq4_xs(int nrows, int n_per_row, const block_iq4_xs * x, block_iq4_xs_r4 * y) {
-    GGML_ASSERT(nrows%4 == 0);
+    GGML_ASSERT(nrows%8 == 0);
     GGML_ASSERT(n_per_row%QK_K == 0);
     int nblock = n_per_row/QK_K;
-    const block_iq4_xs * x4[4];
-    for (int row = 0; row < nrows; row += 4) {
-        for (int k = 0; k < 4; ++k) x4[k] = x + nblock*k;
+    const block_iq4_xs * x8[8];
+    for (int row = 0; row < nrows; row += 8) {
+        for (int k = 0; k < 8; ++k) x8[k] = x + nblock*k;
         for (int ibl = 0; ibl < nblock; ++ibl) {
-            std::memset(y[ibl].scales_l, 0, QK_K/16);
-            std::memset(y[ibl].scales_h, 0, QK_K/32);
-            for (int k = 0; k < 4; ++k) {
-                y[ibl].d[k] = x4[k][ibl].d;
+            std::memset(y[ibl].scales_l, 0, QK_K/8);
+            std::memset(y[ibl].scales_h, 0, QK_K/16);
+            for (int k = 0; k < 8; ++k) {
+                y[ibl].d[k] = x8[k][ibl].d;
                 for (int ib = 0; ib < QK_K/32; ++ib) {
-                    uint8_t sl = (x4[k][ibl].scales_l[ib/2] >> 4*(ib%2)) & 0xf;
-                    uint8_t sh = (x4[k][ibl].scales_h >> 2*ib) & 3;
-                    int i = 4*ib + k;
-                    y[ibl].scales_l[i%16] |= (sl << 4*(i/16));
-                    y[ibl].scales_h[i%8 ] |= (sh << 2*(i/8));
-                }
-            }
-            for (int ib = 0; ib < QK_K/32; ++ib) {
-                for (int k = 0; k < 4; ++k) for (int i = 0; i < 4; ++i) {
-                    y[ibl].qs[64*ib+4*k+i+ 0] = (x4[k][ibl].qs[16*ib+i+0] & 0xf) | ((x4[k][ibl].qs[16*ib+i+ 8] & 0x0f) << 4); // 0....3 + 8...11 from each row
-                    y[ibl].qs[64*ib+4*k+i+16] = (x4[k][ibl].qs[16*ib+i+0] >> 4) | ((x4[k][ibl].qs[16*ib+i+ 8] & 0xf0)); // 16...19 + 24...27 from each row
-                    y[ibl].qs[64*ib+4*k+i+32] = (x4[k][ibl].qs[16*ib+i+4] & 0xf) | ((x4[k][ibl].qs[16*ib+i+12] & 0x0f) << 4); // 4....7 + 12...15 from each row
-                    y[ibl].qs[64*ib+4*k+i+48] = (x4[k][ibl].qs[16*ib+i+4] >> 4) | ((x4[k][ibl].qs[16*ib+i+12] & 0xf0)); // 20...23 + 28...31 from each row
+                    uint8_t sl = (x8[k][ibl].scales_l[ib/2] >> 4*(ib%2)) & 0xf;
+                    uint8_t sh = (x8[k][ibl].scales_h >> 2*ib) & 3;
+                    int i = 8*ib + k;
+                    y[ibl].scales_l[i%32] |= (sl << 4*(i/32));
+                    y[ibl].scales_h[i%16] |= (sh << 2*(i/16));
+                    for (int i = 0; i < 4; ++i) {
+                        y[ibl].qs[128*ib+4*k+i+ 0] = (x8[k][ibl].qs[16*ib+i+0] & 0xf) | ((x8[k][ibl].qs[16*ib+i+ 4] & 0xf) << 4);
+                        y[ibl].qs[128*ib+4*k+i+32] = (x8[k][ibl].qs[16*ib+i+8] & 0xf) | ((x8[k][ibl].qs[16*ib+i+12] & 0xf) << 4);
+                        y[ibl].qs[128*ib+4*k+i+64] = (x8[k][ibl].qs[16*ib+i+0] >> 4) | ((x8[k][ibl].qs[16*ib+i+ 4] >> 4) << 4);
+                        y[ibl].qs[128*ib+4*k+i+96] = (x8[k][ibl].qs[16*ib+i+8] >> 4) | ((x8[k][ibl].qs[16*ib+i+12] >> 4) << 4);
+                    }
                 }
             }
         }
-        x += 4*nblock;
+        x += 8*nblock;
         y += nblock;
     }
 }
 
 size_t quantize_iq4_xs_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
-    GGML_ASSERT(nrows%4 == 0);
+    GGML_ASSERT(nrows%8 == 0);
     GGML_ASSERT(n_per_row%QK_K == 0);
     char * qcur = (char *)dst;
     auto row_size = ggml_row_size(GGML_TYPE_IQ4_XS, n_per_row);
-    std::vector<char> qtmp(4*row_size);
-    for (int row = 0; row < nrows; row += 4) {
-        quantize_iq4_xs(src, (void *)qtmp.data(), 4, n_per_row, imatrix);
-        repack_iq4_xs(4, n_per_row, (const block_iq4_xs *)qtmp.data(), (block_iq4_xs_r4 *)qcur);
-        qcur += 4*row_size;
-        src += 4*n_per_row;
+    std::vector<char> qtmp(8*row_size);
+    for (int row = 0; row < nrows; row += 8) {
+        quantize_iq4_xs(src, (void *)qtmp.data(), 8, n_per_row, imatrix);
+        repack_iq4_xs(8, n_per_row, (const block_iq4_xs *)qtmp.data(), (block_iq4_xs_r4 *)qcur);
+        qcur += 8*row_size;
+        src += 8*n_per_row;
    }
    return nrows*row_size;
 }
 
 void dequantize_row_iq4_xs_r4(const block_iq4_xs_r4 * x, float * y, int64_t k) {
-    auto n_per_row = k/4;
-    float * y4[4] = {y, y + n_per_row, y + 2*n_per_row, y + 3*n_per_row};
+    auto n_per_row = k/8;
+    float * y8[8];
+    for (int k = 0; k < 8; ++k) y8[k] = y + n_per_row*k;
     int nblock = n_per_row/QK_K;
     for (int ibl = 0; ibl < nblock; ++ibl) {
-        for (int k = 0; k < 4; ++k) {
+        for (int k = 0; k < 8; ++k) {
             const float d = GGML_FP16_TO_FP32(x[ibl].d[k]);
             for (int ib = 0; ib < QK_K/32; ++ib) {
-                int is = 4*ib + k;
-                float dl = d * ((((x[ibl].scales_l[is%16] >> 4*(is/16)) & 0xf) | (((x[ibl].scales_h[is%8] >> 2*(is/8)) & 3) << 4)) - 32);
-                for (int i = 0; i < 4; ++i) {
-                    y4[k][QK_K*ibl+32*ib+i+ 0] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+ 0] & 0xf];
-                    y4[k][QK_K*ibl+32*ib+i+ 8] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+ 0] >> 4];
-                    y4[k][QK_K*ibl+32*ib+i+16] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+16] & 0xf];
-                    y4[k][QK_K*ibl+32*ib+i+24] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+16] >> 4];
-                    y4[k][QK_K*ibl+32*ib+i+ 4] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+32] & 0xf];
-                    y4[k][QK_K*ibl+32*ib+i+12] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+32] >> 4];
-                    y4[k][QK_K*ibl+32*ib+i+20] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+48] & 0xf];
-                    y4[k][QK_K*ibl+32*ib+i+28] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+48] >> 4];
+                int is = 8*ib + k;
+                float dl = d * ((((x[ibl].scales_l[is%32] >> 4*(is/32)) & 0xf) | (((x[ibl].scales_h[is%16] >> 2*(is/16)) & 3) << 4)) - 32);
+                for (int l = 0; l < 4; ++l) for (int i = 0; i < 4; ++i) {
+                    y8[k][QK_K*ibl+32*ib+8*l+i+0] = dl * iq4k_values[x[ibl].qs[128*ib+4*k+i+32*l] & 0xf];
+                    y8[k][QK_K*ibl+32*ib+8*l+i+4] = dl * iq4k_values[x[ibl].qs[128*ib+4*k+i+32*l] >> 4];
                 }
             }
         }
-        //dequantize_row_iq4_xs(x + ib, ytmp, QK_K);
-        //for (int k = 0; k < 4; ++k) {
-        //    for (int l = 0; l < 16; ++l) {
-        //        for (int i = 0; i < 4; ++i) {
-        //            //y4[k][ib*kBlockSize + i + 16*(l%4) + 4*(l/4)] = ytmp[16*l + 4*k + i];
-        //            y4[k][ib*kBlockSize + i + 8*(l%8) + 4*(l/8)] = ytmp[16*l + 4*k + i];
-        //        }
-        //    }
-        //}
     }
 }
@@ -6063,7 +6047,7 @@ void iqk_repack_tensor(struct ggml_tensor * tensor) {
         { GGML_TYPE_IQ3_K, { GGML_TYPE_IQ3_K_R4, 4, (Repack::repack_func)repack_iq3_k} },
         { GGML_TYPE_IQ4_K, { GGML_TYPE_IQ4_K_R4, 4, (Repack::repack_func)repack_iq4_k} },
         { GGML_TYPE_IQ5_K, { GGML_TYPE_IQ5_K_R4, 4, (Repack::repack_func)repack_iq5_k} },
-        { GGML_TYPE_IQ4_XS, { GGML_TYPE_IQ4_XS_R4, 4, (Repack::repack_func)repack_iq4_xs} },
+        { GGML_TYPE_IQ4_XS, { GGML_TYPE_IQ4_XS_R4, 8, (Repack::repack_func)repack_iq4_xs} },
         { GGML_TYPE_IQ4_KS, { GGML_TYPE_IQ4_KS_R4, 4, (Repack::repack_func)repack_iq4_ks} },
         { GGML_TYPE_IQ4_NL, { GGML_TYPE_IQ4_NL_R4, 4, (Repack::repack_func)repack_iq4_nl} },
         { GGML_TYPE_IQ2_BN, { GGML_TYPE_IQ2_BN_R4, 4, (Repack::repack_func)repack_iq2_bn} },
@@ -6080,7 +6064,7 @@ void iqk_repack_tensor(struct ggml_tensor * tensor) {
         { GGML_TYPE_Q4_0, { GGML_TYPE_Q4_0_R4, 4, (Repack::repack_func)repack_q4_0} },
         { GGML_TYPE_Q5_0, { GGML_TYPE_Q5_0_R4, 4, (Repack::repack_func)repack_q5_0} },
         { GGML_TYPE_Q6_0, { GGML_TYPE_Q6_0_R4, 4, (Repack::repack_func)repack_q6_0} },
-        { GGML_TYPE_Q8_0, { GGML_TYPE_Q8_0_R4, 4, (Repack::repack_func)repack_q8_0} },
+        { GGML_TYPE_Q8_0, { GGML_TYPE_Q8_0_R4, 8, (Repack::repack_func)repack_q8_0} },
         { GGML_TYPE_Q8_K, { GGML_TYPE_Q8_K_R8, 8, (Repack::repack_func)repack_q8_k} },
 #ifdef __AVX512BF16__
         { GGML_TYPE_BF16, { GGML_TYPE_BF16_R16, 16, (Repack::repack_func)repack_bf16<ggml_bf16_t>}},
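
The trickiest part of the IQ4_XS change is the new scale indexing: with 8 interleaved rows, a 256-wide super-block carries 8 sub-blocks x 8 rows = 64 six-bit scales, whose low nibbles go into 32 bytes of scales_l and whose high 2 bits go into 16 bytes of scales_h, addressed as i = 8*ib + k with i%32 / i/32 and i%16 / i/16. Below is a small self-contained round-trip check of exactly that indexing; the packing and unpacking expressions are taken from repack_iq4_xs and dequantize_row_iq4_xs_r4 above, while the test harness itself (array names, the dummy scale values) is just for illustration.

```cpp
#include <cassert>
#include <cstdint>

// Round-trip check of the IQ4_XS_R4 scale packing after this change:
// 64 six-bit scales per super-block (8 sub-blocks x 8 rows), low nibbles in
// 32 bytes (scales_l), high 2 bits in 16 bytes (scales_h).
int main() {
    uint8_t scales[64];                       // one 6-bit scale per (sub-block, row)
    for (int i = 0; i < 64; ++i) scales[i] = (i * 37) & 0x3f;

    uint8_t scales_l[32] = {0};               // QK_K/8  bytes
    uint8_t scales_h[16] = {0};               // QK_K/16 bytes
    for (int ib = 0; ib < 8; ++ib) {          // sub-block within the 256-wide super-block
        for (int k = 0; k < 8; ++k) {         // interleaved row
            int i = 8*ib + k;
            uint8_t sl = scales[i] & 0xf;     // low 4 bits
            uint8_t sh = scales[i] >> 4;      // high 2 bits
            scales_l[i%32] |= sl << 4*(i/32);
            scales_h[i%16] |= sh << 2*(i/16);
        }
    }

    for (int is = 0; is < 64; ++is) {         // unpack as the dequantizer does and compare
        uint8_t lo = (scales_l[is%32] >> 4*(is/32)) & 0xf;
        uint8_t hi = (scales_h[is%16] >> 2*(is/16)) & 3;
        assert(((hi << 4) | lo) == scales[is]);
    }
    return 0;
}
```

Each scales_l byte ends up holding the low nibbles of scales i and i+32, and each scales_h byte holds the high bits of scales i, i+16, i+32, and i+48, so no two (sub-block, row) pairs collide.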