path: root/ggml/src/iqk/iqk_quantize.cpp
author   Kawrakow <iwankawrakow@gmail.com>  2025-01-27 16:50:07 +0200
committer  GitHub <noreply@github.com>  2025-01-27 16:50:07 +0200
commit   d9c4ea48d1e41d8f7215ff1c094d75e7229b65e2 (patch)
tree     ef32b816f715ae7217da01217e506c7ed31b537e /ggml/src/iqk/iqk_quantize.cpp
parent   814d3e054cc5ac483ddb9933aa13ba385e979b68 (diff)
Interleave 8 rows (Q8_0, IQ4_XS) (#178)
* Try interleaving 8 rows for iq4_xs

  On Zen4, PP-512 goes up from ~260 t/s to 288 t/s for L3-8B.
  TG-128 reaches maximum performance at 2 threads and is slightly higher
  than with 4 interleaved rows (14.48 t/s vs 13.11 t/s @ 2 threads, and
  14.28 t/s @ 4 threads).

* Try interleaving 8 iq4_xs rows

  It is also faster on AVX2. This is the NEON implementation. It is a
  tiny bit faster than 4 interleaved rows (~0.5%). So, this looks like a
  winner given the Zen4/AVX2 improvement without an associated NEON
  regression.

* Cleanup

* 8-rows interleaved q8_0 (AVX2)

* 8-rows interleaved q8_0 (Zen4)

* 8-rows interleaved q8_0 (Zen4) - slightly better

  PP-512 is now 284 t/s compared to 257 t/s for 4 interleaved rows.
  TG-128 reaches a peak of 8.16 t/s at just 2 threads, compared to
  7.95 t/s @ 4 threads before.

* 8-rows interleaved q8_0 (NEON)

  PP-512 is slightly better (138 t/s vs 132.5 t/s), TG-128 is about the
  same.

* FA: repack Q8_0 to Q8_0_R8

* Remove special purpose mul_mat_q8_0_r4_q8_1_128 (Zen4)

* FA: repack Q8_0 to Q8_0_R8 (NEON)

  Very slightly faster than the general purpose gemm, slightly slower
  than the D = 128 special case gemm mul_mat_q8_0_r4_q8_0_128. Still
  removing mul_mat_q8_0_r4_q8_0_128, as we simply don't have enough
  vector registers to hold 8 interleaved rows, so there is no point in
  having the special purpose implementation.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
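For context, a sketch of what "interleaving 8 rows" means for the storage layout. The types below are simplified stand-ins for illustration only; the real block_q8_0_r8 is defined in the ggml/iqk headers.

// Sketch of the 8-row interleaving idea (simplified, hypothetical types;
// the real block_q8_0_r8 lives in the ggml/iqk headers).
// A plain Q8_0 block stores one fp16 scale and 32 int8 quants for a
// single row. The r8 variant fuses the same block position of 8
// consecutive rows, so a GEMM kernel can load one contiguous chunk and
// accumulate into 8 output rows per pass.
#include <cstdint>

typedef uint16_t ggml_half_t;            // fp16 bit pattern, placeholder

struct block_q8_0_sketch {               // one block of one row
    ggml_half_t d;                       // scale
    int8_t      qs[32];                  // quants
};

struct block_q8_0_r8_sketch {            // same block of 8 rows, fused
    ggml_half_t d[8];                    // 8 scales, one per row
    int8_t      qs[8 * 32];              // 256 quants, interleaved in
                                         // groups of 4 per row (see
                                         // repack_q8_0 in the diff)
};

Loading one contiguous r8 block per iteration is the intent behind the PP-512 gains quoted above.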
Diffstat (limited to 'ggml/src/iqk/iqk_quantize.cpp')
-rw-r--r--  ggml/src/iqk/iqk_quantize.cpp | 150
1 file changed, 67 insertions(+), 83 deletions(-)
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 221bc48c..59a36c5c 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -3709,63 +3709,63 @@ void vec_dot_q4_0_r4_q8_0(int n, float * s, size_t bs, const void * vx, size_t b
//
// ========================================= q8_0_r4
//
-void quantize_row_q8_0_r4_ref(const float * x, block_q8_0_x4 * y, int64_t k) {
+void quantize_row_q8_0_r4_ref(const float * x, block_q8_0_r8 * y, int64_t k) {
-    // we assume we are called with 4 rows
+    // we assume we are called with 8 rows
- quantize_q8_0_r4(x, (void *)y, 4, k/4, nullptr);
+ quantize_q8_0_r4(x, (void *)y, 8, k/8, nullptr);
}
void quantize_row_q8_0_r4(const float * x, void * y, int64_t k) {
-    // we assume we are called with 4 rows
+    // we assume we are called with 8 rows
- quantize_q8_0_r4(x, y, 4, k/4, nullptr);
+ quantize_q8_0_r4(x, y, 8, k/8, nullptr);
}
-static void repack_q8_0(int nrows, int n_per_row, const block_q8_0 * x, block_q8_0_x4 * y) {
- GGML_ASSERT(nrows%4 == 0);
+static void repack_q8_0(int nrows, int n_per_row, const block_q8_0 * x, block_q8_0_r8 * y) {
+ GGML_ASSERT(nrows%8 == 0);
GGML_ASSERT(n_per_row%QK8_0 == 0);
int nblock = n_per_row/QK8_0;
- const block_q8_0 * x4[4];
- for (int row = 0; row < nrows; row += 4) {
- for (int k = 0; k < 4; ++k) x4[k] = x + nblock*k;
+ const block_q8_0 * x8[8];
+ for (int row = 0; row < nrows; row += 8) {
+ for (int k = 0; k < 8; ++k) x8[k] = x + nblock*k;
for (int ib = 0; ib < nblock; ++ib) {
- for (int k = 0; k < 4; ++k) y[ib].d[k] = x4[k][ib].d;
+ for (int k = 0; k < 8; ++k) y[ib].d[k] = x8[k][ib].d;
for (int l = 0; l < 4; ++l) {
- for (int k = 0; k < 4; ++k) for (int i = 0; i < 4; ++i) {
- y[ib].qs[32*l+4*k+i+ 0] = x4[k][ib].qs[i+4*l+ 0];
- y[ib].qs[32*l+4*k+i+16] = x4[k][ib].qs[i+4*l+16];
+ for (int k = 0; k < 8; ++k) for (int i = 0; i < 4; ++i) {
+ y[ib].qs[32*l+4*k+i+ 0] = x8[k][ib].qs[i+4*l+ 0];
+ y[ib].qs[32*l+4*k+i+128] = x8[k][ib].qs[i+4*l+16];
}
}
}
- x += 4*nblock;
+ x += 8*nblock;
y += nblock;
}
}
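The index arithmetic in repack_q8_0 above is easiest to verify in isolation. A minimal, self-contained check (hypothetical demo code mirroring the loops in the diff, not part of the repo):

// Standalone sanity check of the 8-row q8_0 interleaving: packs one
// QK8_0 block from 8 rows with the forward mapping used by repack_q8_0,
// then recovers each row with the inverse mapping used by the
// dequantizer further down in this diff.
#include <cassert>
#include <cstdint>

int main() {
    int8_t rows[8][32];                  // one QK8_0 block from 8 rows
    for (int k = 0; k < 8; ++k)
        for (int j = 0; j < 32; ++j) rows[k][j] = int8_t(32*k + j - 128);

    int8_t packed[256];                  // 8 x 32 interleaved quants
    for (int l = 0; l < 4; ++l)
        for (int k = 0; k < 8; ++k)
            for (int i = 0; i < 4; ++i) {
                packed[32*l + 4*k + i +   0] = rows[k][i + 4*l +  0];
                packed[32*l + 4*k + i + 128] = rows[k][i + 4*l + 16];
            }

    // The first 128 bytes hold samples 0..15 of all rows in groups of
    // 4, the second 128 bytes hold samples 16..31; row k is recovered
    // exactly as in dequantize_row_q8_0_r4.
    for (int k = 0; k < 8; ++k)
        for (int l = 0; l < 4; ++l)
            for (int i = 0; i < 4; ++i) {
                assert(packed[32*l + 4*k + i +   0] == rows[k][4*l + i +  0]);
                assert(packed[32*l + 4*k + i + 128] == rows[k][4*l + i + 16]);
            }
    return 0;
}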
size_t quantize_q8_0_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
- GGML_ASSERT(nrows%4 == 0);
+ GGML_ASSERT(nrows%8 == 0);
auto row_size_0 = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
- std::vector<char> qtmp(4*row_size_0);
+ std::vector<char> qtmp(8*row_size_0);
char * qrow = (char *)dst;
- for (int row = 0; row < nrows; row += 4) {
- quantize_q8_0(src, qtmp.data(), 4, n_per_row, imatrix);
- repack_q8_0(4, n_per_row, (const block_q8_0 *)qtmp.data(), (block_q8_0_x4 *)qrow);
- src += 4*n_per_row;
- qrow += 4*row_size_0;
+ for (int row = 0; row < nrows; row += 8) {
+ quantize_q8_0(src, qtmp.data(), 8, n_per_row, imatrix);
+ repack_q8_0(8, n_per_row, (const block_q8_0 *)qtmp.data(), (block_q8_0_r8 *)qrow);
+ src += 8*n_per_row;
+ qrow += 8*row_size_0;
}
return nrows*row_size_0;
}
-void dequantize_row_q8_0_r4(const block_q8_0_x4 * x, float * y, int64_t k) {
+void dequantize_row_q8_0_r4(const block_q8_0_r8 * x, float * y, int64_t k) {
-    // we assume we are called with 4 rows
+    // we assume we are called with 8 rows
- int n_per_row = k/4;
+ int n_per_row = k/8;
int nb = n_per_row/QK8_0;
- float * yk[4];
- for (int k = 0; k < 4; ++k) yk[k] = y + k*n_per_row;
+ float * yk[8];
+ for (int k = 0; k < 8; ++k) yk[k] = y + k*n_per_row;
for (int ib = 0; ib < nb; ++ib) {
- for (int k = 0; k < 4; ++k) {
+ for (int k = 0; k < 8; ++k) {
float scale = GGML_FP16_TO_FP32(x[ib].d[k]);
for (int l = 0; l < 4; ++l) for (int i = 0; i < 4; ++i) {
- yk[k][QK8_0*ib+4*l+i+ 0] = scale * x[ib].qs[QK8_0*l+4*k+i+ 0];
- yk[k][QK8_0*ib+4*l+i+16] = scale * x[ib].qs[QK8_0*l+4*k+i+16];
+ yk[k][QK8_0*ib+4*l+i+ 0] = scale * x[ib].qs[32*l+4*k+i+ 0];
+ yk[k][QK8_0*ib+4*l+i+16] = scale * x[ib].qs[32*l+4*k+i+128];
}
}
}
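A hedged usage sketch of the repacked q8_0 round trip, assuming the iqk header exposing these functions is available; the signatures and ggml_row_size sizing are taken from the diff above. Note that the _r4 function names are kept even though the row-group size is now 8.

// Sketch only: quantize 8 rows into the interleaved layout and
// dequantize them back. Assumes the ggml / iqk_quantize headers.
#include <vector>
// #include "ggml.h"           // assumed: ggml_row_size, GGML_TYPE_Q8_0
// #include "iqk_quantize.h"   // assumed: the functions in this diff

void roundtrip_sketch(const float * src, float * dst, int64_t n_per_row) {
    const int64_t nrows = 8;   // must now be a multiple of 8
    std::vector<char> q(nrows * ggml_row_size(GGML_TYPE_Q8_0, n_per_row));
    quantize_q8_0_r4(src, q.data(), nrows, n_per_row, /*imatrix=*/nullptr);
    // k is the total element count of the 8-row group
    dequantize_row_q8_0_r4((const block_q8_0_r8 *)q.data(), dst,
                           nrows * n_per_row);
}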
@@ -3987,93 +3987,77 @@ void vec_dot_q6_0_r4_q8_0(int n, float * s, size_t bs, const void * vx, size_t b
//
void quantize_row_iq4_xs_r4_ref(const float * x, block_iq4_xs_r4 * y, int64_t k) {
- quantize_iq4_xs_r4(x, (void *)y, 4, k/4, nullptr);
+ quantize_iq4_xs_r4(x, (void *)y, 8, k/8, nullptr);
}
void quantize_row_iq4_xs_r4(const float * x, void * y, int64_t k) {
- quantize_iq4_xs_r4(x, y, 4, k/4, nullptr);
+ quantize_iq4_xs_r4(x, y, 8, k/8, nullptr);
}
static void repack_iq4_xs(int nrows, int n_per_row, const block_iq4_xs * x, block_iq4_xs_r4 * y) {
- GGML_ASSERT(nrows%4 == 0);
+ GGML_ASSERT(nrows%8 == 0);
GGML_ASSERT(n_per_row%QK_K == 0);
int nblock = n_per_row/QK_K;
- const block_iq4_xs * x4[4];
- for (int row = 0; row < nrows; row += 4) {
- for (int k = 0; k < 4; ++k) x4[k] = x + nblock*k;
+ const block_iq4_xs * x8[8];
+ for (int row = 0; row < nrows; row += 8) {
+ for (int k = 0; k < 8; ++k) x8[k] = x + nblock*k;
for (int ibl = 0; ibl < nblock; ++ibl) {
- std::memset(y[ibl].scales_l, 0, QK_K/16);
- std::memset(y[ibl].scales_h, 0, QK_K/32);
- for (int k = 0; k < 4; ++k) {
- y[ibl].d[k] = x4[k][ibl].d;
+ std::memset(y[ibl].scales_l, 0, QK_K/8);
+ std::memset(y[ibl].scales_h, 0, QK_K/16);
+ for (int k = 0; k < 8; ++k) {
+ y[ibl].d[k] = x8[k][ibl].d;
for (int ib = 0; ib < QK_K/32; ++ib) {
- uint8_t sl = (x4[k][ibl].scales_l[ib/2] >> 4*(ib%2)) & 0xf;
- uint8_t sh = (x4[k][ibl].scales_h >> 2*ib) & 3;
- int i = 4*ib + k;
- y[ibl].scales_l[i%16] |= (sl << 4*(i/16));
- y[ibl].scales_h[i%8 ] |= (sh << 2*(i/8));
- }
- }
- for (int ib = 0; ib < QK_K/32; ++ib) {
- for (int k = 0; k < 4; ++k) for (int i = 0; i < 4; ++i) {
- y[ibl].qs[64*ib+4*k+i+ 0] = (x4[k][ibl].qs[16*ib+i+0] & 0xf) | ((x4[k][ibl].qs[16*ib+i+ 8] & 0x0f) << 4); // 0....3 + 8...11 from each row
- y[ibl].qs[64*ib+4*k+i+16] = (x4[k][ibl].qs[16*ib+i+0] >> 4) | ((x4[k][ibl].qs[16*ib+i+ 8] & 0xf0)); // 16...19 + 24...27 from each row
- y[ibl].qs[64*ib+4*k+i+32] = (x4[k][ibl].qs[16*ib+i+4] & 0xf) | ((x4[k][ibl].qs[16*ib+i+12] & 0x0f) << 4); // 4....7 + 12...15 from each row
- y[ibl].qs[64*ib+4*k+i+48] = (x4[k][ibl].qs[16*ib+i+4] >> 4) | ((x4[k][ibl].qs[16*ib+i+12] & 0xf0)); // 20...23 + 28...31 from each row
+ uint8_t sl = (x8[k][ibl].scales_l[ib/2] >> 4*(ib%2)) & 0xf;
+ uint8_t sh = (x8[k][ibl].scales_h >> 2*ib) & 3;
+ int i = 8*ib + k;
+ y[ibl].scales_l[i%32] |= (sl << 4*(i/32));
+ y[ibl].scales_h[i%16] |= (sh << 2*(i/16));
+ for (int i = 0; i < 4; ++i) {
+ y[ibl].qs[128*ib+4*k+i+ 0] = (x8[k][ibl].qs[16*ib+i+0] & 0xf) | ((x8[k][ibl].qs[16*ib+i+ 4] & 0xf) << 4);
+ y[ibl].qs[128*ib+4*k+i+32] = (x8[k][ibl].qs[16*ib+i+8] & 0xf) | ((x8[k][ibl].qs[16*ib+i+12] & 0xf) << 4);
+ y[ibl].qs[128*ib+4*k+i+64] = (x8[k][ibl].qs[16*ib+i+0] >> 4) | ((x8[k][ibl].qs[16*ib+i+ 4] >> 4) << 4);
+ y[ibl].qs[128*ib+4*k+i+96] = (x8[k][ibl].qs[16*ib+i+8] >> 4) | ((x8[k][ibl].qs[16*ib+i+12] >> 4) << 4);
+ }
}
}
}
- x += 4*nblock;
+ x += 8*nblock;
y += nblock;
}
}
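The scale packing in repack_iq4_xs is the densest part of this hunk. A standalone illustration of where the 6-bit scale of row k, sub-block ib lands (demo code mirroring the index math above, not the real structs):

// Sketch: pack and unpack one iq4_xs scale in the 8-row layout.
// QK_K = 256, so ib runs 0..7 per super-block and i = 8*ib + k runs
// 0..63: 64 low nibbles fill 32 bytes, 64 2-bit highs fill 16 bytes.
#include <cstdint>
#include <cstdio>

int main() {
    const int k = 5, ib = 3;            // example row and sub-block
    const uint8_t sl = 0xA, sh = 0x2;   // example 4-bit low / 2-bit high

    uint8_t scales_l[32] = {0};         // QK_K/8 bytes for 8 rows
    uint8_t scales_h[16] = {0};         // QK_K/16 bytes for 8 rows

    int i = 8*ib + k;                   // flattened (sub-block, row) index
    scales_l[i%32] |= (sl << 4*(i/32)); // low nibble into byte i%32
    scales_h[i%16] |= (sh << 2*(i/16)); // 2 high bits into byte i%16

    // The dequantizer reverses this and subtracts the bias of 32:
    int s = ((scales_l[i%32] >> 4*(i/32)) & 0xf)
          | (((scales_h[i%16] >> 2*(i/16)) & 3) << 4);
    printf("scale index %d -> 6-bit value %d (dl factor %d)\n", i, s, s - 32);
    return 0;
}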
size_t quantize_iq4_xs_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
- GGML_ASSERT(nrows%4 == 0);
+ GGML_ASSERT(nrows%8 == 0);
GGML_ASSERT(n_per_row%QK_K == 0);
char * qcur = (char *)dst;
auto row_size = ggml_row_size(GGML_TYPE_IQ4_XS, n_per_row);
- std::vector<char> qtmp(4*row_size);
- for (int row = 0; row < nrows; row += 4) {
- quantize_iq4_xs(src, (void *)qtmp.data(), 4, n_per_row, imatrix);
- repack_iq4_xs(4, n_per_row, (const block_iq4_xs *)qtmp.data(), (block_iq4_xs_r4 *)qcur);
- qcur += 4*row_size;
- src += 4*n_per_row;
+ std::vector<char> qtmp(8*row_size);
+ for (int row = 0; row < nrows; row += 8) {
+ quantize_iq4_xs(src, (void *)qtmp.data(), 8, n_per_row, imatrix);
+ repack_iq4_xs(8, n_per_row, (const block_iq4_xs *)qtmp.data(), (block_iq4_xs_r4 *)qcur);
+ qcur += 8*row_size;
+ src += 8*n_per_row;
}
return nrows*row_size;
}
void dequantize_row_iq4_xs_r4(const block_iq4_xs_r4 * x, float * y, int64_t k) {
- auto n_per_row = k/4;
- float * y4[4] = {y, y + n_per_row, y + 2*n_per_row, y + 3*n_per_row};
+ auto n_per_row = k/8;
+ float * y8[8];
+ for (int k = 0; k < 8; ++k) y8[k] = y + n_per_row*k;
int nblock = n_per_row/QK_K;
for (int ibl = 0; ibl < nblock; ++ibl) {
- for (int k = 0; k < 4; ++k) {
+ for (int k = 0; k < 8; ++k) {
const float d = GGML_FP16_TO_FP32(x[ibl].d[k]);
for (int ib = 0; ib < QK_K/32; ++ib) {
- int is = 4*ib + k;
- float dl = d * ((((x[ibl].scales_l[is%16] >> 4*(is/16)) & 0xf) | (((x[ibl].scales_h[is%8] >> 2*(is/8)) & 3) << 4)) - 32);
- for (int i = 0; i < 4; ++i) {
- y4[k][QK_K*ibl+32*ib+i+ 0] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+ 0] & 0xf];
- y4[k][QK_K*ibl+32*ib+i+ 8] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+ 0] >> 4];
- y4[k][QK_K*ibl+32*ib+i+16] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+16] & 0xf];
- y4[k][QK_K*ibl+32*ib+i+24] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+16] >> 4];
- y4[k][QK_K*ibl+32*ib+i+ 4] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+32] & 0xf];
- y4[k][QK_K*ibl+32*ib+i+12] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+32] >> 4];
- y4[k][QK_K*ibl+32*ib+i+20] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+48] & 0xf];
- y4[k][QK_K*ibl+32*ib+i+28] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+48] >> 4];
+ int is = 8*ib + k;
+ float dl = d * ((((x[ibl].scales_l[is%32] >> 4*(is/32)) & 0xf) | (((x[ibl].scales_h[is%16] >> 2*(is/16)) & 3) << 4)) - 32);
+ for (int l = 0; l < 4; ++l) for (int i = 0; i < 4; ++i) {
+ y8[k][QK_K*ibl+32*ib+8*l+i+0] = dl * iq4k_values[x[ibl].qs[128*ib+4*k+i+32*l] & 0xf];
+ y8[k][QK_K*ibl+32*ib+8*l+i+4] = dl * iq4k_values[x[ibl].qs[128*ib+4*k+i+32*l] >> 4];
}
}
}
- //dequantize_row_iq4_xs(x + ib, ytmp, QK_K);
- //for (int k = 0; k < 4; ++k) {
- // for (int l = 0; l < 16; ++l) {
- // for (int i = 0; i < 4; ++i) {
- // //y4[k][ib*kBlockSize + i + 16*(l%4) + 4*(l/4)] = ytmp[16*l + 4*k + i];
- // y4[k][ib*kBlockSize + i + 8*(l%8) + 4*(l/8)] = ytmp[16*l + 4*k + i];
- // }
- // }
- //}
}
}
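And the reverse direction for the quants: a sketch of decoding one 32-sample sub-block of one row from the 8-row layout. The codebook below is a placeholder; the real iq4k_values table is defined elsewhere in the iqk sources and is not reproduced here.

#include <cstdint>

// Placeholder non-linear 4-bit codebook, for illustration only.
static const int8_t iq4k_values_sketch[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113 };

// Decode the 32 samples of row k, sub-block ib, from one interleaved
// QK_K super-block (mirrors the loops in dequantize_row_iq4_xs_r4:
// each packed byte carries samples 8*l+i and 8*l+i+4 of the row).
void decode_subblock(const uint8_t * qs, int k, int ib, float dl, float * out) {
    for (int l = 0; l < 4; ++l)
        for (int i = 0; i < 4; ++i) {
            uint8_t q = qs[128*ib + 4*k + i + 32*l];
            out[8*l + i + 0] = dl * iq4k_values_sketch[q & 0xf];
            out[8*l + i + 4] = dl * iq4k_values_sketch[q >> 4];
        }
}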
@@ -6063,7 +6047,7 @@ void iqk_repack_tensor(struct ggml_tensor * tensor) {
{ GGML_TYPE_IQ3_K, { GGML_TYPE_IQ3_K_R4, 4, (Repack::repack_func)repack_iq3_k} },
{ GGML_TYPE_IQ4_K, { GGML_TYPE_IQ4_K_R4, 4, (Repack::repack_func)repack_iq4_k} },
{ GGML_TYPE_IQ5_K, { GGML_TYPE_IQ5_K_R4, 4, (Repack::repack_func)repack_iq5_k} },
- { GGML_TYPE_IQ4_XS, { GGML_TYPE_IQ4_XS_R4, 4, (Repack::repack_func)repack_iq4_xs} },
+ { GGML_TYPE_IQ4_XS, { GGML_TYPE_IQ4_XS_R4, 8, (Repack::repack_func)repack_iq4_xs} },
{ GGML_TYPE_IQ4_KS, { GGML_TYPE_IQ4_KS_R4, 4, (Repack::repack_func)repack_iq4_ks} },
{ GGML_TYPE_IQ4_NL, { GGML_TYPE_IQ4_NL_R4, 4, (Repack::repack_func)repack_iq4_nl} },
{ GGML_TYPE_IQ2_BN, { GGML_TYPE_IQ2_BN_R4, 4, (Repack::repack_func)repack_iq2_bn} },
@@ -6080,7 +6064,7 @@ void iqk_repack_tensor(struct ggml_tensor * tensor) {
{ GGML_TYPE_Q4_0, { GGML_TYPE_Q4_0_R4, 4, (Repack::repack_func)repack_q4_0} },
{ GGML_TYPE_Q5_0, { GGML_TYPE_Q5_0_R4, 4, (Repack::repack_func)repack_q5_0} },
{ GGML_TYPE_Q6_0, { GGML_TYPE_Q6_0_R4, 4, (Repack::repack_func)repack_q6_0} },
- { GGML_TYPE_Q8_0, { GGML_TYPE_Q8_0_R4, 4, (Repack::repack_func)repack_q8_0} },
+ { GGML_TYPE_Q8_0, { GGML_TYPE_Q8_0_R4, 8, (Repack::repack_func)repack_q8_0} },
{ GGML_TYPE_Q8_K, { GGML_TYPE_Q8_K_R8, 8, (Repack::repack_func)repack_q8_k} },
#ifdef __AVX512BF16__
{ GGML_TYPE_BF16, { GGML_TYPE_BF16_R16, 16, (Repack::repack_func)repack_bf16<ggml_bf16_t>}},
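The two table hunks above only change the row-group count for Q8_0 and IQ4_XS from 4 to 8. A simplified sketch of the dispatch pattern such a table supports (names and signatures are illustrative, not the actual iqk_repack_tensor code):

#include <cstdint>
#include <map>

// Each entry names the repacked type, the number of rows fused per
// group (now 8 for Q8_0 and IQ4_XS), and the repack function.
struct RepackSketch {
    int new_type;                        // e.g. a *_R4 / *_R8 type id
    int num_rows;                        // rows interleaved per group
    void (*repack)(int nrows, int n_per_row, const void * src, void * dst);
};

bool can_repack(const std::map<int, RepackSketch> & table,
                int type, int64_t tensor_rows) {
    auto it = table.find(type);
    // A tensor is only eligible when its row count is a multiple of the
    // group size, which is why the GGML_ASSERT(nrows%8 == 0) checks in
    // this diff got stricter.
    return it != table.end() && tensor_rows % it->second.num_rows == 0;
}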