diff options
author | Kawrakow <iwankawrakow@gmail.com> | 2025-01-27 16:50:07 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-01-27 16:50:07 +0200 |
commit | d9c4ea48d1e41d8f7215ff1c094d75e7229b65e2 (patch) | |
tree | ef32b816f715ae7217da01217e506c7ed31b537e /ggml/src/ggml-common.h | |
parent | 814d3e054cc5ac483ddb9933aa13ba385e979b68 (diff) |
Interleave 8 rows (Q8_0, IQ4_XS) (#178)
* Try interleaving 8 rows for iq4_xs
On Zen4, PP-512 goes up from ~260 t/s to 288 t/s for L3-8B.
TG-128 reaches max. performance at 2 threads and is slightly
higher than 4 interleaved rows (14.48 t/s vs 13.11 t/s @ 2 threads
and 14.28 t/s @ 4 threads).
* Try interleaving 8 iq4_xs rows
It is also faster on AVX2.
This is the NEON implementation. It is a tiny bit faster than
4 interleaved rows (~0.5%).
So, this looks like a winner given the Zen4/AVX2 improvement
without an associated NEON regression.
* Cleanup
* 8-rows interleaved q8_0 (AVX2)
* 8-rows interleaved q8_0 (Zen4)
* 8-rows interleaved q8_0 (Zen4) - slightly better
PP-512 is now 284 t/s compared to 257 t/s for 4-rows interleaved.
TG-128 reaches peak of 8.16 t/s at just 2 threads compared
to 7.95 t/s @ 4 threads before.
* 8-rows interleaved q8_0 (NEON)
PP-512 is slightly better (138 t/s vs 132.5 t/s), TG-128 is about the
same.
* FA: repack Q8_0 to Q8_0_R8
* Remove special purpose mul_mat_q8_0_r4_q8_1_128 (Zen4)
* FA: repack Q8_0 to Q8_0_R8 (NEON)
Very slightly faster than the general purpose gemm, slightly
slower than the D = 128 special case gemm mul_mat_q8_0_r4_q8_0_128.
Still removing mul_mat_q8_0_r4_q8_0_128 as we simply don't have
enough vector registers to hold 8 interleaved rows, so there is
no point in having the special purpose implementation.
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src/ggml-common.h')
-rw-r--r-- | ggml/src/ggml-common.h | 15 |
1 files changed, 10 insertions, 5 deletions
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 7f79b27b..d08870ad 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -236,6 +236,11 @@ typedef struct { int8_t qs[4*QK8_0]; } block_q8_0_x4; static_assert(sizeof(block_q8_0_x4) == 4*sizeof(block_q8_0), "wrong q8_0_x4 block size/padding"); +typedef struct { + ggml_half d[8]; + int8_t qs[8*QK8_0]; +} block_q8_0_r8; +static_assert(sizeof(block_q8_0_r8) == 8*sizeof(block_q8_0), "wrong q8_0_r8 block size/padding"); typedef struct { ggml_half d[4]; // deltas for 4 q4_0 blocks @@ -534,12 +539,12 @@ typedef struct { static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding"); typedef struct { - ggml_half d[4]; - uint8_t scales_h[QK_K/32]; - uint8_t scales_l[QK_K/16]; - uint8_t qs[QK_K*2]; + ggml_half d[8]; + uint8_t scales_h[QK_K/16]; + uint8_t scales_l[QK_K/ 8]; + uint8_t qs[QK_K*4]; } block_iq4_xs_r4; -static_assert(sizeof(block_iq4_xs_r4) == 4*sizeof(ggml_half) + QK_K/32 + QK_K/16 + QK_K*2, "wrong iq4_xs_rs block size/padding"); +static_assert(sizeof(block_iq4_xs_r4) == 8*sizeof(block_iq4_xs), "wrong iq4_xs_rs block size/padding"); typedef struct { uint8_t scales[QK_K/32]; |