From 45fae1a14444622478774f9a417e1d417af1ca46 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Mon, 14 Jul 2025 18:55:08 +0200
Subject: Adding IQ2_KL (#602)

* Experiments for 2.6875 bpw quants

At least according to RMSE, this is significantly better than q2_K, while using only 1/16 more bits per weight.

* iq2_kl: basics

* iq2_kl: CUDA dequantize

* iq2_kl: small improvement in PPL

Also check the two neighbouring values for the block scale and use the one that minimizes RMSE.

* iq2_kl: MMQ

Quite good: PP-512(L3-8B) = 8472 t/s.

* iq2_kl: MMVQ

We get TG-128(L3-8B) = 162 t/s, which means this is not quite as good as it should be, since q2_K at (almost) the same bpw is at 170 t/s.

* iq2_kl: Zen4 GEMM/GEMV

Not particularly fast. I may need to think about rearranging the bits.

* iq2_kl: better Zen4

* iq2_kl: convert/repack to q8_k_r8 (AVX2)

* iq2_kl: AVX2 GEMM/GEMV

* iq2_kl: WIP NEON

The compiler started crashing!!!

* iq2_kl: NEON

Had to work around a compiler crash when using vzip2q_u8 by using vqtbl2q_u8 instead.

* iq2_kl: convert/repack to q8_k_r8 (NEON)

* iq2_kl: Metal dequantize

* iq2_kl: Metal GEMV - pretty slow

* iq2_kl: Metal GEMV - slightly better (40 t/s -> 44.5 t/s)

* iq2_kl: Metal GEMV - slightly better (44.5 t/s -> 46.5 t/s)

* iq2_kl: Metal GEMV - slightly better (46.5 t/s -> 47.2 t/s)

* iq2_kl: slightly better Metal dequantize

PP-512 goes to 476 t/s, up from 466 t/s.

* iq2_kl: slightly better Metal dequantize

PP-512 goes to 492 t/s, up from 476 t/s.

* Add iq2_kl to constants.py

---------

Co-authored-by: Iwan Kawrakow
---
 ggml/src/ggml-common.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'ggml/src/ggml-common.h')

diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index a1f97911..6dc439b8 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -606,6 +606,14 @@ typedef struct {
 } block_iq2_k;
 static_assert(sizeof(block_iq2_k) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/32 + QK_K/4, "wrong iq2_k block size/padding");
 
+typedef struct {
+    uint16_t scales_h;
+    uint8_t scales_l[QK_K/64];
+    uint8_t qs[QK_K/4];
+    uint8_t qh[QK_K/16];
+} block_iq2_kl;
+static_assert(sizeof(block_iq2_kl) == sizeof(uint16_t) + QK_K/64 + QK_K/4 + QK_K/16, "wrong iq2_kl block size/padding");
+
 typedef struct {
     ggml_half d[4];
     uint8_t extra[8];
@@ -2164,6 +2172,12 @@ GGML_TABLE_BEGIN(int8_t, iq2nl_values, 8)
     -31, -13, 1, 17, -26, -8, 6, 22
 GGML_TABLE_END()
 
+GGML_TABLE_BEGIN(uint16_t, iq2kl_values, 32)
+    0xe9c1, 0x0dc1, 0xc1d8, 0xf6d8, 0x0dd8, 0x2fd8, 0xd8e9, 0xe9e9, 0x01e9, 0x0de9, 0x1ce9, 0xc1f6, 0x01f6, 0x0df6, 0x2ff6, 0xe901,
+    0xf601, 0x0101, 0x0d01, 0x1c01, 0xd80d, 0xe90d, 0xf60d, 0x010d, 0x0d0d, 0xc11c, 0xe91c, 0x011c, 0x1c1c, 0x2f1c, 0xe92f, 0x0d2f,
+GGML_TABLE_END()
+
+
 GGML_TABLE_BEGIN(int8_t, iq3nl_values, 16)
     -63, -40, -23, -10, 1, 13, 28, 47,
     -59, -36, -19, -6, 5, 17, 32, 51,
-- 
cgit v1.2.3
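
The 2.6875 bpw figure in the commit message follows directly from the new block_iq2_kl layout: 2 + 4 + 64 + 16 = 86 bytes per QK_K-weight super-block. The standalone C sketch below checks that arithmetic and prints one plausible reading of the new iq2kl_values table, where each uint16_t packs a pair of signed 8-bit codebook values; QK_K = 256 and that pair interpretation are assumptions made for illustration, not something the patch itself states.

// Standalone sketch (not part of the patch): verify the bits-per-weight
// implied by block_iq2_kl and decode iq2kl_values as pairs of int8 values.
#include <stdint.h>
#include <stdio.h>

#define QK_K 256   // usual ggml super-block size; assumed here

typedef struct {
    uint16_t scales_h;
    uint8_t scales_l[QK_K/64];
    uint8_t qs[QK_K/4];
    uint8_t qh[QK_K/16];
} block_iq2_kl;

static const uint16_t iq2kl_values[32] = {
    0xe9c1, 0x0dc1, 0xc1d8, 0xf6d8, 0x0dd8, 0x2fd8, 0xd8e9, 0xe9e9, 0x01e9, 0x0de9, 0x1ce9, 0xc1f6, 0x01f6, 0x0df6, 0x2ff6, 0xe901,
    0xf601, 0x0101, 0x0d01, 0x1c01, 0xd80d, 0xe90d, 0xf60d, 0x010d, 0x0d0d, 0xc11c, 0xe91c, 0x011c, 0x1c1c, 0x2f1c, 0xe92f, 0x0d2f,
};

int main(void) {
    // 2 + 4 + 64 + 16 = 86 bytes per 256 weights -> 86*8/256 = 2.6875 bpw,
    // matching the "2.6875 bpw quants" in the commit message.
    printf("bits per weight: %g\n", 8.0*sizeof(block_iq2_kl)/QK_K);

    // Reading each table entry as two int8 bytes gives a pair of codebook
    // values; the distinct bytes are -63, -40, -23, -10, 1, 13, 28, 47,
    // i.e. the same values as the first eight iq3nl_values entries above.
    for (int i = 0; i < 32; ++i) {
        int8_t lo = (int8_t)(iq2kl_values[i] & 0xff);
        int8_t hi = (int8_t)(iq2kl_values[i] >> 8);
        printf("code %2d -> (%4d, %4d)\n", i, lo, hi);
    }
    return 0;
}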
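
The "check the two neighbouring values for the block scale" step mentioned under the PPL improvement is a small search around the initially chosen sub-block scale. The sketch below only illustrates that idea under generic assumptions; round_to_grid, block_sse, best_neighbour_scale and the scale parametrization are hypothetical stand-ins, not the actual quantize_iq2_kl code.

// Illustration of the block-scale refinement: after picking an integer
// sub-block scale, also try its two neighbours and keep whichever gives the
// smallest squared reconstruction error.
#include <math.h>

// Pick the codebook value that best reconstructs x under the given scale
// (stand-in for the real per-type grid lookup).
static float round_to_grid(float x, float scale, const float * grid, int ngrid) {
    float best = grid[0], best_err = fabsf(x - scale*grid[0]);
    for (int i = 1; i < ngrid; ++i) {
        float err = fabsf(x - scale*grid[i]);
        if (err < best_err) { best_err = err; best = grid[i]; }
    }
    return best;
}

// Squared error of a sub-block when every weight is re-rounded to the grid
// under the candidate scale.
static float block_sse(const float * x, int n, float scale, const float * grid, int ngrid) {
    float sse = 0;
    for (int i = 0; i < n; ++i) {
        float g = round_to_grid(x[i], scale, grid, ngrid);
        float diff = x[i] - scale*g;
        sse += diff*diff;
    }
    return sse;
}

// Given an initial integer sub-block scale ls (applied as d*ls), test ls-1 and
// ls+1 as well and return the scale index with the smallest error.
static int best_neighbour_scale(const float * x, int n, float d, int ls,
                                const float * grid, int ngrid, int ls_min, int ls_max) {
    int best_ls = ls;
    float best = block_sse(x, n, d*ls, grid, ngrid);
    for (int dl = -1; dl <= 1; dl += 2) {
        int cand = ls + dl;
        if (cand < ls_min || cand > ls_max) continue;
        float sse = block_sse(x, n, d*cand, grid, ngrid);
        if (sse < best) { best = sse; best_ls = cand; }
    }
    return best_ls;
}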
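
On the NEON note: vzip2q_u8(a, b) interleaves the upper eight lanes of a and b, and the same permutation can be expressed as a two-table lookup with vqtbl2q_u8 and a constant index vector, which is presumably the kind of substitution the commit refers to. The sketch below shows only that generic equivalence; the actual iq2_kl NEON kernels may arrange the data differently.

// Generic equivalence: emulate vzip2q_u8 with vqtbl2q_u8.
#if defined(__aarch64__)
#include <arm_neon.h>

static inline uint8x16_t zip2_via_tbl(uint8x16_t a, uint8x16_t b) {
    // Indices 0..15 select from a, 16..31 from b; picking 8,24,9,25,...,15,31
    // reproduces the a8,b8,a9,b9,... pattern produced by vzip2q_u8(a, b).
    static const uint8_t k_idx[16] = { 8,24, 9,25, 10,26, 11,27, 12,28, 13,29, 14,30, 15,31 };
    uint8x16x2_t tbl = { { a, b } };
    return vqtbl2q_u8(tbl, vld1q_u8(k_idx));
}
#endif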