From 7c4263d4261d6ee6f0539d53eb9e1b4d120ba8af Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Wed, 28 Feb 2024 10:37:02 +0200 Subject: ggml : make i-quants work with super-blocks of 64 (CPU,Metal) (#5760) * WIP: make i-quants work for QK_K = 64 * iq2_xs: attempt to fix AVX dot product for QK_K = 64 Tests pass, but I get gibberish. * QK_K = 64 tests pass on ARM_NEON and Metal Sadly, that does not mean it actually works. * Make CUDA compile with QK_K = 64 Tests don't pass, plus we get misaligned access * Q2_K: fixed bug in imatrix quantization for QK_K = 64 * iq1_s: turn off SIMD implementation for QK_K = 64 (it does not work) --------- Co-authored-by: Iwan Kawrakow --- ggml-quants.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'ggml-quants.h') diff --git a/ggml-quants.h b/ggml-quants.h index 2c61134c..316e3568 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -230,6 +230,10 @@ typedef struct { } block_iq4_nl; static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding"); +#if QK_K == 64 +#define block_iq4_xs block_iq4_nl +//typedef struct block_iq4_nl block_iq4_xs; +#else typedef struct { ggml_fp16_t d; uint16_t scales_h; @@ -237,6 +241,7 @@ typedef struct { uint8_t qs[QK_K/2]; } block_iq4_xs; static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding"); +#endif #ifdef __cplusplus extern "C" { -- cgit v1.2.3