Make IQ1_M work for QK_K = 64 (#6327)

* iq1_m: make it work for QK_K = 64 (WIP)
* iq1_m: make it work for QK_K = 64 (scalar and AVX2)
* iq1_m: QK_K = 64 seems to work on Metal and ARM_NEON

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
author: Kawrakow <48489457+ikawrakow@users.noreply.github.com> 2024-03-27 08:44:27 +0100
committer: GitHub <noreply@github.com> 2024-03-27 08:44:27 +0100
commit: cbc83436197cde617cad696e665879c20df77daa (patch)
tree: 303401b02237154b13d8cb9cbb32f0df8dcacdbb /ggml-common.h
parent: e562b9714b9b3e242361a7f74bbbeb00f6bd99ac (diff)
1 files changed, 9 insertions, 2 deletions
diff --git a/ggml-common.h b/ggml-common.h
index 517c9bb4..b2d67d5d 100644
--- a/ggml-common.h
+++ b/ggml-common.h
@@ -377,13 +377,20 @@ typedef struct {
 } block_iq1_s;
 static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
 
-// 1.8125 bpw
+// 1.75 bpw
 typedef struct {
     uint8_t  qs[QK_K/8];      // grid index, low 8 bits
     uint8_t  qh[QK_K/16];     // grid index, high 3 bits + grid shift bit (for two groups of 8)
-    uint8_t  scales[QK_K/32]; // 4-bit block scales
+#if QK_K == 64
+    ggml_half d;
+#endif
+    uint8_t  scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
 } block_iq1_m;
+#if QK_K == 64
+static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
+#else
 static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
+#endif
 
 // Used by IQ1_M quants
 typedef union {
author	Kawrakow <48489457+ikawrakow@users.noreply.github.com>	2024-03-27 08:44:27 +0100
committer	GitHub <noreply@github.com>	2024-03-27 08:44:27 +0100
commit	cbc83436197cde617cad696e665879c20df77daa (patch)
tree	303401b02237154b13d8cb9cbb32f0df8dcacdbb /ggml-common.h
parent	e562b9714b9b3e242361a7f74bbbeb00f6bd99ac (diff)