Use bf16 instead of fp16 block scales for q8_1 (#292)

* WIP - not working * q8_0 without bells and whistles works * It works for q8_0 * Use bf16 instead of f16,int16 * q4_0_r8 * q5_0_r4 * q6_0_r4 * Also q4_1 and q5_1 * q8_0_r8 on avx2 --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
author: Kawrakow <iwankawrakow@gmail.com> 2025-03-27 05:49:16 +0100
committer: GitHub <noreply@github.com> 2025-03-27 05:49:16 +0100
commit: d0b52076da0261f291b01f1ffa44884c8b2cdb1c (patch)
tree: 93abea8ae30140fbd6733af91eede57c2243e91d /ggml/src/ggml-common.h
parent: a22250df93fd833a6cb7f310b159ad1b54e4d582 (diff)
1 files changed, 14 insertions, 0 deletions
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index 4308f0b9..59702e32 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -266,6 +266,20 @@ typedef struct {
 } block_q8_0x8;
 static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
 
+#define QK8_2 32 // number of 8-bit quants stored in one q8_2 block
+typedef struct { // one q8_2 block: two 16-bit header fields followed by QK8_2 int8 quants
+    uint16_t d; // NOTE(review): presumably the block scale stored as raw bf16 bits (per the commit title) - confirm against the quantization code
+    uint16_t s; // NOTE(review): presumably a per-block sum term in the same 16-bit encoding, as in q8_1 - confirm
+    int8_t qs[QK8_2]; // quants: QK8_2 signed 8-bit values
+} block_q8_2;
+static_assert(sizeof(block_q8_2) == sizeof(ggml_half) + sizeof(int16_t) + QK8_2, "wrong q8_2 block size/padding"); // rejects padding; size is spelled via ggml_half/int16_t although the fields are uint16_t, so it relies on those having the same size
+
+typedef struct { // packed group with the same total size as four block_q8_2: all 16-bit header fields first, then all quants
+    uint16_t d[8]; // 8 x 16 bits = the d and s fields of four blocks; NOTE(review): presumably four d values then four s values - ordering not visible here, confirm
+    int8_t qs[4*QK8_2]; // quants of the four blocks; NOTE(review): per-block layout inside this array is not shown here
+} block_q8_2_x4;
+static_assert(sizeof(block_q8_2_x4) == 4*sizeof(block_q8_2), "wrong q8_2_x4 block size/padding"); // the grouped form must occupy exactly as many bytes as four separate blocks
+
 //
 // Super-block quantization structures
 //
author	Kawrakow <iwankawrakow@gmail.com>	2025-03-27 05:49:16 +0100
committer	GitHub <noreply@github.com>	2025-03-27 05:49:16 +0100
commit	d0b52076da0261f291b01f1ffa44884c8b2cdb1c (patch)
tree	93abea8ae30140fbd6733af91eede57c2243e91d /ggml/src/ggml-common.h
parent	a22250df93fd833a6cb7f310b159ad1b54e4d582 (diff)