path: root/ggml-common.h
author    Iwan Kawrakow <iwan.kawrakow@gmail.com>  2024-06-18 12:00:16 +0300
committer Iwan Kawrakow <iwan.kawrakow@gmail.com>  2024-06-22 12:02:52 +0300
commit    8c6276f6a1c6d9d82b5f0114d838fcc4f277234a (patch)
tree      6f80e1f3b1017d3d3e88082db0399c6884bc9725 /ggml-common.h
parent    1de6476d751a02978b035feb38066462c4382877 (diff)
Bitnet: 2.25 bpw version
Just scalar and AVX2 for now. PP-512 is even faster (325 t/s on the Ryzen-7950X, 404 t/s on the Ryzen-5975WX). We lose ~6-7% for TG because we are memory bound and the model is 10% larger.
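For reference, the 2.25 bpw figure follows from the new block layout in the diff below: QK_IQ2BN = 64 weights packed at 2 bits each (16 bytes of qs) plus a 2-byte fp16 scale d, i.e. 18 bytes per 64 weights = 2.25 bits per weight. A minimal, standalone C sketch of that arithmetic (uint16_t stands in for ggml_half, which is also 2 bytes; the struct name here is a mock, not the repo's type):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define QK_IQ2BN 64

/* Mock of the new block layout: a 2-byte half-precision scale
   followed by QK_IQ2BN/4 = 16 bytes of 2-bit packed quants. */
typedef struct {
    uint16_t d;                 /* stand-in for ggml_half (fp16 block scale) */
    uint8_t  qs[QK_IQ2BN / 4];  /* 64 ternary weights, 2 bits each */
} block_iq2_bn_mock;

int main(void) {
    size_t bytes = sizeof(block_iq2_bn_mock);        /* 2 + 16 = 18 bytes */
    double bpw   = (double)(bytes * 8) / QK_IQ2BN;   /* 144 / 64 = 2.25 */
    assert(bytes == sizeof(uint16_t) + QK_IQ2BN / 4);
    printf("%zu bytes per block -> %.2f bits per weight\n", bytes, bpw);
    return 0;
}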
Diffstat (limited to 'ggml-common.h')
-rw-r--r--  ggml-common.h  |  5
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/ggml-common.h b/ggml-common.h
index c7f865e8..f5a35960 100644
--- a/ggml-common.h
+++ b/ggml-common.h
@@ -381,13 +381,14 @@ typedef struct {
} block_iq1_bn;
static_assert(sizeof(block_iq1_bn) == sizeof(uint16_t) + QK_IQ1BN/8 + QK_IQ1BN/16, "wrong iq1_bn block size/padding");
//
-// Bitnet - implemented as 2.0 bpw
+// Bitnet - implemented as 2.25 bpw
//
#define QK_IQ2BN 64
typedef struct {
+ ggml_half d;
uint8_t qs[QK_IQ2BN/4];
} block_iq2_bn;
-static_assert(sizeof(block_iq2_bn) == QK_IQ2BN/4, "wrong iq2_bn block size/padding");
+static_assert(sizeof(block_iq2_bn) == sizeof(ggml_half) + QK_IQ2BN/4, "wrong iq2_bn block size/padding");
// Used by IQ1_M quants
typedef union {