path: root/ggml-common.h
author    Iwan Kawrakow <iwan.kawrakow@gmail.com>  2024-06-18 12:00:16 +0300
committer Iwan Kawrakow <iwan.kawrakow@gmail.com>  2024-06-22 12:02:52 +0300
commit    8c6276f6a1c6d9d82b5f0114d838fcc4f277234a (patch)
tree      6f80e1f3b1017d3d3e88082db0399c6884bc9725 /ggml-common.h
parent    1de6476d751a02978b035feb38066462c4382877 (diff)
Bitnet: 2.25 bpw version
Just scalar and AVX2 for now. PP-512 is even faster (325 t/s on the Ryzen-7950X, 404 t/s on the Ryzen-5975WX). We lose ~6-7% for TG because we are memory bound and the model is 10% larger.
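For reference, the 2.25 bpw figure follows from the new block layout in the diff below: QK_IQ2BN = 64 weights packed at 2 bits each (16 bytes of qs) plus a 2-byte fp16 scale d, i.e. 18 bytes per 64 weights = 2.25 bits per weight. A minimal, standalone C sketch of that arithmetic (uint16_t stands in for ggml_half, which is also 2 bytes; the struct name here is a mock, not the repo's type):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define QK_IQ2BN 64

/* Mock of the new block layout: a 2-byte half-precision scale
   followed by QK_IQ2BN/4 = 16 bytes of 2-bit packed quants. */
typedef struct {
    uint16_t d;                 /* stand-in for ggml_half (fp16 block scale) */
    uint8_t  qs[QK_IQ2BN / 4];  /* 64 ternary weights, 2 bits each */
} block_iq2_bn_mock;

int main(void) {
    size_t bytes = sizeof(block_iq2_bn_mock);        /* 2 + 16 = 18 bytes */
    double bpw   = (double)(bytes * 8) / QK_IQ2BN;   /* 144 / 64 = 2.25 */
    assert(bytes == sizeof(uint16_t) + QK_IQ2BN / 4);
    printf("%zu bytes per block -> %.2f bits per weight\n", bytes, bpw);
    return 0;
}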
Diffstat (limited to 'ggml-common.h')
-rw-r--r--  ggml-common.h  |  5
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/ggml-common.h b/ggml-common.h
index c7f865e8..f5a35960 100644
--- a/ggml-common.h
+++ b/ggml-common.h
@@ -381,13 +381,14 @@ typedef struct {
} block_iq1_bn;
static_assert(sizeof(block_iq1_bn) == sizeof(uint16_t) + QK_IQ1BN/8 + QK_IQ1BN/16, "wrong iq1_bn block size/padding");
//
-// Bitnet - implemented as 2.0 bpw
+// Bitnet - implemented as 2.25 bpw
//
#define QK_IQ2BN 64
typedef struct {
+ ggml_half d;
uint8_t qs[QK_IQ2BN/4];
} block_iq2_bn;
-static_assert(sizeof(block_iq2_bn) == QK_IQ2BN/4, "wrong iq2_bn block size/padding");
+static_assert(sizeof(block_iq2_bn) == sizeof(ggml_half) + QK_IQ2BN/4, "wrong iq2_bn block size/padding");
// Used by IQ1_M quants
typedef union {