path: root/ggml-common.h
author    Iwan Kawrakow <iwan.kawrakow@gmail.com>  2024-07-17 08:54:11 +0300
committer Iwan Kawrakow <iwan.kawrakow@gmail.com>  2024-07-17 08:54:11 +0300
commit    873a790ee22538d1d9d7205db7210c70955ab1e1
tree      426f0ce20cf45c325be28ae8bceb51d42c072452 /ggml-common.h
parent    52a25e307c3af8686436d977c60e9975b0900e2b
iq1bn(no lookup): better version
We have 4 groups of 16 in a block of 64 quants. For each group of 16 we have 3 groups of 5, each using 8 bits. The remaining 16th quants of the 4 groups of 16 are encoded with 8 bits using the same encoding as the groups of 5. The only kernel where we have complications is the CUDA dequantize kernel (because we are dequantizing 8 quants there, and we have different encodings for the 1st and 2nd group of 8 in a group of 16). This achieves better performance on all tested platforms than any previous 1.625 bpw attempt. We have:

| model            |       size |  params | backend | threads |  test |             t/s |
| ---------------- | ---------: | ------: | ------- | ------: | ----: | --------------: |
| 1.625 bpw Bitnet | 729.64 MiB |  3.32 B | CUDA    |       8 | pp512 | 9613.02 ± 24.54 |
| 1.625 bpw Bitnet | 729.64 MiB |  3.32 B | CUDA    |       8 | tg128 |   229.85 ± 0.33 |
| 1.625 bpw Bitnet | 729.64 MiB |  3.32 B | AVX2    |      16 | pp512 |   322.59 ± 1.00 |
| 1.625 bpw Bitnet | 729.64 MiB |  3.32 B | AVX2    |      16 | tg128 |    59.79 ± 0.03 |
| 1.625 bpw Bitnet | 729.64 MiB |  3.32 B | AVX2    |       8 | tg128 |    57.62 ± 0.21 |
| 1.625 bpw Bitnet | 729.64 MiB |  3.32 B | AVX2    |       4 | tg128 |    33.66 ± 0.29 |
| 1.625 bpw Bitnet | 729.64 MiB |  3.32 B | AVX2    |       2 | tg128 |    18.30 ± 0.01 |
| 1.625 bpw Bitnet | 729.64 MiB |  3.32 B | Metal   |       8 | pp512 |   698.13 ± 0.21 |
| 1.625 bpw Bitnet | 729.64 MiB |  3.32 B | Metal   |       8 | tg128 |    68.88 ± 0.24 |
| 1.625 bpw Bitnet | 729.64 MiB |  3.32 B | NEON    |       8 | pp512 |   196.80 ± 0.50 |
| 1.625 bpw Bitnet | 729.64 MiB |  3.32 B | NEON    |       8 | tg128 |    51.58 ± 0.41 |
| 1.625 bpw Bitnet | 729.64 MiB |  3.32 B | NEON    |       4 | tg128 |    30.80 ± 0.03 |
| 1.625 bpw Bitnet | 729.64 MiB |  3.32 B | NEON    |       2 | tg128 |    16.89 ± 0.01 |

It is still slower than 2 bpw Bitnet, but the difference now is not as dramatic.
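The layout described above (13 bytes per 64 ternary quants = 1.625 bpw) can be illustrated with a minimal reference sketch in C. It assumes plain base-3 packing of 5 ternary digits per byte (3^5 = 243 <= 256) and a simple group ordering; the actual lookup-free kernels in this commit decode differently, and the exact bit/group ordering of `ql` and `extra` here is an assumption made purely for illustration.

```c
#include <stdint.h>

#define QK_IQ1BN 64

// Mirrors the struct introduced in this commit: 12 bytes of packed quants plus
// one "extra" byte, i.e. 13 bytes for 64 ternary values.
typedef struct {
    uint8_t ql[12]; // 4 groups of 16 quants: 3 bytes per group, each byte packs 5 quants
    uint8_t extra;  // the four 16th quants of the groups, packed the same way
} block_iq1_bn;

// Hypothetical reference dequantization. Each byte is treated as a base-3 number
// whose digits are the ternary quants, mapped from {0,1,2} to {-1,0,+1}.
static void dequantize_iq1_bn_ref(const block_iq1_bn *x, int8_t y[QK_IQ1BN]) {
    for (int g = 0; g < 4; ++g) {              // 4 groups of 16 quants
        for (int k = 0; k < 3; ++k) {          // 3 bytes -> the first 15 quants of the group
            uint8_t b = x->ql[3*g + k];
            for (int i = 0; i < 5; ++i) {
                y[16*g + 5*k + i] = (int8_t)(b % 3) - 1;
                b /= 3;
            }
        }
    }
    uint8_t e = x->extra;                      // the four remaining 16th quants
    for (int g = 0; g < 4; ++g) {
        y[16*g + 15] = (int8_t)(e % 3) - 1;
        e /= 3;
    }
}
```

The per-digit division above is only meant to show why 5 quants fit into 8 bits; the commit's "no lookup" kernels avoid this by decoding with multiply/shift tricks instead.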
Diffstat (limited to 'ggml-common.h')
-rw-r--r--  ggml-common.h  |  5
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/ggml-common.h b/ggml-common.h
index bf95da2a..f515e95c 100644
--- a/ggml-common.h
+++ b/ggml-common.h
@@ -380,11 +380,10 @@ static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m bl
//
#define QK_IQ1BN 64
typedef struct {
+ uint8_t ql[12];
uint8_t extra;
- uint8_t ql[QK_IQ1BN/8];
- uint8_t qh[QK_IQ1BN/16];
} block_iq1_bn;
-static_assert(sizeof(block_iq1_bn) == sizeof(uint8_t) + QK_IQ1BN/8 + QK_IQ1BN/16, "wrong iq1_bn block size/padding");
+static_assert(sizeof(block_iq1_bn) == 13, "wrong iq1_bn block size/padding");
//
// Bitnet - implemented as 2.25 bpw
//