From 873a790ee22538d1d9d7205db7210c70955ab1e1 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Date: Wed, 17 Jul 2024 08:54:11 +0300
Subject: iq1bn(no lookup): better version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We have 4 groups of 16 in a block of 64 quants.
For each group of 16 we have 3 groups of 5, each using 8 bits.
The remaining 16th quants of the 4 groups of 16 are encoded
with 8 bits using the same encoding as the groups of 5.
The only kernel where we have complications is the CUDA dequantize
kernel (because we are dequantizing 8 quants there, and we have
different encoding for the 1st and 2nd group of 8 in a group of 16).

This achieves better performance on all tested platforms than
any previous 1.625 bpw attempt. We have:

| model            |       size |     params | backend    | threads |          test |              t/s |
| ---------------- | ---------: | ---------: | ---------- | ------: | ------------: | ---------------: |
| 1.625 bpw Bitnet | 729.64 MiB |     3.32 B | CUDA       |       8 |         pp512 |  9613.02 ± 24.54 |
| 1.625 bpw Bitnet | 729.64 MiB |     3.32 B | CUDA       |       8 |         tg128 |    229.85 ± 0.33 |

| 1.625 bpw Bitnet | 729.64 MiB |     3.32 B | AVX2       |      16 |         pp512 |    322.59 ± 1.00 |
| 1.625 bpw Bitnet | 729.64 MiB |     3.32 B | AVX2       |      16 |         tg128 |     59.79 ± 0.03 |
| 1.625 bpw Bitnet | 729.64 MiB |     3.32 B | AVX2       |       8 |         tg128 |     57.62 ± 0.21 |
| 1.625 bpw Bitnet | 729.64 MiB |     3.32 B | AVX2       |       4 |         tg128 |     33.66 ± 0.29 |
| 1.625 bpw Bitnet | 729.64 MiB |     3.32 B | AVX2       |       2 |         tg128 |     18.30 ± 0.01 |

| 1.625 bpw Bitnet | 729.64 MiB |     3.32 B | Metal      |       8 |         pp512 |    698.13 ± 0.21 |
| 1.625 bpw Bitnet | 729.64 MiB |     3.32 B | Metal      |       8 |         tg128 |     68.88 ± 0.24 |

| 1.625 bpw Bitnet | 729.64 MiB |     3.32 B | NEON       |       8 |         pp512 |    196.80 ± 0.50 |
| 1.625 bpw Bitnet | 729.64 MiB |     3.32 B | NEON       |       8 |         tg128 |     51.58 ± 0.41 |
| 1.625 bpw Bitnet | 729.64 MiB |     3.32 B | NEON       |       4 |         tg128 |     30.80 ± 0.03 |
| 1.625 bpw Bitnet | 729.64 MiB |     3.32 B | NEON       |       2 |         tg128 |     16.89 ± 0.01 |

It is still slower than 2 bpw Bitnet, but the difference now is not as
dramatic.
---
 ggml-common.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'ggml-common.h')

diff --git a/ggml-common.h b/ggml-common.h
index bf95da2a..f515e95c 100644
--- a/ggml-common.h
+++ b/ggml-common.h
@@ -380,11 +380,10 @@ static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m bl
 //
 #define QK_IQ1BN 64
 typedef struct {
+    uint8_t ql[12];
     uint8_t extra;
-    uint8_t ql[QK_IQ1BN/8];
-    uint8_t qh[QK_IQ1BN/16];
 } block_iq1_bn;
-static_assert(sizeof(block_iq1_bn) == sizeof(uint8_t) + QK_IQ1BN/8 + QK_IQ1BN/16, "wrong iq1_bn block size/padding");
+static_assert(sizeof(block_iq1_bn) == 13, "wrong iq1_bn block size/padding");
 //
 // Bitnet - implemented as 2.25 bpw
 //
-- 
cgit v1.2.3