path: root/ggml/src/ggml-quants.h
author     Kawrakow <iwankawrakow@gmail.com>   2024-12-06 12:15:39 +0100
committer  GitHub <noreply@github.com>         2024-12-06 12:15:39 +0100
commit     3682e4700db6b8cb2ca8e3da365578078f21ab0c (patch)
tree       ea1680494ca00580b0a038cdef035c596e80e58c /ggml/src/ggml-quants.h
parent     f64de08203aaee95ca755336de3e1db85d990198 (diff)
iq2_bn_r4: fastest Bitnet CPU implementation on the planet (#124)
* Adding iq2_bn_r4

  This Zen4-only implementation achieves PP-512 = 826 t/s (!!!) for Bitnet-1.58b-3B, up from 620 t/s for iq2_bn.

* Make sure rows per thread are a multiple of the number of interleaved rows (sketched below)

  With this I can run iq2_bn_r4 with 32 threads, and this increases PP-512 to 872 t/s.

* iq2_bn_r4: 1st shot at NEON

  PP-512 is already faster than iq2_bn (284 t/s vs 246 t/s for Bitnet-1.58b-3B). TG-128 is ~5% slower.

* iq2_bn_r4: NEON

  PP-512 is now 296 t/s. TG-128 is ~20% faster than iq2_bn for 1 thread, but saturates to about the same 93 t/s at 8 threads.

* iq2_bn_r4: Experimenting on NEON

  The matrix x vector multiplication is erratic. iq2_bn_r4 is faster at 1, 2, and 4 threads, but saturates to a lower t/s at 8 threads compared to iq2_bn. iq2_bn actually manages 99 t/s at 8 threads, and not 93 as I wrote in the last commit. iq2_bn_r4 performance has huge fluctuations at 4 and 8 threads.

* Some cleanup

* iq2_bn_r4: AVX2

  As expected, PP is slightly slower as we just don't have enough vector registers (690 vs 710 t/s). TG is slightly faster (18.2 vs 16.7 t/s at 1 thread).

* iq2_bn_r4: use AVX2 implementation on Zen4 for matrix x vector

  It is faster - we get 29.6 t/s at 1 thread vs 25.9 t/s for iq2_bn.

* iq2_bn_r4: simdify q8_K16 quantization (AVX2)

  PP-512 becomes 834 t/s and TG-128 now saturates to the same performance as iq2_bn for 4 threads.

* iq2_bn_r4: simdify q8_K16 quantization (NEON)

  PP-512 is now 304.7 t/s, and TG-128 @ 8 threads very slightly outperforms iq2_bn (100.7 t/s vs 99.6 t/s).

* iq2_bn_r4: fix AVX2 after breaking it two commits ago

* iq2_bn_r4: better AVX2 (see the second sketch below)

  As we don't have enough vector registers on AVX2, it is better to do two passes per row, needing only half of the accumulator registers that way. With this, we now beat iq2_bn PP also on AVX2 by a small margin.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
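The row-splitting rule above ("rows per thread are a multiple of the number of interleaved rows") can be illustrated with a minimal C sketch. The function name and its signature are hypothetical, not the actual ik_llama.cpp code; the only fact taken from the commit is that the _r4 layout interleaves 4 rows, so a thread boundary must never fall inside an interleaved group.

#include <stdint.h>

// Hypothetical helper: give thread 'ith' of 'nthreads' a contiguous chunk of
// rows whose size is a multiple of 'rows_interleaved' (4 for the _r4 layouts).
static void split_rows_r4(int64_t nrows, int nthreads, int ith,
                          int rows_interleaved,
                          int64_t * first_row, int64_t * nrows_for_thread) {
    int64_t ngroups    = nrows / rows_interleaved;            // nrows assumed divisible by 4
    int64_t per_thread = (ngroups + nthreads - 1) / nthreads; // groups per thread, rounded up
    int64_t g0 = per_thread * ith;
    if (g0 > ngroups) g0 = ngroups;
    int64_t g1 = g0 + per_thread;
    if (g1 > ngroups) g1 = ngroups;
    *first_row        = g0 * rows_interleaved;
    *nrows_for_thread = (g1 - g0) * rows_interleaved;
}

// Example: nrows = 3200, nthreads = 32, rows_interleaved = 4
//   -> 800 groups, 25 groups per thread, i.e. 100 rows per thread,
//      so every chunk stays a multiple of 4 even at 32 threads.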
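The "better AVX2" item reasons that, with only 16 ymm registers, it pays to walk each row block twice and keep only half of the accumulators live per pass. Below is a hedged sketch of that idea for a plain float matrix x vector product; the function name, the 8-rows-per-block choice, and the float (rather than quantized) data are assumptions for illustration, not the kernel in this repository.

#include <immintrin.h>   // AVX2 + FMA intrinsics (compile with -mavx2 -mfma)
#include <stddef.h>

// Hypothetical GEMV: compute y[0..7] = dot(x, row r of w) for 8 rows in two
// passes of 4 rows each, so only 4 ymm accumulators are live at a time
// instead of 8, leaving registers free for the loaded operands.
static void gemv_8rows_two_pass(const float * x, const float * w, float * y, int ncols) {
    for (int pass = 0; pass < 2; ++pass) {
        __m256 acc[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(),
                          _mm256_setzero_ps(), _mm256_setzero_ps() };
        for (int j = 0; j < ncols; j += 8) {                 // ncols assumed a multiple of 8
            __m256 xv = _mm256_loadu_ps(x + j);
            for (int k = 0; k < 4; ++k) {
                const float * row = w + (size_t)(4*pass + k) * ncols + j;
                acc[k] = _mm256_fmadd_ps(xv, _mm256_loadu_ps(row), acc[k]);
            }
        }
        for (int k = 0; k < 4; ++k) {                        // horizontal sum of the 8 lanes
            __m128 s = _mm_add_ps(_mm256_castps256_ps128(acc[k]),
                                  _mm256_extractf128_ps(acc[k], 1));
            s = _mm_hadd_ps(s, s);
            s = _mm_hadd_ps(s, s);
            y[4*pass + k] = _mm_cvtss_f32(s);
        }
    }
}

The trade-off is that x is streamed twice, but for matrix x vector work the extra loads are cheap compared with the spills a single 8-accumulator pass would cause on AVX2, which matches the commit's observation that the two-pass variant wins by a small margin.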
Diffstat (limited to 'ggml/src/ggml-quants.h')
-rw-r--r--   ggml/src/ggml-quants.h   7
1 file changed, 0 insertions, 7 deletions
diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
index a40a6d37..b6d69011 100644
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -33,7 +33,6 @@ void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_REST
void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_K64_ref(const float * GGML_RESTRICT x, block_q8_K64 * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_xxs_ref(const float * GGML_RESTRICT x, block_iq2_xxs * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_xs_ref (const float * GGML_RESTRICT x, block_iq2_xs * GGML_RESTRICT y, int64_t k);
@@ -43,7 +42,6 @@ void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGM
void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
void quantize_row_iq1_bn_ref (const float * GGML_RESTRICT x, block_iq1_bn * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq2_bn_ref (const float * GGML_RESTRICT x, block_iq2_bn * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -59,7 +57,6 @@ void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_K64(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -69,7 +66,6 @@ void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq1_bn (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq2_bn (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
// Dequantization
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
@@ -97,7 +93,6 @@ void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq1_bn (const block_iq1_bn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq2_bn (const block_iq2_bn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -123,7 +118,6 @@ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_bn_q8_K64(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq2_bn_q8_K64(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
@@ -136,7 +130,6 @@ size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT ds
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq1_bn (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq2_bn (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);