From aa14a06b44ff12be7e4461a6e169a657275a5b20 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 25 Jun 2024 11:32:48 +0300 Subject: Bitnet: trying an alternative iq1_bn grid Faster on CUDA. The scalar version is faster too. The issue with CUDA is that now I see wild performance fluctuations. Running llama-bench I can get 220 t/s for TG-128 one time, and 190 t/s another time, with uncertainties of 1-2 t/s. Same for PP, results are jumping back and forth between ~9500 t/s and ~8900 t/s. So, basically no reliable measurement at this point, but for sure faster than the previous version, which was at around 170-180 t/s. --- iqk_mul_mat.cpp | 1 + 1 file changed, 1 insertion(+) (limited to 'iqk_mul_mat.cpp') diff --git a/iqk_mul_mat.cpp b/iqk_mul_mat.cpp index 1e195ec2..907b0d19 100644 --- a/iqk_mul_mat.cpp +++ b/iqk_mul_mat.cpp @@ -2788,6 +2788,7 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) { MulMat::set_functions(mm); break; case GGML_TYPE_IQ1_BN: + return false; assert (ne00 % QK_IQ1BN == 0); mm.funcs[0] = mul_mat_iq1bn_q8_K64<1>; mm.funcs[1] = mul_mat_iq1bn_q8_K64<2>; -- cgit v1.2.3