iqk_mul_mat: improve iq1_bn (bitnet) on vanilla AVX2

I now get PP-512 = 270 t/s on the Ryzen-5975WX
author: Iwan Kawrakow <iwan.kawrakow@gmail.com> 2024-06-17 08:24:51 +0300
committer: Iwan Kawrakow <iwan.kawrakow@gmail.com> 2024-06-22 12:02:51 +0300
commit: 29d9bf65f326e3215a87a24d85656606c6265702 (patch)
tree: 7ff630a145c7aa06da2a6597645fba010d64dfad
parent: 91ec824f2de3a073551ab8c5c19672d44f59b676 (diff)
1 file changed, 1 insertion, 3 deletions
diff --git a/iqk_mul_mat.cpp b/iqk_mul_mat.cpp
index c204b22c..f38163d5 100644
--- a/iqk_mul_mat.cpp
+++ b/iqk_mul_mat.cpp
@@ -1383,9 +1383,7 @@ static void mul_mat_iq1bn_q8_K64(int n, const void * vx, size_t bx, const DataIn
 #if defined __AVX512VNNI__ && defined __AVX512VL__
                 auto dot = _mm256_dpbusd_epi32(_mm256_dpbusd_epi32(_mm256_setzero_si256(), m1_8, dot1), m1_8, dot2);
 #else
-                dot1 = _mm256_madd_epi16(m1_16, _mm256_maddubs_epi16(m1_8, dot1));
-                dot2 = _mm256_madd_epi16(m1_16, _mm256_maddubs_epi16(m1_8, dot2));
-                auto dot = _mm256_add_epi32(_mm256_add_epi32(dot1, dot2));
+                auto dot = _mm256_madd_epi16(m1_16, _mm256_add_epi16(_mm256_maddubs_epi16(m1_8, dot1), _mm256_maddubs_epi16(m1_8, dot2)));
 #endif
                 accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(q8.scale(iy, i)), _mm256_cvtepi32_ps(dot), accd[iy]);
             }
author	Iwan Kawrakow <iwan.kawrakow@gmail.com>	2024-06-17 08:24:51 +0300
committer	Iwan Kawrakow <iwan.kawrakow@gmail.com>	2024-06-22 12:02:51 +0300
commit	29d9bf65f326e3215a87a24d85656606c6265702 (patch)
tree	7ff630a145c7aa06da2a6597645fba010d64dfad
parent	91ec824f2de3a073551ab8c5c19672d44f59b676 (diff)