summary | refs | log | tree | commit | diff
path: root/ggml/src/iqk/iqk_gemm_ktquants.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'ggml/src/iqk/iqk_gemm_ktquants.cpp')
-rw-r--r-- ggml/src/iqk/iqk_gemm_ktquants.cpp | 14
1 file changed, 7 insertions, 7 deletions
diff --git a/ggml/src/iqk/iqk_gemm_ktquants.cpp b/ggml/src/iqk/iqk_gemm_ktquants.cpp
index 7e895f27..e69e3561 100644
--- a/ggml/src/iqk/iqk_gemm_ktquants.cpp
+++ b/ggml/src/iqk/iqk_gemm_ktquants.cpp
@@ -1615,17 +1615,17 @@ struct Trellis3 {
return result;
}
inline int8x16x2_t next32(const uint16_t * val, uint32_t v0) const {
- auto vka3 = vdupq_n_u32(ka3), vkb3 = vdupq_n_u32(kb3);
+ auto vka3 = vdupq_n_u32(ka3);
int8x16x2_t result = {vdupq_n_s8(-126), vdupq_n_s8(-126)};
int8x16x2_t i8;
for (int i = 0; i < 2; ++i) {
i8.val[0] = vmulq_u32(mka, vdupq_n_u32(val[2*i+0]+v0));
- i8.val[1] = vmlaq_u32(vkb3, vka3, i8.val[0]);
+ i8.val[1] = vmulq_u32(vka3, i8.val[0]);
i8.val[0] = vandq_u32(i8.val[0], vdupq_n_u32(0x3f3f3f3f));
i8.val[1] = vandq_u32(i8.val[1], vdupq_n_u32(0x3f3f3f3f));
auto s1 = vpaddq_s8(vreinterpretq_s8_u32(i8.val[0]), vreinterpretq_s8_u32(i8.val[1]));
i8.val[0] = vmulq_u32(mka, vdupq_n_u32(val[2*i+1]+v0));
- i8.val[1] = vmlaq_u32(vkb3, vka3, i8.val[0]);
+ i8.val[1] = vmulq_u32(vka3, i8.val[0]);
i8.val[0] = vandq_u32(i8.val[0], vdupq_n_u32(0x3f3f3f3f));
i8.val[1] = vandq_u32(i8.val[1], vdupq_n_u32(0x3f3f3f3f));
auto s2 = vpaddq_s8(vreinterpretq_s8_u32(i8.val[0]), vreinterpretq_s8_u32(i8.val[1]));
@@ -1634,11 +1634,11 @@ struct Trellis3 {
return result;
}
inline int8x16x4_t next64(const uint32_t * val) const {
- auto vka3 = vdupq_n_u32(ka3), vkb3 = vdupq_n_u32(kb3);
+ auto vka3 = vdupq_n_u32(ka3);
int8x16x4_t result = {vdupq_n_s8(-126), vdupq_n_s8(-126), vdupq_n_s8(-126), vdupq_n_s8(-126)};
for (int i = 0; i < 2; ++i) {
auto i8_1 = next8(val[4*i+0], val[4*i+1]);
- int8x16x2_t i8_2{vmlaq_u32(vkb3, vka3, i8_1.val[0]), vmlaq_u32(vkb3, vka3, i8_1.val[1])};
+ int8x16x2_t i8_2{vmulq_u32(vka3, i8_1.val[0]), vmulq_u32(vka3, i8_1.val[1])};
i8_1.val[0] = vandq_u32(i8_1.val[0], vdupq_n_u32(0x3f3f3f3f));
i8_1.val[1] = vandq_u32(i8_1.val[1], vdupq_n_u32(0x3f3f3f3f));
i8_2.val[0] = vandq_u32(i8_2.val[0], vdupq_n_u32(0x3f3f3f3f));
@@ -1646,8 +1646,8 @@ struct Trellis3 {
auto s1_1 = vpaddq_s8(vreinterpretq_s8_u32(i8_1.val[0]), vreinterpretq_s8_u32(i8_1.val[1]));
auto s1_2 = vpaddq_s8(vreinterpretq_s8_u32(i8_2.val[0]), vreinterpretq_s8_u32(i8_2.val[1]));
i8_1 = next8(val[4*i+2], val[4*i+3]);
- i8_2.val[0] = vmlaq_u32(vkb3, vka3, i8_1.val[0]);
- i8_2.val[1] = vmlaq_u32(vkb3, vka3, i8_1.val[1]);
+ i8_2.val[0] = vmulq_u32(vka3, i8_1.val[0]);
+ i8_2.val[1] = vmulq_u32(vka3, i8_1.val[1]);
i8_1.val[0] = vandq_u32(i8_1.val[0], vdupq_n_u32(0x3f3f3f3f));
i8_1.val[1] = vandq_u32(i8_1.val[1], vdupq_n_u32(0x3f3f3f3f));
i8_2.val[0] = vandq_u32(i8_2.val[0], vdupq_n_u32(0x3f3f3f3f));