summaryrefslogtreecommitdiff
path: root/ggml-cuda/vecdotq.cuh
diff options
context:
space:
mode:
Diffstat (limited to 'ggml-cuda/vecdotq.cuh')
-rw-r--r--ggml-cuda/vecdotq.cuh13
1 files changed, 3 insertions, 10 deletions
diff --git a/ggml-cuda/vecdotq.cuh b/ggml-cuda/vecdotq.cuh
index acab3865..6ec2035a 100644
--- a/ggml-cuda/vecdotq.cuh
+++ b/ggml-cuda/vecdotq.cuh
@@ -1086,23 +1086,16 @@ static __device__ __forceinline__ float vec_dot_iq1_bn_q8_1(
int sumi = 0;
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const int * q8 = (const int *)bq8_1[iqs].qs;
- //const int minus = 0xffffffff;
for (int l = 0; l < 4; ++l) {
- int sign = extra & (1 << l) ? -1 : 1;
uint16_t val = iq1bn_grid_xxx[bq1->ql[4*iqs + l] | ((bq1->qh[2*iqs + l/2] << (8 - 4*(l%2))) & 0x0f00)];
uint8_t vp = val & 0xff, vm = val >> 8;
int32_t vp1 = __vcmpeq4(((vp & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
int32_t vp2 = __vcmpeq4(((vp >> 4) * 0x01010101) & 0x08040201, 0x08040201);
int32_t vm1 = __vcmpeq4(((vm & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
int32_t vm2 = __vcmpeq4(((vm >> 4) * 0x01010101) & 0x08040201, 0x08040201);
- sumi += (__dp4a(q8[2*l+0], vm1, __dp4a(q8[2*l+1], vm2, 0)) - __dp4a(q8[2*l+0], vp1, __dp4a(q8[2*l+1], vp2, 0)))*sign;
- //int32_t vp1 = __vcmpeq4(((vp & 0xf) * 0x01010101) & 0x08040201, 0x08040201) & q8[2*l+0];
- //int32_t vp2 = __vcmpeq4(((vp >> 4) * 0x01010101) & 0x08040201, 0x08040201) & q8[2*l+1];
- //int32_t vm1 = __vcmpeq4(((vm & 0xf) * 0x01010101) & 0x08040201, 0x08040201) & q8[2*l+0];
- //int32_t vm2 = __vcmpeq4(((vm >> 4) * 0x01010101) & 0x08040201, 0x08040201) & q8[2*l+1];
- //int32_t v1 = __vsubss4(vp1, vm1);
- //int32_t v2 = __vsubss4(vp2, vm2);
- //sumi += __dp4a(v1, 0x01010101, __dp4a(v2, 0x01010101, 0))*sign;
+ int32_t pm = __dp4a(q8[2*l+0], vm1, __dp4a(q8[2*l+1], vm2, 0));
+ int32_t pp = __dp4a(q8[2*l+0], vp1, __dp4a(q8[2*l+1], vp2, 0));
+ sumi += extra & (1 << l) ? pp - pm : pm - pp;
}
#else
const int8_t * q8 = bq8_1[iqs].qs;