-rw-r--r--   ggml/src/ggml-cuda/iqk_mmvq.cu   36
1 file changed, 21 insertions(+), 15 deletions(-)
diff --git a/ggml/src/ggml-cuda/iqk_mmvq.cu b/ggml/src/ggml-cuda/iqk_mmvq.cu
index 795243e7..36dbb52a 100644
--- a/ggml/src/ggml-cuda/iqk_mmvq.cu
+++ b/ggml/src/ggml-cuda/iqk_mmvq.cu
@@ -633,31 +633,37 @@ __device__ __forceinline__ float vec_dot_iq1_bn_q8_1(
     float scale = d16;
     const block_iq1_bn * bq1 = (const block_iq1_bn *)((const char *)vbq + sizeof(d16)) + kbx;
 
-    static const uint8_t k_mult[5] = {81, 27, 9, 3, 1};
-
     // iqs is 0 or 1
 
     int sumi = 0;
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    uint16_t mult[2];
+    mult[1] = iqs == 0 ? 27 : 3;
+    mult[0] = mult[1] + (mult[1] << 1);
     const int * q8 = (const int *)bq8_1[iqs].qs;
     int val[4];
     for (int l = 0; l < 2; ++l) {
         int8_t * a = (int8_t *)val;
         const int i16 = 2*iqs + l;
         for (int k = 0; k < 3; ++k) {
-            uint8_t q = bq1->ql[3*i16+k];
-            for (int j = 0; j < 5; ++j) {
-                uint8_t v = k_mult[j]*q;
-                int8_t vs = 3*v >> 8; //(v + (v >> 1)) >> 7;
-                *a++ = vs-1;
+            uint16_t q = bq1->ql[3*i16+k];
+            for (int j = 4; j >= 0; --j) {
+                uint16_t v = q & 0xff;
+                v += v << 1;
+                a[j] = v >> 8;
+                q += q << 1;
             }
+            a += 5;
         }
-        uint8_t v = k_mult[i16]*bq1->extra;
-        int8_t vs = 3*v >> 8; //(v + (v >> 1)) >> 7;
-        *a++ = vs-1;
+        uint16_t v = (mult[l]*bq1->extra) & 0xff;
+        v += v << 1;
+        *a = v >> 8;
         sumi = __dp4a(val[0], q8[4*l+0], __dp4a(val[1], q8[4*l+1], __dp4a(val[2], q8[4*l+2], __dp4a(val[3], q8[4*l+3], sumi))));
     }
+    float2 d8 = __half22float2(bq8_1[iqs].ds);
+    return scale * (d8.x * sumi - d8.y);
 #else
+    static const uint16_t k_mult[5] = {81, 27, 9, 3, 1};
     const int8_t * q8 = bq8_1[iqs].qs;
     for (int l = 0; l < 2; ++l) {
         const int i16 = 2*iqs + l;
@@ -675,8 +681,8 @@ __device__ __forceinline__ float vec_dot_iq1_bn_q8_1(
         sumi += q8[0]*(vs - 1); q8++;
     }
 
-#endif
     return scale * __low2float(bq8_1[iqs].ds) * sumi;
+#endif
 }
 
 __device__ __forceinline__ float vec_dot_iq2_bn_q8_1(
@@ -688,13 +694,13 @@ __device__ __forceinline__ float vec_dot_iq2_bn_q8_1(
 
     // iqs is 0 or 1
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    auto qs = (const uint16_t *)bq2->qs + 4*iqs;
+    auto qs = (const int *)bq2->qs + 2*iqs;
     auto q8l = (const int *)bq8_1[0].qs + 2*iqs;
     auto q8h = (const int *)bq8_1[1].qs + 2*iqs;
     int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
     for (int j = 0; j < 2; ++j) {
-        int vl = qs[2*j+0] | (uint32_t(qs[2*j+1]) << 16);
-        int vh = vl >> 4;
+        int vl = qs[j];
+        int vh = qs[j] >> 4;
         sumi1 = __dp4a(vl & 0x03030303, q8l[j+0], sumi1);
         sumi2 = __dp4a(vl & 0x0c0c0c0c, q8l[j+4], sumi2);
         sumi3 = __dp4a(vh & 0x03030303, q8h[j+0], sumi3);
@@ -702,6 +708,7 @@ __device__ __forceinline__ float vec_dot_iq2_bn_q8_1(
     }
     auto d8l = __half22float2(bq8_1[0].ds);
     auto d8h = __half22float2(bq8_1[1].ds);
+    return scale * (d8l.x * (sumi1 + 0.25f*sumi2) + d8h.x * (sumi3 + 0.25f * sumi4) - 0.5f*d8l.y - 0.5f*d8h.y);
 #else
     int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
     auto q8l = bq8_1[0].qs + 8*iqs;
@@ -717,7 +724,6 @@ __device__ __forceinline__ float vec_dot_iq2_bn_q8_1(
    auto d8h = __half22float2(bq8_1[1].ds);
     return scale * (d8l.x * (sumi1 + 0.25f*sumi2) + 0.0625f * d8h.x*(sumi3 + 0.25f*sumi4) - 0.5f*d8l.y - 0.5f*d8h.y);
 #endif
-    return scale * (d8l.x * (sumi1 + 0.25f*sumi2) + d8h.x * (sumi3 + 0.25f * sumi4) - 0.5f*d8l.y - 0.5f*d8h.y);
 }
 
 } // namespace
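
For reference: the core change in vec_dot_iq1_bn_q8_1 replaces the per-digit multiplier table k_mult[5] = {81, 27, 9, 3, 1} with an incremental decode that multiplies the running value by 3 at each step (v += v << 1, q += q << 1) and keeps the decoded ternary digits in {0, 1, 2}; the -1 offset of the ternary weights is then folded into the subtraction of d8.y, the precomputed sum stored in the q8_1 block. The snippet below is a standalone host-side sketch, not part of the patch, that checks the two decode schemes against each other for every possible packed byte; the file name and layout are illustrative only.

// check_iq1_bn_decode.cpp -- standalone sketch (hypothetical file, not in the repo).
// Verifies that the old table-based decode and the new incremental decode of an
// iq1_bn packed byte produce the same five ternary digits for all 256 byte values.
#include <cstdint>
#include <cstdio>

int main() {
    static const uint8_t k_mult[5] = {81, 27, 9, 3, 1};
    int mismatches = 0;
    for (int q0 = 0; q0 < 256; ++q0) {
        int8_t old_digits[5], new_digits[5];
        // Old decode: one multiplier per digit, result shifted to {-1, 0, +1}.
        for (int j = 0; j < 5; ++j) {
            uint8_t v = k_mult[j]*(uint8_t)q0;
            int8_t vs = 3*v >> 8;
            old_digits[j] = vs - 1;
        }
        // New decode: multiply the running value by 3 each step instead of using
        // a multiplier table; digits stay in {0, 1, 2}, and the -1 offset is
        // applied later via the q8_1 block sum (the d8.y term in the kernel).
        uint16_t q = (uint16_t)q0;
        for (int j = 4; j >= 0; --j) {
            uint16_t v = q & 0xff;
            v += v << 1;                       // v *= 3
            new_digits[j] = (int8_t)(v >> 8);  // digit in {0, 1, 2}
            q += q << 1;                       // q *= 3
        }
        for (int j = 0; j < 5; ++j) {
            if (old_digits[j] != new_digits[j] - 1) ++mismatches;
        }
    }
    printf("mismatches: %d\n", mismatches);    // expected: 0
    return mismatches != 0;
}

Compiled as plain C++ (e.g. g++ -O2 check_iq1_bn_decode.cpp), this should report zero mismatches, which is what lets the new kernel drop the table and the per-element subtraction.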