summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ggml/src/ggml-common.h10
-rw-r--r--ggml/src/ggml-cuda/iqk_mmvq.cu31
2 files changed, 39 insertions, 2 deletions
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index 2fbac06a..b58e3e4c 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -1956,6 +1956,16 @@ GGML_TABLE_BEGIN(int8_t, iq5nl_values, 64)
-124, -112, -101, -90, -81, -72, -63, -55, -48, -41, -34, -28, -22, -16, -10, -4, 1, 7, 13, 19, 25, 31, 38, 45, 53, 61, 70, 79, 89, 99, 111, 123,
GGML_TABLE_END()
+GGML_TABLE_BEGIN(int8_t, iq6nl_values, 128)
+ -127, -121, -115, -109, -104, -98, -93, -88, -84, -79, -74, -70, -66, -62, -58, -54,
+ -51, -47, -44, -40, -37, -34, -31, -28, -25, -22, -19, -16, -13, -11, -8, -5,
+ -2, 0, 3, 6, 9, 12, 14, 17, 20, 23, 27, 30, 33, 36, 40, 44,
+ 47, 51, 55, 59, 63, 68, 72, 77, 82, 87, 92, 98, 103, 109, 115, 121,
+ -126, -120, -114, -108, -103, -97, -92, -87, -83, -78, -73, -69, -65, -61, -57, -53,
+ -50, -46, -43, -39, -36, -33, -30, -27, -24, -21, -18, -15, -12, -10, -7, -4,
+ -1, 1, 4, 7, 10, 13, 15, 18, 21, 24, 28, 31, 34, 37, 41, 45,
+ 48, 52, 56, 60, 64, 69, 73, 78, 83, 88, 93, 99, 104, 110, 116, 122,
+GGML_TABLE_END()
#endif // GGML_COMMON_IMPL
#endif // GGML_COMMON_IMPL
diff --git a/ggml/src/ggml-cuda/iqk_mmvq.cu b/ggml/src/ggml-cuda/iqk_mmvq.cu
index ae5e6a3c..29721cdd 100644
--- a/ggml/src/ggml-cuda/iqk_mmvq.cu
+++ b/ggml/src/ggml-cuda/iqk_mmvq.cu
@@ -254,11 +254,38 @@ __device__ __forceinline__ float vec_dot_iq5_k_q8_1(
#define VDR_IQ6_K_Q8_1_MMVQ 4
#define VDR_IQ6_K_Q8_1_MMQ 4
-// TODO
__device__ __forceinline__ float vec_dot_iq6_k_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
- return 0;
+
+ const block_iq6_k * bq6 = (const block_iq6_k *) vbq + kbx;
+ const uint8_t * all_values = (const uint8_t *)iq6nl_values;
+
+ int i4 = iqs/4; // 0...7. Blocks of 16 index is 4*(i4/2) + (i4%2) + (0 and 2)
+ // Blocks of 32 index is 2*(i4/2) + 0 or 1
+
+ const int32_t * q8_1 = (const int *)bq8_1[2*(i4/2)+0].qs + 4*(i4%2);
+ const int32_t * q8_2 = (const int *)bq8_1[2*(i4/2)+1].qs + 4*(i4%2);
+ const uint32_t * q4 = (const uint32_t *)bq6->qs + 8*(i4/2) + 4*(i4%2);
+ const uint32_t * qh = (const uint32_t *)bq6->qh + 8*(i4/4) + 4*(i4%2);
+ const uint16_t extra = bq6->extra >> (4*(i4/2) + (i4%2));
+ const uint8_t * values1 = all_values + 64*(extra & 1);
+ const uint8_t * values2 = all_values + 16*(extra & 4);
+ uint32_t aux32[2];
+ const uint8_t * a8 = (const uint8_t *)aux32;
+ int v1, v2;
+ int sumi1 = 0, sumi2 = 0;
+ for (int j = 0; j < 4; ++j) {
+ uint32_t h = qh[j] >> 4*((i4/2)%2);
+ aux32[0] = ((q4[j] >> 0) & 0x0f0f0f0f) | ((h << 4) & 0x30303030);
+ aux32[1] = ((q4[j] >> 4) & 0x0f0f0f0f) | ((h << 2) & 0x30303030);
+ v1 = int_from_table(a8+0, values1);
+ v2 = int_from_table(a8+4, values2);
+ sumi1 = ggml_cuda_dp4a(v1, q8_1[j], sumi1);
+ sumi2 = ggml_cuda_dp4a(v2, q8_2[j], sumi2);
+ }
+ const float d6 = __half2float(bq6->d);
+ return d6 * (__low2float(bq8_1[2*(i4/2)+0].ds) * sumi1 * bq6->scales[4*(i4/2)+(i4%2)] + __low2float(bq8_1[2*(i4/2)+1].ds) * sumi2 * bq6->scales[4*(i4/2)+(i4%2)+2]);
}
static const __device__ uint32_t iq2k_table[512] = {