summaryrefslogtreecommitdiff
path: root/ggml-sycl.cpp
diff options
context:
space:
mode:
authorGeorgi Gerganov <ggerganov@gmail.com>2024-05-23 10:00:21 +0300
committerGitHub <noreply@github.com>2024-05-23 10:00:21 +0300
commite84b71c2c6da6e69c8f815168ea836f9716a325e (patch)
treeaa5c046cc5f0e9e953de5329412a753d82e60589 /ggml-sycl.cpp
parent1b1e27cb49158123ef4902aa41eb368c9e76e6a1 (diff)
ggml : drop support for QK_K=64 (#7473)
* ggml : drop support for QK_K=64 ggml-ci * opencl : restore QK_K=256 define
Diffstat (limited to 'ggml-sycl.cpp')
-rw-r--r--ggml-sycl.cpp472
1 files changed, 2 insertions, 470 deletions
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index f486b6c0..496ec61c 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -4197,7 +4197,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
const block_q2_K * x = (const block_q2_K *) vx;
const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
const int n = tid/32;
const int l = tid - 32*n;
const int is = 8*n + l/16;
@@ -4211,18 +4210,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
-#else
- const int is = tid/16; // 0 or 1
- const int il = tid%16; // 0...15
- const uint8_t q = x[i].qs[il] >> (2*is);
- dst_t * y = yy + i*QK_K + 16*is + il;
-
- float dall = x[i].dm[0];
- float dmin = x[i].dm[1];
- y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
- y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
-#endif
-
}
template<typename dst_t>
@@ -4232,7 +4219,6 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
const int i = item_ct1.get_group(2);
const block_q3_K * x = (const block_q3_K *) vx;
-#if QK_K == 256
const int r = item_ct1.get_local_id(2) / 4;
const int tid = r/2;
const int is0 = r%2;
@@ -4256,31 +4242,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
const uint8_t * hm = x[i].hmask;
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
-#else
- const int tid = item_ct1.get_local_id(2);
- const int is = tid/16; // 0 or 1
- const int il = tid%16; // 0...15
- const int im = il/8; // 0...1
- const int in = il%8; // 0...7
-
- dst_t * y = yy + i*QK_K + 16*is + il;
-
- const uint8_t q = x[i].qs[il] >> (2*is);
- const uint8_t h = x[i].hmask[in] >> (2*is + im);
- const float d = (float)x[i].d;
-
- if (is == 0) {
- y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
- y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
- } else {
- y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
- y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
- }
-#endif
-
}
-#if QK_K == 256
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
if (j < 4) {
d = q[j] & 63; m = q[j + 4] & 63;
@@ -4289,7 +4252,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
}
}
-#endif
template<typename dst_t>
static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
@@ -4298,7 +4260,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
const int i = item_ct1.get_group(2);
-#if QK_K == 256
// assume 32 threads
const int tid = item_ct1.get_local_id(2);
const int il = tid/8;
@@ -4322,15 +4283,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
y[l + 0] = d1 * (q[l] & 0xF) - m1;
y[l +32] = d2 * (q[l] >> 4) - m2;
}
-#else
- const int tid = item_ct1.get_local_id(2);
- const uint8_t * q = x[i].qs;
- dst_t * y = yy + i*QK_K;
- const float d = (float)x[i].dm[0];
- const float m = (float)x[i].dm[1];
- y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
- y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
-#endif
}
template<typename dst_t>
@@ -4340,7 +4292,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
const int i = item_ct1.get_group(2);
-#if QK_K == 256
// assume 64 threads - this is very slightly better than the one below
const int tid = item_ct1.get_local_id(2);
const int il = tid/16; // il is in 0...3
@@ -4367,18 +4318,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
hm <<= 1;
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
-#else
- const int tid = item_ct1.get_local_id(2);
- const uint8_t q = x[i].qs[tid];
- const int im = tid/8; // 0...3
- const int in = tid%8; // 0...7
- const int is = tid/16; // 0 or 1
- const uint8_t h = x[i].qh[in] >> im;
- const float d = x[i].d;
- dst_t * y = yy + i*QK_K + tid;
- y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
- y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
-#endif
}
template<typename dst_t>
@@ -4387,7 +4326,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
const block_q6_K * x = (const block_q6_K *) vx;
const int i = item_ct1.get_group(2);
-#if QK_K == 256
// assume 64 threads - this is very slightly better than the one below
const int tid = item_ct1.get_local_id(2);
@@ -4407,24 +4345,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
-#else
-
- // assume 32 threads
- const int tid = item_ct1.get_local_id(2);
- const int ip = tid/16; // 0 or 1
- const int il = tid - 16*ip; // 0...15
-
- dst_t * y = yy + i*QK_K + 16*ip + il;
-
- const float d = x[i].d;
-
- const uint8_t ql = x[i].ql[16*ip + il];
- const uint8_t qh = x[i].qh[il] >> (2*ip);
- const int8_t * sc = x[i].scales;
-
- y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
- y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-#endif
}
template<typename dst_t>
@@ -4438,7 +4358,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4449,10 +4368,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
-#else
- assert(false);
-#endif
-
}
template<typename dst_t>
@@ -4466,7 +4381,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
const block_iq2_xs * x = (const block_iq2_xs *) vx;
const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4475,10 +4389,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
- assert(false);
-#endif
-
}
template <typename dst_t>
@@ -4490,7 +4400,6 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
const block_iq2_s * x = (const block_iq2_s *) vx;
const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4498,13 +4407,9 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
#pragma unroll
- for (int j = 0; j < 8; ++j)
+ for (int j = 0; j < 8; ++j) {
y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
- assert(false);
-
-#endif
-
+ }
}
template<typename dst_t>
@@ -4518,7 +4423,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4533,10 +4437,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
}
-#else
- assert(false);
-#endif
-
}
template <typename dst_t>
@@ -4549,7 +4449,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
const block_iq3_s * x = (const block_iq3_s *) vx;
const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4563,10 +4462,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
}
-#else
- assert(false);
-#endif
-
}
template <typename dst_t>
@@ -4579,7 +4474,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
const block_iq1_s * x = (const block_iq1_s *) vx;
const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4593,10 +4487,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
for (int j = 0; j < 8; ++j) {
y[j] = d * (q[j] + delta);
}
-#else
- assert(false);
-#endif
-
}
template <typename dst_t>
@@ -4609,7 +4499,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
const block_iq1_m * x = (const block_iq1_m *) vx;
const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4627,10 +4516,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
for (int j = 0; j < 8; ++j) {
y[j] = d * (q[j] + delta);
}
-#else
- assert(false);
-#endif
-
}
template <typename dst_t>
@@ -4704,7 +4589,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
float tmp = 0; // partial sum for thread in warp
-#if QK_K == 256
const int tid =
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
const int ix =
@@ -4755,42 +4639,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
tmp += dall * sum1 - dmin * sum2;
}
-#else
- const int tid = item_ct1.get_local_id(2) /
- (2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7
- const int ix = item_ct1.get_local_id(2) %
- (2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3
- const int offset = tid * K_QUANTS_PER_ITERATION;
-
- uint32_t uaux[2];
- const uint8_t * d = (const uint8_t *)uaux;
-
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
- const float * y = yy + i * QK_K + offset;
- const uint8_t * q = x[i].qs + offset;
- const uint32_t * s = (const uint32_t *)x[i].scales;
-
- uaux[0] = s[0] & 0x0f0f0f0f;
- uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
- const sycl::float2 dall =
- x[i].dm.convert<float, sycl::rounding_mode::automatic>();
-
- float sum1 = 0, sum2 = 0;
- for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
- const uint8_t ql = q[l];
- sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
- + y[l+16] * d[1] * ((ql >> 2) & 3)
- + y[l+32] * d[2] * ((ql >> 4) & 3)
- + y[l+48] * d[3] * ((ql >> 6) & 3);
- sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
- }
- tmp += dall.x() * sum1 - dall.y() * sum2;
- }
-
-#endif
// sum up partial sums and write back result
#pragma unroll
@@ -4828,8 +4676,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
float tmp = 0; // partial sum for thread in warp
-#if QK_K == 256
-
const uint16_t kmask1 = 0x0303;
const uint16_t kmask2 = 0x0f0f;
@@ -4882,34 +4728,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
tmp += d * sum;
}
-#else
-
- const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
- const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
- const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
- const int in = offset/8; // 0 or 1
- const int im = offset%8; // 0...7
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
- const float * y = yy + i * QK_K + offset;
- const uint8_t * q = x[i].qs + offset;
- const uint8_t * s = x[i].scales;
-
- const float dall = (float)x[i].d;
-
- float sum = 0;
- for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
- const uint8_t hl = x[i].hmask[im+l] >> in;
- const uint8_t ql = q[l];
- sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
- + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
- + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
- + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
- }
- tmp += sum;
- }
-#endif
// sum up partial sums and write back result
#pragma unroll
@@ -4944,7 +4762,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
const block_q4_K * x = (const block_q4_K *)vx + ib0;
-#if QK_K == 256
const uint16_t kmask1 = 0x3f3f;
const uint16_t kmask2 = 0x0f0f;
const uint16_t kmask3 = 0xc0c0;
@@ -5033,36 +4850,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
#endif
}
-#else
- const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
- const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
-
- const int step = tid * K_QUANTS_PER_ITERATION;
-
- uint16_t aux16[2];
- const uint8_t * s = (const uint8_t *)aux16;
-
- float tmp = 0;
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
- const uint8_t * q = x[i].qs + step;
- const float * y = yy + i*QK_K + step;
- const uint16_t * a = (const uint16_t *)x[i].scales;
- aux16[0] = a[0] & 0x0f0f;
- aux16[1] = (a[0] >> 4) & 0x0f0f;
- const float d = (float)x[i].dm[0];
- const float m = (float)x[i].dm[1];
- float sum = 0.f;
- for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
- sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
- + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
- + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
- + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
- }
- tmp += sum;
- }
-
-#endif
// sum up partial sums and write back result
#pragma unroll
@@ -5097,7 +4884,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
float tmp = 0; // partial sum for thread in warp
-#if QK_K == 256
const uint16_t kmask1 = 0x3f3f;
const uint16_t kmask2 = 0x0f0f;
const uint16_t kmask3 = 0xc0c0;
@@ -5174,30 +4960,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
dmin * smin;
}
-#else
- const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
- const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
- const int step = tid * K_QUANTS_PER_ITERATION;
- const int im = step/8;
- const int in = step%8;
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
- const uint8_t * q = x[i].qs + step;
- const int8_t * s = x[i].scales;
- const float * y = yy + i*QK_K + step;
- const float d = x[i].d;
- float sum = 0.f;
- for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
- const uint8_t h = x[i].qh[in+j] >> im;
- sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
- + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
- + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
- + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
- }
- tmp += sum;
- }
-#endif
-
// sum up partial sums and write back result
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
@@ -5224,8 +4986,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
const block_q6_K * x = (const block_q6_K *)vx + ib0;
-#if QK_K == 256
-
const int tid =
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
const int ix =
@@ -5282,37 +5042,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
}
-#else
-
- const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7
- const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3
-
- const int step = tid * K_QUANTS_PER_ITERATION;
-
- float tmp = 0; // partial sum for thread in warp
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
- const float * y = yy + i * QK_K + step;
- const uint8_t * ql = x[i].ql + step;
- const uint8_t * qh = x[i].qh + step;
- const int8_t * s = x[i].scales;
-
- const float d = x[i+0].d;
-
- float sum = 0;
- for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
- sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
- + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
- + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
- + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
- }
- tmp += sum;
-
- }
-
-#endif
-
// sum up partial sums and write back result
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
@@ -6857,7 +6586,6 @@ static __dpct_inline__ float
vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-#ifndef GGML_QKK_64
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
int v[2];
@@ -6899,52 +6627,6 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
}
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-
-#else
-
-#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
- const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
- float sumf_d = 0.0f;
- float sumf_m = 0.0f;
-
- uint16_t aux16[2];
- const uint8_t * s = (const uint8_t *)aux16;
-
- const uint16_t * a = (const uint16_t *)bq4_K->scales;
- aux16[0] = a[0] & 0x0f0f;
- aux16[1] = (a[0] >> 4) & 0x0f0f;
-
- const float dall = bq4_K->dm[0];
- const float dmin = bq4_K->dm[1];
-
- const float d8_1 = bq8_1[0].ds[0];
- const float d8_2 = bq8_1[1].ds[1];
-
- const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
- const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
- const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
- const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
- const int * q4 = (const int *)bq4_K->qs + (iqs/2);
- const int v1 = q4[0];
- const int v2 = q4[4];
-
- const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
- const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
- const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
- const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));
-
- sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
- sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
-
- return dall * sumf_d - dmin * sumf_m;
-
-#else
- bad_arch();
-#endif // __SYCL_ARCH__ >= VER_4VEC
-
-#endif
}
template <int mmq_y>
@@ -7003,11 +6685,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
-#if QK_K == 256
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
-#else
- x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
-#endif
}
#pragma unroll
@@ -7050,7 +6728,6 @@ static __dpct_inline__ float
vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-#ifndef GGML_QKK_64
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
int vl[2];
@@ -7092,48 +6769,6 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
}
return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-
-#else
-
-#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
- const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
- const int8_t * s = bq5_K->scales;
-
- const float d = bq5_K->d;
-
- const float d8_1 = bq8_1[0].ds[0];
- const float d8_2 = bq8_1[1].ds[1];
-
- const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
- const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
- const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
- const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
- const int * ql = (const int *)bq5_K->qs + (iqs/2);
- const int vl1 = ql[0];
- const int vl2 = ql[4];
-
- const int step = 4 * (iqs/2); // 0, 4, 8, 12
- const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
- const int in = step%8; // 0, 4, 0, 4
- const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
-
- const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
- const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
- const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
- const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
-
- const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
- + d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);
-
- return d * sumf_d;
-
-#else
- bad_arch();
-#endif // __SYCL_ARCH__ >= VER_4VEC
-
-#endif
}
template <int mmq_y>
@@ -7205,9 +6840,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
-#if QK_K == 256
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
-#endif
}
#pragma unroll
@@ -7387,7 +7020,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
const uint8_t *kmask_iq2xs) {
-#if QK_K == 256
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
#if QR2_XXS == 8
@@ -7428,10 +7060,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
}
return d * (sumi1 + sumi2);
#endif
-#else
- assert(false);
- return 0.f;
-#endif
}
static __dpct_inline__ float
@@ -7440,7 +7068,6 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
#if DPCT_COMPATIBILITY_TEMP >= \
MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
const int ib32 = iqs;
@@ -7478,16 +7105,11 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
assert(false);
return 0.f;
#endif
-#else
- assert(false);
- return 0.f;
-#endif
}
static __dpct_inline__ float
vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-#if QK_K == 256
const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
const int ib32 = iqs;
@@ -7531,9 +7153,6 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
}
const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
-#else
- assert(false);
-#endif
}
static __dpct_inline__ float
@@ -7542,7 +7161,6 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
#if DPCT_COMPATIBILITY_TEMP >= \
MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
const int ib32 = iqs;
@@ -7570,17 +7188,12 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
assert(false);
return 0.f;
#endif
-#else
- assert(false);
- return 0.f;
-#endif
}
static __dpct_inline__ float
vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
const uint32_t *iq3s_grid) {
-#if QK_K == 256
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
const int ib32 = iqs;
@@ -7609,16 +7222,12 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
(1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
bq8_1[ib32].ds[0];
return d * sumi;
-#else
- assert(false);
-#endif
}
static __dpct_inline__ float
vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
const uint32_t *iq1s_grid_gpu) {
-#if QK_K == 256
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
const int ib32 = iqs;
@@ -7637,15 +7246,11 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
const float d = d1q * bq8_1[ib32].ds[0];
const float m = d1q * bq8_1[ib32].ds[1];
return d * sumi + m * delta;
-#else
- assert(false);
-#endif
}
static __dpct_inline__ float
vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-#if QK_K == 256
const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
const int ib32 = iqs;
@@ -7670,9 +7275,6 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
-#else
- assert(false);
-#endif
}
static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
@@ -7720,7 +7322,6 @@ static __dpct_inline__ float
vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-#if QK_K == 256
const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
@@ -7738,9 +7339,6 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
}
return d * (sumi1 + sumi2);
-#else
- assert(false);
-#endif
}
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
@@ -10203,7 +9801,6 @@ template <typename dst_t>
static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
dpct::queue_ptr stream) {
const int nb = k / QK_K;
-#if QK_K == 256
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
@@ -10215,27 +9812,12 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
dequantize_block_q2_K(vx, y, item_ct1);
});
}
-#else
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
-
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_q2_K(vx, y, item_ct1);
- });
- }
-
-#endif
}
template <typename dst_t>
static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
dpct::queue_ptr stream) {
const int nb = k / QK_K;
-#if QK_K == 256
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
@@ -10247,19 +9829,6 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
dequantize_block_q3_K(vx, y, item_ct1);
});
}
-#else
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
-
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_q3_K(vx, y, item_ct1);
- });
- }
-#endif
}
template <typename dst_t>
@@ -10320,7 +9889,6 @@ template <typename dst_t>
static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
dpct::queue_ptr stream) {
const int nb = k / QK_K;
-#if QK_K == 256
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
@@ -10332,27 +9900,12 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
dequantize_block_q5_K(vx, y, item_ct1);
});
}
-#else
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
-
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_q5_K(vx, y, item_ct1);
- });
- }
-
-#endif
}
template <typename dst_t>
static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
dpct::queue_ptr stream) {
const int nb = k / QK_K;
-#if QK_K == 256
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
@@ -10364,20 +9917,6 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
dequantize_block_q6_K(vx, y, item_ct1);
});
}
-#else
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
-
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_q6_K(vx, y, item_ct1);
- });
- }
-
-#endif
}
template <typename dst_t>
@@ -10529,9 +10068,6 @@ template <typename dst_t>
static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
dpct::queue_ptr stream) {
const int nb = (k + QK_K - 1) / QK_K;
-#if QK_K == 64
- dequantize_row_iq4_nl_sycl(vx, y, k, stream);
-#else
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
@@ -10546,7 +10082,6 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
});
});
}
-#endif
}
@@ -12051,8 +11586,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
const int nrows_y, const int nrows_dst,
dpct::queue_ptr stream) try {
-#if QK_K == 256
-
int id;
SYCL_CHECK(
CHECK_TRY_ERROR(id = get_current_device_id()));
@@ -12167,7 +11700,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
});
}
}
-#endif
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__