diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2024-05-23 10:00:21 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-23 10:00:21 +0300 |
commit | e84b71c2c6da6e69c8f815168ea836f9716a325e (patch) | |
tree | aa5c046cc5f0e9e953de5329412a753d82e60589 /ggml-sycl.cpp | |
parent | 1b1e27cb49158123ef4902aa41eb368c9e76e6a1 (diff) |
ggml : drop support for QK_K=64 (#7473)
* ggml : drop support for QK_K=64
ggml-ci
* opencl : restore QK_K=256 define
Diffstat (limited to 'ggml-sycl.cpp')
-rw-r--r-- | ggml-sycl.cpp | 472 |
1 files changed, 2 insertions, 470 deletions
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index f486b6c0..496ec61c 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -4197,7 +4197,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri const block_q2_K * x = (const block_q2_K *) vx; const int tid = item_ct1.get_local_id(2); -#if QK_K == 256 const int n = tid/32; const int l = tid - 32*n; const int is = 8*n + l/16; @@ -4211,18 +4210,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); -#else - const int is = tid/16; // 0 or 1 - const int il = tid%16; // 0...15 - const uint8_t q = x[i].qs[il] >> (2*is); - dst_t * y = yy + i*QK_K + 16*is + il; - - float dall = x[i].dm[0]; - float dmin = x[i].dm[1]; - y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); - y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); -#endif - } template<typename dst_t> @@ -4232,7 +4219,6 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri const int i = item_ct1.get_group(2); const block_q3_K * x = (const block_q3_K *) vx; -#if QK_K == 256 const int r = item_ct1.get_local_id(2) / 4; const int tid = r/2; const int is0 = r%2; @@ -4256,31 +4242,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri const uint8_t * hm = x[i].hmask; for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); -#else - const int tid = item_ct1.get_local_id(2); - const int is = tid/16; // 0 or 1 - const int il = tid%16; // 0...15 - const int im = il/8; // 0...1 - const int in = il%8; // 0...7 - - dst_t * y = yy + i*QK_K + 16*is + il; - - const uint8_t q = x[i].qs[il] >> (2*is); - const uint8_t h = x[i].hmask[in] >> (2*is + im); - const float d = (float)x[i].d; - - if (is == 0) { - y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); - y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); - } else { - y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); - y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); - } -#endif - } -#if QK_K == 256 static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { if (j < 4) { d = q[j] & 63; m = q[j + 4] & 63; @@ -4289,7 +4252,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8 m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); } } -#endif template<typename dst_t> static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy, @@ -4298,7 +4260,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri const int i = item_ct1.get_group(2); -#if QK_K == 256 // assume 32 threads const int tid = item_ct1.get_local_id(2); const int il = tid/8; @@ -4322,15 +4283,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri y[l + 0] = d1 * (q[l] & 0xF) - m1; y[l +32] = d2 * (q[l] >> 4) - m2; } -#else - const int tid = item_ct1.get_local_id(2); - const uint8_t * q = x[i].qs; - dst_t * y = yy + i*QK_K; - const float d = (float)x[i].dm[0]; - const float m = (float)x[i].dm[1]; - y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); - y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4); -#endif } template<typename dst_t> @@ -4340,7 +4292,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri const int i = item_ct1.get_group(2); -#if QK_K == 256 // assume 64 threads - this is very slightly better than the one below const int tid = item_ct1.get_local_id(2); const int il = tid/16; // il is in 0...3 @@ -4367,18 +4318,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri hm <<= 1; y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2; y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; -#else - const int tid = item_ct1.get_local_id(2); - const uint8_t q = x[i].qs[tid]; - const int im = tid/8; // 0...3 - const int in = tid%8; // 0...7 - const int is = tid/16; // 0 or 1 - const uint8_t h = x[i].qh[in] >> im; - const float d = x[i].d; - dst_t * y = yy + i*QK_K + tid; - y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16)); - y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16)); -#endif } template<typename dst_t> @@ -4387,7 +4326,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri const block_q6_K * x = (const block_q6_K *) vx; const int i = item_ct1.get_group(2); -#if QK_K == 256 // assume 64 threads - this is very slightly better than the one below const int tid = item_ct1.get_local_id(2); @@ -4407,24 +4345,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); -#else - - // assume 32 threads - const int tid = item_ct1.get_local_id(2); - const int ip = tid/16; // 0 or 1 - const int il = tid - 16*ip; // 0...15 - - dst_t * y = yy + i*QK_K + 16*ip + il; - - const float d = x[i].d; - - const uint8_t ql = x[i].ql[16*ip + il]; - const uint8_t qh = x[i].qh[il] >> (2*ip); - const int8_t * sc = x[i].scales; - - y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32); - y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32); -#endif } template<typename dst_t> @@ -4438,7 +4358,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res const block_iq2_xxs * x = (const block_iq2_xxs *) vx; const int tid = item_ct1.get_local_id(2); -#if QK_K == 256 const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -4449,10 +4368,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f; const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127]; for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f); -#else - assert(false); -#endif - } template<typename dst_t> @@ -4466,7 +4381,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest const block_iq2_xs * x = (const block_iq2_xs *) vx; const int tid = item_ct1.get_local_id(2); -#if QK_K == 256 const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -4475,10 +4389,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; const uint8_t signs = ksigns_iq2xs[q2[il] >> 9]; for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); -#else - assert(false); -#endif - } template <typename dst_t> @@ -4490,7 +4400,6 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy, const block_iq2_s * x = (const block_iq2_s *) vx; const int tid = item_ct1.get_local_id(2); -#if QK_K == 256 const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -4498,13 +4407,9 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy, const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; const uint8_t signs = x[i].qs[QK_K/8+4*ib+il]; #pragma unroll - for (int j = 0; j < 8; ++j) + for (int j = 0; j < 8; ++j) { y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); -#else - assert(false); - -#endif - + } } template<typename dst_t> @@ -4518,7 +4423,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res const block_iq3_xxs * x = (const block_iq3_xxs *) vx; const int tid = item_ct1.get_local_id(2); -#if QK_K == 256 const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -4533,10 +4437,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); } -#else - assert(false); -#endif - } template <typename dst_t> @@ -4549,7 +4449,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy, const block_iq3_s * x = (const block_iq3_s *) vx; const int tid = item_ct1.get_local_id(2); -#if QK_K == 256 const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -4563,10 +4462,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy, y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); } -#else - assert(false); -#endif - } template <typename dst_t> @@ -4579,7 +4474,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy, const block_iq1_s * x = (const block_iq1_s *) vx; const int tid = item_ct1.get_local_id(2); -#if QK_K == 256 const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -4593,10 +4487,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy, for (int j = 0; j < 8; ++j) { y[j] = d * (q[j] + delta); } -#else - assert(false); -#endif - } template <typename dst_t> @@ -4609,7 +4499,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy, const block_iq1_m * x = (const block_iq1_m *) vx; const int tid = item_ct1.get_local_id(2); -#if QK_K == 256 const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -4627,10 +4516,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy, for (int j = 0; j < 8; ++j) { y[j] = d * (q[j] + delta); } -#else - assert(false); -#endif - } template <typename dst_t> @@ -4704,7 +4589,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, float tmp = 0; // partial sum for thread in warp -#if QK_K == 256 const int tid = item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15 const int ix = @@ -4755,42 +4639,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, tmp += dall * sum1 - dmin * sum2; } -#else - const int tid = item_ct1.get_local_id(2) / - (2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7 - const int ix = item_ct1.get_local_id(2) % - (2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3 - const int offset = tid * K_QUANTS_PER_ITERATION; - - uint32_t uaux[2]; - const uint8_t * d = (const uint8_t *)uaux; - - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + offset; - const uint8_t * q = x[i].qs + offset; - const uint32_t * s = (const uint32_t *)x[i].scales; - - uaux[0] = s[0] & 0x0f0f0f0f; - uaux[1] = (s[0] >> 4) & 0x0f0f0f0f; - - const sycl::float2 dall = - x[i].dm.convert<float, sycl::rounding_mode::automatic>(); - - float sum1 = 0, sum2 = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - const uint8_t ql = q[l]; - sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3) - + y[l+16] * d[1] * ((ql >> 2) & 3) - + y[l+32] * d[2] * ((ql >> 4) & 3) - + y[l+48] * d[3] * ((ql >> 6) & 3); - sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7]; - } - tmp += dall.x() * sum1 - dall.y() * sum2; - } - -#endif // sum up partial sums and write back result #pragma unroll @@ -4828,8 +4676,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, float tmp = 0; // partial sum for thread in warp -#if QK_K == 256 - const uint16_t kmask1 = 0x0303; const uint16_t kmask2 = 0x0f0f; @@ -4882,34 +4728,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, tmp += d * sum; } -#else - - const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 - const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 - const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14 - const int in = offset/8; // 0 or 1 - const int im = offset%8; // 0...7 - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + offset; - const uint8_t * q = x[i].qs + offset; - const uint8_t * s = x[i].scales; - - const float dall = (float)x[i].d; - - float sum = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - const uint8_t hl = x[i].hmask[im+l] >> in; - const uint8_t ql = q[l]; - sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4)) - + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4)) - + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4)) - + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4)); - } - tmp += sum; - } -#endif // sum up partial sums and write back result #pragma unroll @@ -4944,7 +4762,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, const block_q4_K * x = (const block_q4_K *)vx + ib0; -#if QK_K == 256 const uint16_t kmask1 = 0x3f3f; const uint16_t kmask2 = 0x0f0f; const uint16_t kmask3 = 0xc0c0; @@ -5033,36 +4850,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, #endif } -#else - const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 - const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); - - const int step = tid * K_QUANTS_PER_ITERATION; - - uint16_t aux16[2]; - const uint8_t * s = (const uint8_t *)aux16; - - float tmp = 0; - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - const uint8_t * q = x[i].qs + step; - const float * y = yy + i*QK_K + step; - const uint16_t * a = (const uint16_t *)x[i].scales; - aux16[0] = a[0] & 0x0f0f; - aux16[1] = (a[0] >> 4) & 0x0f0f; - const float d = (float)x[i].dm[0]; - const float m = (float)x[i].dm[1]; - float sum = 0.f; - for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { - sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) - + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2]) - + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3]) - + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]); - } - tmp += sum; - } - -#endif // sum up partial sums and write back result #pragma unroll @@ -5097,7 +4884,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, float tmp = 0; // partial sum for thread in warp -#if QK_K == 256 const uint16_t kmask1 = 0x3f3f; const uint16_t kmask2 = 0x0f0f; const uint16_t kmask3 = 0xc0c0; @@ -5174,30 +4960,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, dmin * smin; } -#else - const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 - const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); - const int step = tid * K_QUANTS_PER_ITERATION; - const int im = step/8; - const int in = step%8; - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - const uint8_t * q = x[i].qs + step; - const int8_t * s = x[i].scales; - const float * y = yy + i*QK_K + step; - const float d = x[i].d; - float sum = 0.f; - for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { - const uint8_t h = x[i].qh[in+j] >> im; - sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16)) - + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16)) - + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16)) - + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16)); - } - tmp += sum; - } -#endif - // sum up partial sums and write back result #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { @@ -5224,8 +4986,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa const block_q6_K * x = (const block_q6_K *)vx + ib0; -#if QK_K == 256 - const int tid = item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 const int ix = @@ -5282,37 +5042,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa } -#else - - const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7 - const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3 - - const int step = tid * K_QUANTS_PER_ITERATION; - - float tmp = 0; // partial sum for thread in warp - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + step; - const uint8_t * ql = x[i].ql + step; - const uint8_t * qh = x[i].qh + step; - const int8_t * s = x[i].scales; - - const float d = x[i+0].d; - - float sum = 0; - for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { - sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32) - + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32) - + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32) - + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32); - } - tmp += sum; - - } - -#endif - // sum up partial sums and write back result #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { @@ -6857,7 +6586,6 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1(const void *__restrict__ vbq, const block_q8_1 *__restrict__ bq8_1, const int &iqs) { -#ifndef GGML_QKK_64 const block_q4_K * bq4_K = (const block_q4_K *) vbq; int v[2]; @@ -6899,52 +6627,6 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq, } return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8); - -#else - -#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics - const block_q4_K * bq4_K = (const block_q4_K *) vbq; - - float sumf_d = 0.0f; - float sumf_m = 0.0f; - - uint16_t aux16[2]; - const uint8_t * s = (const uint8_t *)aux16; - - const uint16_t * a = (const uint16_t *)bq4_K->scales; - aux16[0] = a[0] & 0x0f0f; - aux16[1] = (a[0] >> 4) & 0x0f0f; - - const float dall = bq4_K->dm[0]; - const float dmin = bq4_K->dm[1]; - - const float d8_1 = bq8_1[0].ds[0]; - const float d8_2 = bq8_1[1].ds[1]; - - const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); - const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); - const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); - const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); - - const int * q4 = (const int *)bq4_K->qs + (iqs/2); - const int v1 = q4[0]; - const int v2 = q4[4]; - - const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0)); - const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0)); - const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0)); - const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0)); - - sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]); - sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]); - - return dall * sumf_d - dmin * sumf_m; - -#else - bad_arch(); -#endif // __SYCL_ARCH__ >= VER_4VEC - -#endif } template <int mmq_y> @@ -7003,11 +6685,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql, const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; -#if QK_K == 256 x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; -#else - x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]}; -#endif } #pragma unroll @@ -7050,7 +6728,6 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1(const void *__restrict__ vbq, const block_q8_1 *__restrict__ bq8_1, const int &iqs) { -#ifndef GGML_QKK_64 const block_q5_K * bq5_K = (const block_q5_K *) vbq; int vl[2]; @@ -7092,48 +6769,6 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq, } return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8); - -#else - -#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics - const block_q5_K * bq5_K = (const block_q5_K *) vbq; - - const int8_t * s = bq5_K->scales; - - const float d = bq5_K->d; - - const float d8_1 = bq8_1[0].ds[0]; - const float d8_2 = bq8_1[1].ds[1]; - - const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); - const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); - const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); - const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); - - const int * ql = (const int *)bq5_K->qs + (iqs/2); - const int vl1 = ql[0]; - const int vl2 = ql[4]; - - const int step = 4 * (iqs/2); // 0, 4, 8, 12 - const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6 - const int in = step%8; // 0, 4, 0, 4 - const int vh = (*((const int *)(bq5_K->qh + in))) >> im; - - const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f); - const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f); - const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f); - const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f); - - const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1]) - + d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]); - - return d * sumf_d; - -#else - bad_arch(); -#endif // __SYCL_ARCH__ >= VER_4VEC - -#endif } template <int mmq_y> @@ -7205,9 +6840,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql, const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; -#if QK_K == 256 x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; -#endif } #pragma unroll @@ -7387,7 +7020,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq, const block_q8_1 *__restrict__ bq8_1, const int &iqs, const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs, const uint8_t *kmask_iq2xs) { -#if QK_K == 256 const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq; #if QR2_XXS == 8 @@ -7428,10 +7060,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq, } return d * (sumi1 + sumi2); #endif -#else - assert(false); - return 0.f; -#endif } static __dpct_inline__ float @@ -7440,7 +7068,6 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq, const uint64_t *iq2xs_grid, const uint64_t *ksigns64) { #if DPCT_COMPATIBILITY_TEMP >= \ MIN_CC_DP4A // lowest compute capability for integer intrinsics -#if QK_K == 256 const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq; const int ib32 = iqs; @@ -7478,16 +7105,11 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq, assert(false); return 0.f; #endif -#else - assert(false); - return 0.f; -#endif } static __dpct_inline__ float vec_dot_iq2_s_q8_1(const void *__restrict__ vbq, const block_q8_1 *__restrict__ bq8_1, const int &iqs) { -#if QK_K == 256 const block_iq2_s * bq2 = (const block_iq2_s *) vbq; const int ib32 = iqs; @@ -7531,9 +7153,6 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq, } const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f; return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); -#else - assert(false); -#endif } static __dpct_inline__ float @@ -7542,7 +7161,6 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq, const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) { #if DPCT_COMPATIBILITY_TEMP >= \ MIN_CC_DP4A // lowest compute capability for integer intrinsics -#if QK_K == 256 const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq; const int ib32 = iqs; @@ -7570,17 +7188,12 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq, assert(false); return 0.f; #endif -#else - assert(false); - return 0.f; -#endif } static __dpct_inline__ float vec_dot_iq3_s_q8_1(const void *__restrict__ vbq, const block_q8_1 *__restrict__ bq8_1, const int &iqs, const uint32_t *iq3s_grid) { -#if QK_K == 256 const block_iq3_s * bq2 = (const block_iq3_s *) vbq; const int ib32 = iqs; @@ -7609,16 +7222,12 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq, (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) * bq8_1[ib32].ds[0]; return d * sumi; -#else - assert(false); -#endif } static __dpct_inline__ float vec_dot_iq1_s_q8_1(const void *__restrict__ vbq, const block_q8_1 *__restrict__ bq8_1, const int &iqs, const uint32_t *iq1s_grid_gpu) { -#if QK_K == 256 const block_iq1_s * bq1 = (const block_iq1_s *) vbq; const int ib32 = iqs; @@ -7637,15 +7246,11 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq, const float d = d1q * bq8_1[ib32].ds[0]; const float m = d1q * bq8_1[ib32].ds[1]; return d * sumi + m * delta; -#else - assert(false); -#endif } static __dpct_inline__ float vec_dot_iq1_m_q8_1(const void *__restrict__ vbq, const block_q8_1 *__restrict__ bq8_1, const int &iqs) { -#if QK_K == 256 const block_iq1_m * bq1 = (const block_iq1_m *) vbq; const int ib32 = iqs; @@ -7670,9 +7275,6 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq, scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); const float d = (float)scale.f16 * bq8_1[ib32].ds[0]; return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1)); -#else - assert(false); -#endif } static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4, @@ -7720,7 +7322,6 @@ static __dpct_inline__ float vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq, const block_q8_1 *__restrict__ bq8_1, const int &iqs) { -#if QK_K == 256 const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq; const uint8_t * values = (const uint8_t *)kvalues_iq4nl; @@ -7738,9 +7339,6 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq, sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2); } return d * (sumi1 + sumi2); -#else - assert(false); -#endif } template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, @@ -10203,7 +9801,6 @@ template <typename dst_t> static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = k / QK_K; -#if QK_K == 256 { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); @@ -10215,27 +9812,12 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k, dequantize_block_q2_K(vx, y, item_ct1); }); } -#else - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q2_K(vx, y, item_ct1); - }); - } - -#endif } template <typename dst_t> static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = k / QK_K; -#if QK_K == 256 { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); @@ -10247,19 +9829,6 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k, dequantize_block_q3_K(vx, y, item_ct1); }); } -#else - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q3_K(vx, y, item_ct1); - }); - } -#endif } template <typename dst_t> @@ -10320,7 +9889,6 @@ template <typename dst_t> static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = k / QK_K; -#if QK_K == 256 { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); @@ -10332,27 +9900,12 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k, dequantize_block_q5_K(vx, y, item_ct1); }); } -#else - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q5_K(vx, y, item_ct1); - }); - } - -#endif } template <typename dst_t> static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = k / QK_K; -#if QK_K == 256 { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); @@ -10364,20 +9917,6 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k, dequantize_block_q6_K(vx, y, item_ct1); }); } -#else - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q6_K(vx, y, item_ct1); - }); - } - -#endif } template <typename dst_t> @@ -10529,9 +10068,6 @@ template <typename dst_t> static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = (k + QK_K - 1) / QK_K; -#if QK_K == 64 - dequantize_row_iq4_nl_sycl(vx, y, k, stream); -#else { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); @@ -10546,7 +10082,6 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k, }); }); } -#endif } @@ -12051,8 +11586,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, const int nrows_y, const int nrows_dst, dpct::queue_ptr stream) try { -#if QK_K == 256 - int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); @@ -12167,7 +11700,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, }); } } -#endif } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ |