author | Kawrakow <iwankawrakow@gmail.com> | 2025-07-14 18:55:08 +0200
committer | GitHub <noreply@github.com> | 2025-07-14 18:55:08 +0200
commit | 45fae1a14444622478774f9a417e1d417af1ca46 (patch)
tree | 2609ef06be5640749834d4fc691446771ab29f42 /ggml/src
parent | f5353047ef461e6fc9d527e09a06c9802c699929 (diff)
Adding IQ2_KL (#602)
* Experiments for 2.6875 bpw quants
At least according to RMSE, this is significantly better than
q2_K, while using only 1/16 bit more per weight (2.6875 vs 2.625 bpw).
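For context, the 2.6875 bpw figure follows directly from the block_iq2_kl layout added in ggml-common.h further down in this diff. A minimal sketch of the arithmetic, assuming QK_K = 256 and ignoring the per-row fp16 super-scale (which adds only 16/n_per_row extra bits per weight):

```cpp
#include <cstdint>
#include <cstdio>

constexpr int QK_K = 256;   // weights per super-block

// Mirrors block_iq2_kl from the ggml-common.h hunk below.
struct block_iq2_kl {
    uint16_t scales_h;           //  2 bytes: 2 high bits for each of the 8 block scales
    uint8_t  scales_l[QK_K/64];  //  4 bytes: 4 low bits for each of the 8 block scales
    uint8_t  qs[QK_K/4];         // 64 bytes: low 4 bits of the 5-bit pair indices
    uint8_t  qh[QK_K/16];        // 16 bytes: high bit of the 5-bit pair indices
};

int main() {
    // 86 bytes per 256 weights -> 86*8/256 = 2.6875 bits per weight
    std::printf("%zu bytes -> %.4f bpw\n", sizeof(block_iq2_kl),
                8.0 * sizeof(block_iq2_kl) / QK_K);
}
```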
* iq2_kl: basics
* iq2_kl: CUDA dequantize
* iq2_kl: small improvement in PPL
Also check the two neighbouring values for the block scale
and use the one that minimizes RMSE.
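A minimal sketch of what such a neighbour check can look like (illustrative only, not the quantization code from this commit; try_scale and best_block_scale are hypothetical names, and the error is computed per weight against the nearest table value rather than per packed pair):

```cpp
#include <algorithm>
#include <cstdint>

// Sum of squared errors when quantizing n weights with scale d against a
// non-linear value table (minimizing SSE is equivalent to minimizing RMSE here).
static float try_scale(const float * x, int n, float d, const int8_t * values, int nvals) {
    if (d == 0.f) return 1e30f;
    float sse = 0.f;
    for (int j = 0; j < n; ++j) {
        float best = 1e30f;
        for (int k = 0; k < nvals; ++k) {
            float diff = x[j] - d*values[k];
            best = std::min(best, diff*diff);
        }
        sse += best;
    }
    return sse;
}

// Try the rounded block scale ls0 and its two neighbours, keep the best.
// d_super is the per-row super-scale; block scales live in [-32, 31] (6 bits).
static int best_block_scale(const float * x, int n, float d_super, int ls0,
                            const int8_t * values, int nvals) {
    int   best_ls  = ls0;
    float best_err = try_scale(x, n, d_super*ls0, values, nvals);
    const int cand[2] = { ls0 - 1, ls0 + 1 };
    for (int ls : cand) {
        if (ls < -32 || ls > 31) continue;
        float err = try_scale(x, n, d_super*ls, values, nvals);
        if (err < best_err) { best_err = err; best_ls = ls; }
    }
    return best_ls;
}
```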
* iq2_kl: MMQ
Quite good: PP-512(L3-8B) = 8472 t/s.
* iq2_kl: MMVQ
We get TG-128(L3-8B) = 162 t/s, which means this is not quite
as good as it should be, as q2_K at (almost) the same bpw
is at 170 t/s.
* iq2_kl: Zen4 GEMM/GEMV
Not particularly fast. I may need to think about rearranging the bits.
* iq2_kl: better Zen4
* iq2_kl: convert/repack to q8_k_r8 (AVX2)
* iq2_kl: AVX2 GEMM/GEMV
* iq2_kl: WIP NEON
The compiler started crashing!!!
* iq2_kl: NEON
Had to work around a compiler crash when using vzip2q_u8 by
using vqtbl2q_u8 instead.
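For illustration, one way vqtbl2q_u8 can stand in for vzip2q_u8 (a sketch of the substitution, not necessarily the exact index table used in the NEON code in this commit):

```cpp
#include <arm_neon.h>

// vzip2q_u8(a, b) interleaves the upper halves of a and b:
//   { a[8], b[8], a[9], b[9], ..., a[15], b[15] }
// The same result can be produced with a table lookup over {a, b}.
static inline uint8x16_t zip2_via_tbl(uint8x16_t a, uint8x16_t b) {
    static const uint8_t k_idx[16] = {
         8, 24,  9, 25, 10, 26, 11, 27,
        12, 28, 13, 29, 14, 30, 15, 31,   // indices 0..15 pick from a, 16..31 from b
    };
    const uint8x16x2_t tab = { { a, b } };
    return vqtbl2q_u8(tab, vld1q_u8(k_idx));
}
```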
* iq2_kl: convert/repack to q8_k_r8 (NEON)
* iq2_kl: Metal dequantize
* iq2_kl: Metal GEMV - pretty slow
* iq2_kl: Metal GEMV - slightly better (40 t/s -> 44.5 t/s)
* iq2_kl: Metal GEMV - slightly better (44.5 t/s -> 46.5 t/s)
* iq2_kl: Metal GEMV - slightly better (46.5 t/s -> 47.2 t/s)
* iq2_kl: slightly better Metal dequantize
PP-512 goes to 476 t/s up from 466 t/s.
* iq2_kl: slightly better Metal dequantize
PP-512 goes to 492 t/s up from 476 t/s.
* Add iq2_kl to constants.py
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src')
-rw-r--r-- | ggml/src/ggml-common.h | 14
-rw-r--r-- | ggml/src/ggml-cuda.cu | 1
-rw-r--r-- | ggml/src/ggml-cuda/common.cuh | 7
-rw-r--r-- | ggml/src/ggml-cuda/convert.cu | 56
-rw-r--r-- | ggml/src/ggml-cuda/iqk_mmvq.cu | 54
-rw-r--r-- | ggml/src/ggml-cuda/iqk_mmvq.cuh | 5
-rw-r--r-- | ggml/src/ggml-cuda/mmq.cu | 4
-rw-r--r-- | ggml/src/ggml-cuda/mmq.cuh | 4
-rw-r--r-- | ggml/src/ggml-cuda/mmvq.cu | 4
-rw-r--r-- | ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_kl.cu | 70
-rw-r--r-- | ggml/src/ggml-metal.m | 40
-rw-r--r-- | ggml/src/ggml-metal.metal | 183
-rw-r--r-- | ggml/src/ggml-quants.c | 1
-rw-r--r-- | ggml/src/ggml.c | 22
-rw-r--r-- | ggml/src/iqk/iqk_gemm_iqk_quants.cpp | 481
-rw-r--r-- | ggml/src/iqk/iqk_mul_mat.cpp | 15
-rw-r--r-- | ggml/src/iqk/iqk_quantize.cpp | 314
-rw-r--r-- | ggml/src/iqk/iqk_quantize.h | 6
18 files changed, 1271 insertions, 10 deletions
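Before the diff itself, a simplified scalar reading of how one iq2_kl super-block decodes may help when following the CUDA and Metal kernels below: each 5-bit index (4 bits from qs, 1 bit from qh) selects a pair of weights from the 32-entry iq2kl_values table, and each 32-weight block gets a 6-bit scale assembled from scales_l and scales_h on top of the per-row fp16 super-scale. The function name and argument split are illustrative, and little-endian byte order is assumed when splitting the packed value pairs:

```cpp
#include <cstdint>

// Packed (int8, int8) value pairs, copied from the ggml-common.h hunk below.
static const uint16_t iq2kl_values[32] = {
    0xe9c1, 0x0dc1, 0xc1d8, 0xf6d8, 0x0dd8, 0x2fd8, 0xd8e9, 0xe9e9, 0x01e9, 0x0de9, 0x1ce9, 0xc1f6, 0x01f6, 0x0df6, 0x2ff6, 0xe901,
    0xf601, 0x0101, 0x0d01, 0x1c01, 0xd80d, 0xe90d, 0xf60d, 0x010d, 0x0d0d, 0xc11c, 0xe91c, 0x011c, 0x1c1c, 0x2f1c, 0xe92f, 0x0d2f,
};

// Decode one super-block of 256 weights; row_scale is the fp16 super-scale
// stored at the start of the row, already converted to float by the caller.
static void dequantize_iq2_kl_block(float row_scale,
                                    uint16_t scales_h, const uint8_t * scales_l,
                                    const uint8_t * qs, const uint8_t * qh,
                                    float * y) {
    for (int ib32 = 0; ib32 < 8; ++ib32) {        // eight 32-weight blocks
        const int ib64 = ib32/2, half = ib32%2;
        // 6-bit block scale: 4 low bits from scales_l, 2 high bits from scales_h
        const int ls = (((scales_l[ib32%4] >> 4*(ib32/4)) & 0xf)
                       | (((scales_h >> 2*ib32) & 3) << 4)) - 32;
        const float dl = row_scale * ls;
        for (int p = 0; p < 16; ++p) {            // 16 value pairs per 32-weight block
            const uint8_t lo = (qs[16*ib64 + p] >> 4*half) & 0xf;  // 4 low index bits
            const uint8_t hi = (qh[p] >> ib32) & 1;                // 1 high index bit
            const int8_t * pair = (const int8_t *)&iq2kl_values[lo | (hi << 4)];
            y[32*ib32 + 2*p + 0] = dl * pair[0];
            y[32*ib32 + 2*p + 1] = dl * pair[1];
        }
    }
}
```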
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index a1f97911..6dc439b8 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -607,6 +607,14 @@ typedef struct { static_assert(sizeof(block_iq2_k) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/32 + QK_K/4, "wrong iq2_k block size/padding"); typedef struct { + uint16_t scales_h; + uint8_t scales_l[QK_K/64]; + uint8_t qs[QK_K/4]; + uint8_t qh[QK_K/16]; +} block_iq2_kl; +static_assert(sizeof(block_iq2_kl) == sizeof(uint16_t) + QK_K/64 + QK_K/4 + QK_K/16, "wrong iq2_kl block size/padding"); + +typedef struct { ggml_half d[4]; uint8_t extra[8]; uint8_t scales[QK_K/8]; @@ -2164,6 +2172,12 @@ GGML_TABLE_BEGIN(int8_t, iq2nl_values, 8) -31, -13, 1, 17, -26, -8, 6, 22 GGML_TABLE_END() +GGML_TABLE_BEGIN(uint16_t, iq2kl_values, 32) + 0xe9c1, 0x0dc1, 0xc1d8, 0xf6d8, 0x0dd8, 0x2fd8, 0xd8e9, 0xe9e9, 0x01e9, 0x0de9, 0x1ce9, 0xc1f6, 0x01f6, 0x0df6, 0x2ff6, 0xe901, + 0xf601, 0x0101, 0x0d01, 0x1c01, 0xd80d, 0xe90d, 0xf60d, 0x010d, 0x0d0d, 0xc11c, 0xe91c, 0x011c, 0x1c1c, 0x2f1c, 0xe92f, 0x0d2f, +GGML_TABLE_END() + + GGML_TABLE_BEGIN(int8_t, iq3nl_values, 16) -63, -40, -23, -10, 1, 13, 28, 47, -59, -36, -19, -6, 5, 17, 32, 51, diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 7defb227..b33c952b 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -3499,6 +3499,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: + case GGML_TYPE_IQ2_KL: case GGML_TYPE_IQ3_KS: case GGML_TYPE_IQ4_KS: case GGML_TYPE_IQ4_KSS: diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 973af2b8..38b52fd0 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -600,6 +600,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ3_K> { }; template<> +struct ggml_cuda_type_traits<GGML_TYPE_IQ2_KL> { + static constexpr int qk = QK_K; + static constexpr int qr = QR4_XS; + static constexpr int qi = QI4_XS; +}; + +template<> struct ggml_cuda_type_traits<GGML_TYPE_IQ3_KS> { static constexpr int qk = QK_K; static constexpr int qr = QR4_XS; diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index 61c09481..c8e02a83 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -1334,6 +1334,48 @@ static __global__ void dequantize_block_iq3_k(const void * __restrict__ vx, dst_ } template<typename dst_t> +static __global__ void dequantize_block_iq2_kl(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) { + + int64_t ii = blockIdx.x; + int64_t row = (QK_K * ii) / n_per_row; + const char * cx = (const char *)vx + row * row_size; + float scale = (float)*(const ggml_half *)cx; + const block_iq2_kl * x = (const block_iq2_kl *)(cx + sizeof(ggml_half)); + const int64_t i = ii - (row*n_per_row)/QK_K; + + const int64_t tid = threadIdx.x; + const int64_t ib64 = tid/8; + const int64_t il = tid%8; + dst_t * y = yy + ii*QK_K + 64*ib64 + 4*il; + const uint8_t * qs = x[i].qs + 16*ib64 + 2*il; + const uint8_t * qh = x[i].qh + 2*il; + auto sh = x[i].scales_h >> 4*ib64; + const float d1 = scale * (int(((x[i].scales_l[(2*ib64+0)%4] >> 4*(ib64/2)) & 0xf) | ((sh << 4) & 0x30)) - 32); + const float d2 = scale * (int(((x[i].scales_l[(2*ib64+1)%4] >> 4*(ib64/2)) & 0xf) | ((sh << 2) & 0x30)) - 32); + if constexpr (std::is_same_v<dst_t, nv_bfloat16>) { + for (int j = 0; j < 2; ++j) { + uint8_t h = qh[j] >> 2*ib64; + auto val1 = (const int8_t 
*)(iq2kl_values + ((qs[j] & 0xf) | ((h & 1) << 4))); + auto val2 = (const int8_t *)(iq2kl_values + ((qs[j] >> 4) | ((h & 2) << 3))); + y[2*j+ 0] = __float2bfloat16(d1 * val1[0]); + y[2*j+ 1] = __float2bfloat16(d1 * val1[1]); + y[2*j+32] = __float2bfloat16(d2 * val2[0]); + y[2*j+33] = __float2bfloat16(d2 * val2[1]); + } + } else { + for (int j = 0; j < 2; ++j) { + uint8_t h = qh[j] >> 2*ib64; + auto val1 = (const int8_t *)(iq2kl_values + ((qs[j] & 0xf) | ((h & 1) << 4))); + auto val2 = (const int8_t *)(iq2kl_values + ((qs[j] >> 4) | ((h & 2) << 3))); + y[2*j+ 0] = d1 * val1[0]; + y[2*j+ 1] = d1 * val1[1]; + y[2*j+32] = d2 * val2[0]; + y[2*j+33] = d2 * val2[1]; + } + } +} + +template<typename dst_t> static __global__ void dequantize_block_iq3_ks(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) { int64_t ii = blockIdx.x; @@ -1619,6 +1661,14 @@ static void dequantize_row_iq3_k_cuda(const void * vx, dst_t * y, const int64_t } template<typename dst_t> +static void dequantize_row_iq2_kl_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) { + const int64_t k = nrows * n_per_row; + const int64_t row_size = ggml_row_size(GGML_TYPE_IQ2_KL, n_per_row); + const int nb = (k + QK_K - 1) / QK_K; + dequantize_block_iq2_kl<<<nb, 32, 0, stream>>>(vx, y, n_per_row, row_size); +} + +template<typename dst_t> static void dequantize_row_iq3_ks_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) { const int64_t k = nrows * n_per_row; const int64_t row_size = ggml_row_size(GGML_TYPE_IQ3_KS, n_per_row); @@ -1772,6 +1822,8 @@ to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) { return dequantize_row_iq2_k_cuda<nv_bfloat16>; case GGML_TYPE_IQ3_K: return dequantize_row_iq3_k_cuda<nv_bfloat16>; + case GGML_TYPE_IQ2_KL: + return dequantize_row_iq2_kl_cuda<nv_bfloat16>; case GGML_TYPE_IQ3_KS: return dequantize_row_iq3_ks_cuda<nv_bfloat16>; case GGML_TYPE_IQ4_KSS: @@ -1876,6 +1928,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_row_iq2_k_cuda; case GGML_TYPE_IQ3_K: return dequantize_row_iq3_k_cuda; + case GGML_TYPE_IQ2_KL: + return dequantize_row_iq2_kl_cuda; case GGML_TYPE_IQ3_KS: return dequantize_row_iq3_ks_cuda; case GGML_TYPE_IQ4_K: @@ -1973,6 +2027,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_iq2_k_cuda; case GGML_TYPE_IQ3_K: return dequantize_row_iq3_k_cuda; + case GGML_TYPE_IQ2_KL: + return dequantize_row_iq2_kl_cuda; case GGML_TYPE_IQ3_KS: return dequantize_row_iq3_ks_cuda; case GGML_TYPE_IQ4_K: diff --git a/ggml/src/ggml-cuda/iqk_mmvq.cu b/ggml/src/ggml-cuda/iqk_mmvq.cu index d897063f..a669390d 100644 --- a/ggml/src/ggml-cuda/iqk_mmvq.cu +++ b/ggml/src/ggml-cuda/iqk_mmvq.cu @@ -1016,6 +1016,52 @@ __device__ __forceinline__ void vec_dot_iq3_k_q8_1( } +// TODO +__device__ __forceinline__ void vec_dot_iq2_kl_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iiqs, float * result) { + + float d = __half2float(*(const half *)vbq); + const block_iq2_kl * bq2 = (const block_iq2_kl *)((const char *)vbq + sizeof(half)) + kbx; + + int iqs = iiqs/4; + const int ib64 = iqs/2; // 0...3. 0 works on quants 0...63, 1 on quants 64...127, etc. + // Each thread processes 16 quants in each of the 2 32-blocks + const int il16 = iqs%2; // 0...3. 
0 works on quants 0...7, 1 on quants 8...15, 2 on 16...23, 3 on 24...31 + + const uint16_t * ql = (const uint16_t *)bq2->qs + 8*ib64 + 4*il16; + const uint16_t * qh = (const uint16_t *)bq2->qh + 4*il16; + + int32_t aux32; + const uint8_t * aux8 = (const uint8_t *)&aux32; + + const int * q8l = (const int *)bq8_1[2*ib64+0].qs + 4*il16; + const int * q8h = (const int *)bq8_1[2*ib64+1].qs + 4*il16; + + int sumi1 = 0, sumi2 = 0; + int v1, v2; + for (int i = 0; i < 2; ++i) { + uint32_t vl = ql[2*i+0] | (ql[2*i+1] << 16); + uint32_t vh = (qh[2*i+0] | (qh[2*i+1] << 16)) >> 2*ib64; + + aux32 = (vl & 0x0f0f0f0f) | ((vh << 4) & 0x10101010); + v1 = iq2kl_values[aux8[0]] | (iq2kl_values[aux8[1]] << 16); + v2 = iq2kl_values[aux8[2]] | (iq2kl_values[aux8[3]] << 16); + sumi1 = ggml_cuda_dp4a(v1, q8l[2*i+0], ggml_cuda_dp4a(v2, q8l[2*i+1], sumi1)); + + aux32 = ((vl >> 4) & 0x0f0f0f0f) | ((vh << 3) & 0x10101010); + v1 = iq2kl_values[aux8[0]] | (iq2kl_values[aux8[1]] << 16); + v2 = iq2kl_values[aux8[2]] | (iq2kl_values[aux8[3]] << 16); + sumi2 = ggml_cuda_dp4a(v1, q8h[2*i+0], ggml_cuda_dp4a(v2, q8h[2*i+1], sumi2)); + } + + auto sh = bq2->scales_h >> 4*ib64; + int ls1 = int(((bq2->scales_l[(2*ib64+0)%4] >> 4*(ib64/2)) & 0xf) | ((sh << 4) & 0x30)) - 32; + int ls2 = int(((bq2->scales_l[(2*ib64+1)%4] >> 4*(ib64/2)) & 0xf) | ((sh << 2) & 0x30)) - 32; + + *result += d * (__low2float(bq8_1[2*ib64+0].ds) * ls1 * sumi1 + __low2float(bq8_1[2*ib64+1].ds) * ls2 * sumi2); + +} + __device__ __forceinline__ void vec_dot_iq3_ks_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iiqs, float * result) { @@ -1280,6 +1326,14 @@ void mul_mat_vec_iq4_ks_q8_1_cuda( iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ4_KS, VDR_IQ4_KS_Q8_1_MMVQ, vec_dot_iq4_ks_q8_1>(vx, vy, dst, ids_data, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, ne2, nb02, nb12, nb2, ids_nb0, stream); } +void mul_mat_vec_iq2_kl_q8_1_cuda( + const void * vx, const void * vy, float * dst, const char * ids_data, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, + const int ne2, const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, int64_t ids_nb0, cudaStream_t stream) { + + iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ2_KL, VDR_IQ3_K_Q8_1_MMVQ, vec_dot_iq2_kl_q8_1>(vx, vy, dst, ids_data, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, ne2, nb02, nb12, nb2, ids_nb0, stream); +} + void mul_mat_vec_iq3_ks_q8_1_cuda( const void * vx, const void * vy, float * dst, const char * ids_data, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, diff --git a/ggml/src/ggml-cuda/iqk_mmvq.cuh b/ggml/src/ggml-cuda/iqk_mmvq.cuh index c2416b1e..d14c3541 100644 --- a/ggml/src/ggml-cuda/iqk_mmvq.cuh +++ b/ggml/src/ggml-cuda/iqk_mmvq.cuh @@ -16,6 +16,11 @@ void mul_mat_vec_iq3_k_q8_1_cuda( const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, const int ne2, const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0, cudaStream_t stream); +void mul_mat_vec_iq2_kl_q8_1_cuda( + const void * vx, const void * vy, float * dst, const char * ids_data, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, + const int ne2, const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0, cudaStream_t stream); + void mul_mat_vec_iq3_ks_q8_1_cuda( const void * vx, const void * vy, float * dst, const char * ids_data, const int ncols_x, const 
int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index 231c4a41..cde5d044 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -94,6 +94,9 @@ void ggml_cuda_op_mul_mat_q( case GGML_TYPE_IQ4_NL: mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream); break; + case GGML_TYPE_IQ2_KL: + mul_mat_q_case<GGML_TYPE_IQ2_KL>(ctx, args, stream); + break; case GGML_TYPE_IQ3_KS: mul_mat_q_case<GGML_TYPE_IQ3_KS>(ctx, args, stream); break; @@ -201,6 +204,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) { case GGML_TYPE_IQ1_S_R4: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ2_KL: case GGML_TYPE_IQ3_KS: case GGML_TYPE_IQ4_KS: case GGML_TYPE_IQ4_KS_R4: diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index ee34452a..21b50082 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -88,6 +88,7 @@ static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) { case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ2_K_R4: case GGML_TYPE_IQ3_K: + case GGML_TYPE_IQ2_KL: case GGML_TYPE_IQ3_KS: case GGML_TYPE_IQ3_K_R4: case GGML_TYPE_IQ4_KS: @@ -201,6 +202,7 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml case GGML_TYPE_IQ1_S_R4: return MMQ_DP4A_TXS_Q8_0; case GGML_TYPE_IQ4_XS : return MMQ_DP4A_TXS_Q8_0; case GGML_TYPE_IQ4_NL : return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ2_KL : return MMQ_DP4A_TXS_Q8_0; case GGML_TYPE_IQ3_KS : return MMQ_DP4A_TXS_Q8_0; case GGML_TYPE_IQ4_KS : return MMQ_DP4A_TXS_Q8_0; case GGML_TYPE_IQ4_KS_R4 : return MMQ_DP4A_TXS_Q8_0; @@ -257,6 +259,7 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) { case GGML_TYPE_IQ1_S_R4: return MMQ_MMA_TILE_X_K_Q8_0; case GGML_TYPE_IQ4_XS : return MMQ_MMA_TILE_X_K_Q8_0; case GGML_TYPE_IQ4_NL : return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ2_KL : return MMQ_MMA_TILE_X_K_Q8_0; case GGML_TYPE_IQ3_KS : return MMQ_MMA_TILE_X_K_Q8_0; case GGML_TYPE_IQ4_KS : return MMQ_MMA_TILE_X_K_Q8_0; case GGML_TYPE_IQ4_KS_R4 : return MMQ_MMA_TILE_X_K_Q8_0; @@ -4156,6 +4159,7 @@ extern DECL_MMQ_CASE(GGML_TYPE_IQ3_S); extern DECL_MMQ_CASE(GGML_TYPE_IQ1_S); extern DECL_MMQ_CASE(GGML_TYPE_IQ4_NL); extern DECL_MMQ_CASE(GGML_TYPE_IQ4_XS); +extern DECL_MMQ_CASE(GGML_TYPE_IQ2_KL); extern DECL_MMQ_CASE(GGML_TYPE_IQ3_KS); extern DECL_MMQ_CASE(GGML_TYPE_IQ4_KS); extern DECL_MMQ_CASE(GGML_TYPE_IQ4_KS_R4); diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 2b619f67..d0746031 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -518,6 +518,9 @@ static void ggml_cuda_op_mul_mat_vec_q_impl(ggml_backend_cuda_context & ctx, ggm case GGML_TYPE_IQ3_K: mul_mat_vec_iq3_k_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ids_data, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, ne2, nb02, nb12, nb2, ids_nb0, stream); break; + case GGML_TYPE_IQ2_KL: + mul_mat_vec_iq2_kl_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ids_data, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, ne2, nb02, nb12, nb2, ids_nb0, stream); + break; case GGML_TYPE_IQ3_KS: mul_mat_vec_iq3_ks_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ids_data, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, ne2, nb02, nb12, nb2, ids_nb0, stream); break; @@ -682,6 +685,7 @@ bool ggml_cuda_mmvq_type_supported(ggml_type src0_type) { case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ2_K: + case 
GGML_TYPE_IQ2_KL: case GGML_TYPE_IQ3_KS: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_kl.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_kl.cu new file mode 100644 index 00000000..a5c22879 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_kl.cu @@ -0,0 +1,70 @@ +#include "../mmq.cuh" + +template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_kl( + const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + +#ifdef INT8_MMA_AVAILABLE + int * x_qs = (int *) x_tile; + float * x_df = (float *) (x_qs + WARP_SIZE*2); +#else + constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ4_XS, mmq_y); + int * x_qs = (int *) x_tile; + float * x_df = (float *) (x_qs + txs.qs); +#endif // INT8_MMA_AVAILABLE + + const int kqsx = threadIdx.x/4; + + uint32_t aux32[2]; + const uint8_t * a8 = (const uint8_t *)aux32; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += 4*nwarps) { + int i = i0 + 4*threadIdx.y + threadIdx.x%4; + + if (need_check) { + i = min(i, i_max); + } + + const half * dptr = (const half *)(x + i*stride); + const float d = *dptr; + const block_iq2_kl * bxi = (const block_iq2_kl *)(dptr + 1) + kbx0; + + #pragma unroll + for (int j = 0; j < 2; ++j) { + auto ql = get_int_b2(bxi->qs, 4*(kqsx/2) + 2*(kqsx%2) + j); + auto qh = get_int_b2(bxi->qh, 2*(kqsx%2) + j) >> 2*(kqsx/2); + aux32[0] = ((ql >> 0) & 0x0f0f0f0f) | ((qh << 4) & 0x10101010); + aux32[1] = ((ql >> 4) & 0x0f0f0f0f) | ((qh << 3) & 0x10101010); + #pragma unroll + for (int l = 0; l < 2; ++l) { + int val1 = iq2kl_values[a8[2*l+0]] | (iq2kl_values[a8[2*l+1]] << 16); + int val2 = iq2kl_values[a8[2*l+4]] | (iq2kl_values[a8[2*l+5]] << 16); +#ifdef INT8_MMA_AVAILABLE + x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 16*(kqsx/2) + 4*(kqsx%2) + 2*j + l + 0] = val1; + x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 16*(kqsx/2) + 4*(kqsx%2) + 2*j + l + 8] = val2; +#else + x_qs[i*(2*WARP_SIZE + 1) + 16*(kqsx/2) + 4*(kqsx%2) + 2*j + l + 0] = val1; + x_qs[i*(2*WARP_SIZE + 1) + 16*(kqsx/2) + 4*(kqsx%2) + 2*j + l + 8] = val2; +#endif + } + } + + int ls = int(((bxi->scales_l[kqsx%4] >> 4*(kqsx/4)) & 0xf) | (((bxi->scales_h >> 2*kqsx) & 3) << 4)) - 32; + +#ifdef INT8_MMA_AVAILABLE + x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = d * ls; +#else + x_df[i*(WARP_SIZE/4) + i/4 + kqsx] = d * ls; +#endif + } + +} + +template <int mmq_x, int mmq_y, int nwarps, bool need_check> +struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ2_KL> { + static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq2_kl<mmq_y, nwarps, need_check>; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>; + static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>; +}; + +DECL_MMQ_CASE(GGML_TYPE_IQ2_KL); diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index e1e49fcb..a86c66b6 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -112,6 +112,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_KSS, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_K, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_KS, + GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_KL, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_K, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_K, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ5_K, @@ -159,6 +160,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MV_IQ5_KS_F32, 
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_KS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_KL_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ5_K_F32, @@ -200,6 +202,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ5_KS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_KS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_KL_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ5_K_F32, @@ -238,6 +241,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MM_IQ5_KS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_KS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_KL_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ5_K_F32, @@ -276,6 +280,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MM_IQ5_KS_F16, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_K_F16, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_KS_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_KL_F16, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_K_F16, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_K_F16, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ5_K_F16, @@ -314,6 +319,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ5_KS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_KS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_KL_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ5_K_F32, @@ -768,6 +774,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ5_KS, get_rows_iq5_ks, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_K, get_rows_iq2_k, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_KS, get_rows_iq2_ks, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_KL, get_rows_iq2_kl, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_K, get_rows_iq3_k, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_K, get_rows_iq4_k, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ5_K, get_rows_iq5_k, true); @@ -815,6 +822,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ5_KS_F32, mul_mv_iq5_ks_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_K_F32, mul_mv_iq2_k_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_KS_F32, mul_mv_iq2_ks_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_KL_F32, mul_mv_iq2_kl_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_K_F32, mul_mv_iq3_k_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_K_F32, mul_mv_iq4_k_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ5_K_F32, mul_mv_iq5_k_f32, ctx->support_simdgroup_reduction); @@ -856,6 +864,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ5_KS_F32, mul_mv_id_iq5_ks_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_K_F32, 
mul_mv_id_iq2_k_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_KS_F32, mul_mv_id_iq2_ks_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_KL_F32, mul_mv_id_iq2_kl_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_K_F32, mul_mv_id_iq3_k_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_K_F32, mul_mv_id_iq4_k_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ5_K_F32, mul_mv_id_iq5_k_f32, ctx->support_simdgroup_reduction); @@ -894,6 +903,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ5_KS_F32, mul_mm_iq5_ks_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_K_F32, mul_mm_iq2_k_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_KS_F32, mul_mm_iq2_ks_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_KL_F32, mul_mm_iq2_kl_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_K_F32, mul_mm_iq3_k_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_K_F32, mul_mm_iq4_k_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ5_K_F32, mul_mm_iq5_k_f32, ctx->support_simdgroup_mm); @@ -932,6 +942,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ5_KS_F16, mul_mm_iq5_ks_f16, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_K_F16, mul_mm_iq2_k_f16, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_KS_F16, mul_mm_iq2_ks_f16, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_KL_F16, mul_mm_iq2_kl_f16, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_K_F16, mul_mm_iq3_k_f16, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_K_F16, mul_mm_iq4_k_f16, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ5_K_F16, mul_mm_iq5_k_f16, ctx->support_simdgroup_mm); @@ -970,6 +981,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ5_KS_F32, mul_mm_id_iq5_ks_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_K_F32, mul_mm_id_iq2_k_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_KS_F32, mul_mm_id_iq2_ks_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_KL_F32, mul_mm_id_iq2_kl_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_K_F32, mul_mm_id_iq3_k_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_K_F32, mul_mm_id_iq4_k_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ5_K_F32, mul_mm_id_iq5_k_f32, ctx->support_simdgroup_mm); @@ -2187,6 +2199,7 @@ static void ggml_metal_encode_node( case GGML_TYPE_IQ5_KS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ5_KS_F32 ].pipeline; break; case GGML_TYPE_IQ2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_K_F32 
].pipeline; break; case GGML_TYPE_IQ2_KS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_KS_F32 ].pipeline; break; + case GGML_TYPE_IQ2_KL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_KL_F32 ].pipeline; break; case GGML_TYPE_IQ3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_K_F32 ].pipeline; break; case GGML_TYPE_IQ4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_K_F32 ].pipeline; break; case GGML_TYPE_IQ5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ5_K_F32 ].pipeline; break; @@ -2230,6 +2243,7 @@ static void ggml_metal_encode_node( case GGML_TYPE_IQ5_KS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ5_KS_F16 ].pipeline; break; case GGML_TYPE_IQ2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_K_F16 ].pipeline; break; case GGML_TYPE_IQ2_KS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_KS_F16 ].pipeline; break; + case GGML_TYPE_IQ2_KL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_KL_F16 ].pipeline; break; case GGML_TYPE_IQ3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_K_F16 ].pipeline; break; case GGML_TYPE_IQ4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_K_F16 ].pipeline; break; case GGML_TYPE_IQ5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ5_K_F16 ].pipeline; break; @@ -2478,6 +2492,12 @@ static void ggml_metal_encode_node( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_KS_F32].pipeline; } break; + case GGML_TYPE_IQ2_KL: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_KL_F32].pipeline; + } break; case GGML_TYPE_IQ3_K: { nth0 = 4; @@ -2555,8 +2575,10 @@ static void ggml_metal_encode_node( src0t == GGML_TYPE_IQ2_KT|| src0t == GGML_TYPE_IQ3_KT) { //|| src0t == GGML_TYPE_IQ4_KT) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } - else if (src0t == GGML_TYPE_IQ2_KS || src0t == GGML_TYPE_IQ2_K || src0t == GGML_TYPE_IQ3_K || src0t == GGML_TYPE_IQ3_KS) { - const int mem_size = src0t == GGML_TYPE_IQ2_KS ? 64*sizeof(float) + else if (src0t == GGML_TYPE_IQ2_KS || src0t == GGML_TYPE_IQ2_K || src0t == GGML_TYPE_IQ3_K || src0t == GGML_TYPE_IQ3_KS || + src0t == GGML_TYPE_IQ2_KL) { + const int mem_size = src0t == GGML_TYPE_IQ2_KL ? 128*sizeof(float) + : src0t == GGML_TYPE_IQ2_KS ? 64*sizeof(float) : src0t == GGML_TYPE_IQ3_K || src0t == GGML_TYPE_IQ3_KS ? 
32*sizeof(float) : 16*sizeof(float); [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; @@ -2675,6 +2697,7 @@ static void ggml_metal_encode_node( case GGML_TYPE_IQ5_KS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ5_KS_F32 ].pipeline; break; case GGML_TYPE_IQ2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_K_F32 ].pipeline; break; case GGML_TYPE_IQ2_KS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_KS_F32 ].pipeline; break; + case GGML_TYPE_IQ2_KL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_KL_F32 ].pipeline; break; case GGML_TYPE_IQ3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_K_F32 ].pipeline; break; case GGML_TYPE_IQ4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_K_F32 ].pipeline; break; case GGML_TYPE_IQ5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ5_K_F32 ].pipeline; break; @@ -2907,6 +2930,12 @@ static void ggml_metal_encode_node( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_KS_F32].pipeline; } break; + case GGML_TYPE_IQ2_KL: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_KL_F32].pipeline; + } break; case GGML_TYPE_IQ3_K: { nth0 = 4; @@ -2995,8 +3024,10 @@ static void ggml_metal_encode_node( src0t == GGML_TYPE_IQ2_KT|| src0t == GGML_TYPE_IQ3_KT) { //|| src0t == GGML_TYPE_IQ4_KT) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } - else if (src0t == GGML_TYPE_IQ2_KS || src0t == GGML_TYPE_IQ2_K || src0t == GGML_TYPE_IQ3_K || src0t == GGML_TYPE_IQ3_KS) { - const int mem_size = src0t == GGML_TYPE_IQ2_KS ? 64*sizeof(float) + else if (src0t == GGML_TYPE_IQ2_KS || src0t == GGML_TYPE_IQ2_K || src0t == GGML_TYPE_IQ3_K || src0t == GGML_TYPE_IQ3_KS || + src0t == GGML_TYPE_IQ2_KL) { + const int mem_size = src0t == GGML_TYPE_IQ2_KL ? 128*sizeof(float) + : src0t == GGML_TYPE_IQ2_KS ? 64*sizeof(float) : src0t == GGML_TYPE_IQ3_K || src0t == GGML_TYPE_IQ3_KS ? 
32*sizeof(float) : 16*sizeof(float); [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; @@ -3071,6 +3102,7 @@ static void ggml_metal_encode_node( case GGML_TYPE_IQ5_KS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ5_KS ].pipeline; break; case GGML_TYPE_IQ2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_K ].pipeline; break; case GGML_TYPE_IQ2_KS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_KS ].pipeline; break; + case GGML_TYPE_IQ2_KL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_KL ].pipeline; break; case GGML_TYPE_IQ3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_K ].pipeline; break; case GGML_TYPE_IQ4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_K ].pipeline; break; case GGML_TYPE_IQ5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ5_K ].pipeline; break; diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal index baaac407..53de59dd 100644 --- a/ggml/src/ggml-metal.metal +++ b/ggml/src/ggml-metal.metal @@ -4002,6 +4002,13 @@ constexpr constant static half kvalues_iq2k_h[8] = { -31.h, -13.h, 1.h, 17.h, - constexpr constant static float kvalues_iq3k_f[16] = { -63.f, -40.f, -23.f, -10.f, 1.f, 13.f, 28.f, 47.f, -59.f, -36.f, -19.f, -6.f, 5.f, 17.f, 32.f, 51.f }; constexpr constant static half kvalues_iq3k_h[16] = { -63.h, -40.h, -23.h, -10.h, 1.h, 13.h, 28.h, 47.h, -59.h, -36.h, -19.h, -6.h, 5.h, 17.h, 32.h, 51.h }; +constexpr constant static half2 kvalues_iq2kl_h[32] = { + {-63.h, -23.h}, {-63.h, 13.h}, {-40.h, -63.h}, {-40.h, -10.h}, {-40.h, 13.h}, {-40.h, 47.h}, {-23.h, -40.h}, {-23.h, -23.h}, + {-23.h, 1.h}, {-23.h, 13.h}, {-23.h, 28.h}, {-10.h, -63.h}, {-10.h, 1.h}, {-10.h, 13.h}, {-10.h, 47.h}, {1.h, -23.h}, {1.h, -10.h}, + {1.h, 1.h}, {1.h, 13.h}, {1.h, 28.h}, {13.h, -40.h}, {13.h, -23.h}, {13.h, -10.h}, {13.h, 1.h}, {13.h, 13.h}, {28.h, -63.h}, + {28.h, -23.h}, {28.h, 1.h}, {28.h, 28.h}, {28.h, 47.h}, {47.h, -23.h}, {47.h, 13.h}, +}; + kernel void kernel_cpy_f32_iq4_nl( device const float * src0, device void * dst, @@ -7231,6 +7238,152 @@ kernel void kernel_mul_mv_iq2_ks_f32( kernel_mul_mv_iq2_ks_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); } +void kernel_mul_mv_iq2_kl_f32_impl( + device const void * src0, + device const float * src1, + device float * dst, + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + threadgroup int8_t * shared_values, + uint3 tgpig, + uint tiisg, + uint sgitg) { + + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const uint row_size = 2 + nb*sizeof(block_iq2_kl); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(ne01) + (i13/r3)*(ne01*ne02); + + device const char * cx0 = (device const char *) src0 + (first_row + offset0)*row_size; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + + float2 yl[16]; + float sumf[N_DST]={0.f}; + float drow[N_DST]; + + const int ix = tiisg/8; // 0...3 + const int it = tiisg%8; // 0...7 + const int iq = it/2; // 0...3 + const int ir = it%2; // 0 or 1 + + device const float * y4 = y + ix * QK_K + 64 * iq + 16 * ir; + + uint16_t aux16[2]; + thread const 
uint8_t * aux8 = (thread const uint8_t *)&aux16; + + device const char * cx = cx0; + for (int row = 0; row < N_DST; row++) { + device const half * dptr = (device const half *)cx; + drow[row] = dptr[0]; + cx += row_size; + } + + threadgroup float2 * all_values = (threadgroup float2 *)shared_values + 32*sgitg; + { + constant const int8_t * val = (constant const int8_t *)iq2kl_values; + all_values[tiisg][0] = val[2*tiisg + 0]; + all_values[tiisg][1] = val[2*tiisg + 1]; + simdgroup_barrier(mem_flags::mem_none); + } + + cx0 += sizeof(half); + + for (int ib = ix; ib < nb; ib += 4) { + + device const float2 * y2 = (device const float2 *)y4; + for (int i = 0; i < 8; ++i) { + yl[i+0] = y2[i+ 0]; + yl[i+8] = y2[i+16]; + } + + device const char * cx = cx0; + + for (int row = 0; row < N_DST; row++) { + + device const block_iq2_kl * x = (device const block_iq2_kl *)cx + ib; + + uint16_t h = x->scales_h >> 4*iq; + int8_t ls1 = int8_t(((x->scales_l[(2*iq+0)%4] >> 4*((2*iq+0)/4)) & 0xf) | ((h & 0x03) << 4)) - 32; + int8_t ls2 = int8_t(((x->scales_l[(2*iq+1)%4] >> 4*((2*iq+1)/4)) & 0xf) | ((h & 0x0c) << 2)) - 32; + + device const uint16_t * ql = (device const uint16_t *)x->qs + 8*iq + 4*ir; + device const uint16_t * qh = (device const uint16_t *)x->qh + 4*ir; + + float2 acc[2] = {0.f}; + for (int l = 0; l < 4; ++l) { + uint16_t h = qh[l] >> 2*iq; + aux16[0] = ((ql[l] >> 0) & 0x0f0f) | ((h & 0x0101) << 4); + aux16[1] = ((ql[l] >> 4) & 0x0f0f) | ((h & 0x0202) << 3); + for (int j = 0; j < 2; ++j) { + threadgroup const float2 & val1 = all_values[aux8[j+0]]; + threadgroup const float2 & val2 = all_values[aux8[j+2]]; + acc[0] += yl[2*l+j+0] * val1; + acc[1] += yl[2*l+j+8] * val2; + } + + } + sumf[row] += drow[row] * ((acc[0][0] + acc[0][1]) * ls1 + (acc[1][0] + acc[1][1]) * ls2); + + cx += row_size; + + } + + y4 += 4 * QK_K; + } + + for (int row = 0; row < N_DST; row += 2) { + float2 tmp = {sumf[row], sumf[row+1]}; + tmp = simd_sum(tmp); + if (tiisg < 2) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row + tiisg] = tmp[tiisg]; + } + } +} + +[[host_name("kernel_mul_mv_iq2_kl_f32")]] +kernel void kernel_mul_mv_iq2_kl_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq2_kl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); +} + void kernel_mul_mv_iq3_k_f32_impl( device const void * src0, device const float * src1, @@ -8821,6 +8974,31 @@ void dequantize_iq2_ks(device const block_iq2_ks * xb, short il, thread type4x4 } template <typename type4x4> +void dequantize_iq2_kl(device const block_iq2_kl * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 + const short ib32 = il/2; + device const uint16_t * ql = (device const uint16_t * )xb->qs + 8*(ib32/2) + 4*(il%2); + device const uint16_t * qh = (device const uint16_t * )xb->qh + 4*(il%2); + + half d = (int16_t(((xb->scales_l[ib32%4] 
>> 4*(ib32/4)) & 0xf) | (((xb->scales_h >> 2*ib32) & 0x3) << 4)) - 32); + + uint32_t aux32[2]; + thread const uint8_t * aux8 = (thread const uint8_t *)aux32; + + aux32[0] = (((ql[0] | (ql[1] << 16)) >> 4*(ib32%2)) & 0x0f0f0f0f) | ((((qh[0] | (qh[1] << 16)) >> ib32) & 0x01010101) << 4); + aux32[1] = (((ql[2] | (ql[3] << 16)) >> 4*(ib32%2)) & 0x0f0f0f0f) | ((((qh[2] | (qh[3] << 16)) >> ib32) & 0x01010101) << 4); + + for (int i = 0; i < 4; ++i) { + constant const half2 & val1 = *(constant const half2 *)(kvalues_iq2kl_h + aux8[2*i+0]); + constant const half2 & val2 = *(constant const half2 *)(kvalues_iq2kl_h + aux8[2*i+1]); + reg[i][0] = d * val1[0]; + reg[i][1] = d * val1[1]; + reg[i][2] = d * val2[0]; + reg[i][3] = d * val2[1]; + } +} + +template <typename type4x4> void dequantize_iq3_k(device const block_iq3_k * xb, short il, thread type4x4 & reg) { // il is 0...15 for QK_K = 256 device const uint16_t * q16l = (device const uint16_t *)xb->qs + 16*(il/8) + 8*(il&1); @@ -9596,6 +9774,7 @@ template [[host_name("kernel_get_rows_iq4_ks")]] kernel get_rows_q_t kernel_get template [[host_name("kernel_get_rows_iq5_ks")]] kernel get_rows_q_t kernel_get_rows_q2<DequantizerRS<float4x4, block_iq5_ks, float, 16, dequantize_iq5_ks>>; template [[host_name("kernel_get_rows_iq4_kss")]] kernel get_rows_q_t kernel_get_rows_q2<DequantizerRS<float4x4, block_iq4_kss,float, 16, dequantize_iq4_kss>>; template [[host_name("kernel_get_rows_iq2_ks")]] kernel get_rows_q_t kernel_get_rows_q2<DequantizerRS<float4x4, block_iq2_ks, half, 16, dequantize_iq2_ks>>; +template [[host_name("kernel_get_rows_iq2_kl")]] kernel get_rows_q_t kernel_get_rows_q2<DequantizerRS<float4x4, block_iq2_kl, half, 16, dequantize_iq2_kl>>; template [[host_name("kernel_get_rows_iq2_kt")]] kernel get_rows_q_t kernel_get_rows_q2<DequantizerRS<float4x4, block_iq2_kt, float, 16, dequantize_iq2_kt>>; template [[host_name("kernel_get_rows_iq3_kt")]] kernel get_rows_q_t kernel_get_rows_q2<DequantizerRS<float4x4, block_iq3_kt, float, 16, dequantize_iq3_kt>>; template [[host_name("kernel_get_rows_iq4_kt")]] kernel get_rows_q_t kernel_get_rows_q2<DequantizerKT4<float4x4, 16>>; @@ -9644,6 +9823,7 @@ template [[host_name("kernel_mul_mm_iq4_ks_f32")]] kernel mat_mm_t kernel_mul_m template [[host_name("kernel_mul_mm_iq5_ks_f32")]] kernel mat_mm_t kernel_mul_mm<half, simdgroup_half8x8, DequantizerRS<half4x4, block_iq5_ks, float, 16, dequantize_iq5_ks>, float>; template [[host_name("kernel_mul_mm_iq4_kss_f32")]] kernel mat_mm_t kernel_mul_mm<half, simdgroup_half8x8, DequantizerRS<half4x4, block_iq4_kss,float, 16, dequantize_iq4_kss>, float>; template [[host_name("kernel_mul_mm_iq2_ks_f32")]] kernel mat_mm_t kernel_mul_mm<half, simdgroup_half8x8, DequantizerRS<half4x4, block_iq2_ks, half, 16, dequantize_iq2_ks>, float>; +template [[host_name("kernel_mul_mm_iq2_kl_f32")]] kernel mat_mm_t kernel_mul_mm<half, simdgroup_half8x8, DequantizerRS<half4x4, block_iq2_kl, half, 16, dequantize_iq2_kl>, float>; template [[host_name("kernel_mul_mm_iq2_kt_f32")]] kernel mat_mm_t kernel_mul_mm<half, simdgroup_half8x8, DequantizerRS<half4x4, block_iq2_kt, float, 16, dequantize_iq2_kt>, float>; template [[host_name("kernel_mul_mm_iq3_kt_f32")]] kernel mat_mm_t kernel_mul_mm<half, simdgroup_half8x8, DequantizerRS<half4x4, block_iq3_kt, float, 16, dequantize_iq3_kt>, float>; template [[host_name("kernel_mul_mm_iq4_kt_f32")]] kernel mat_mm_t kernel_mul_mm<half, simdgroup_half8x8, DequantizerKT4<half4x4, 16>, float>; @@ -9683,6 +9863,7 @@ template 
[[host_name("kernel_mul_mm_iq4_ks_f16")]] kernel mat_mm_t kernel_mul_m template [[host_name("kernel_mul_mm_iq5_ks_f16")]] kernel mat_mm_t kernel_mul_mm<half, simdgroup_half8x8, DequantizerRS<half4x4, block_iq5_ks, float, 16, dequantize_iq5_ks>, half>; template [[host_name("kernel_mul_mm_iq4_kss_f16")]] kernel mat_mm_t kernel_mul_mm<half, simdgroup_half8x8, DequantizerRS<half4x4, block_iq4_kss,float, 16, dequantize_iq4_kss>, half>; template [[host_name("kernel_mul_mm_iq2_ks_f16")]] kernel mat_mm_t kernel_mul_mm<half, simdgroup_half8x8, DequantizerRS<half4x4, block_iq2_ks, half, 16, dequantize_iq2_ks>, half>; +template [[host_name("kernel_mul_mm_iq2_kl_f16")]] kernel mat_mm_t kernel_mul_mm<half, simdgroup_half8x8, DequantizerRS<half4x4, block_iq2_kl, half, 16, dequantize_iq2_kl>, half>; template [[host_name("kernel_mul_mm_iq2_kt_f16")]] kernel mat_mm_t kernel_mul_mm<half, simdgroup_half8x8, DequantizerRS<half4x4, block_iq2_kt, float, 16, dequantize_iq2_kt>, half>; template [[host_name("kernel_mul_mm_iq3_kt_f16")]] kernel mat_mm_t kernel_mul_mm<half, simdgroup_half8x8, DequantizerRS<half4x4, block_iq3_kt, float, 16, dequantize_iq3_kt>, half>; template [[host_name("kernel_mul_mm_iq4_kt_f16")]] kernel mat_mm_t kernel_mul_mm<half, simdgroup_half8x8, DequantizerKT4<half4x4, 16>, half>; @@ -9729,6 +9910,7 @@ template [[host_name("kernel_mul_mm_id_iq4_ks_f32")]] kernel mat_mm_id_t kernel template [[host_name("kernel_mul_mm_id_iq5_ks_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<DequantizerRS<half4x4, block_iq5_ks, float, 16, dequantize_iq5_ks>>; template [[host_name("kernel_mul_mm_id_iq4_kss_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<DequantizerRS<half4x4, block_iq4_kss,float, 16, dequantize_iq4_kss>>; template [[host_name("kernel_mul_mm_id_iq2_ks_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<DequantizerRS<half4x4, block_iq2_ks, half, 16, dequantize_iq2_ks>>; +template [[host_name("kernel_mul_mm_id_iq2_kl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<DequantizerRS<half4x4, block_iq2_kl, half, 16, dequantize_iq2_kl>>; template [[host_name("kernel_mul_mm_id_iq2_kt_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<DequantizerRS<half4x4, block_iq2_kt, float, 16, dequantize_iq2_kt>>; template [[host_name("kernel_mul_mm_id_iq3_kt_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<DequantizerRS<half4x4, block_iq3_kt, float, 16, dequantize_iq3_kt>>; template [[host_name("kernel_mul_mm_id_iq4_kt_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<DequantizerKT4<half4x4, 16>>; @@ -9951,6 +10133,7 @@ template [[host_name("kernel_mul_mv_id_iq5_ks_f32")]] kernel kernel_mul_mv_id_t template [[host_name("kernel_mul_mv_id_iq4_kss_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_kss_f32_impl>>; template [[host_name("kernel_mul_mv_id_iq2_k_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_k_f32_impl>>; template [[host_name("kernel_mul_mv_id_iq2_ks_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_ks_f32_impl>>; +template [[host_name("kernel_mul_mv_id_iq2_kl_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_kl_f32_impl>>; template [[host_name("kernel_mul_mv_id_iq2_kt_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_kt_f32_impl>>; template [[host_name("kernel_mul_mv_id_iq3_kt_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq3_kt_f32_impl>>; template [[host_name("kernel_mul_mv_id_iq4_kt_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_kt_f32_impl>>; diff --git 
a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 96242727..e18cee73 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -15426,6 +15426,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte case GGML_TYPE_IQ4_KT: break; case GGML_TYPE_IQ3_K: break; case GGML_TYPE_IQ3_KS: break; + case GGML_TYPE_IQ2_KL: break; case GGML_TYPE_IQ4_K: break; case GGML_TYPE_IQ5_K: break; case GGML_TYPE_IQ6_K: break; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 2e6983df..dbb080f8 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1669,6 +1669,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .nrows = 1, .row_meta_size = 2, }, + [GGML_TYPE_IQ2_KL] = { + .type_name = "iq2_kl", + .blck_size = QK_K, + .type_size = sizeof(block_iq2_kl), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq2_kl, + .from_float = quantize_row_iq2_kl, + .from_float_ref = (ggml_from_float_t)quantize_row_iq2_kl_ref, + .vec_dot = vec_dot_iq2_kl_q8_k, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + .row_meta_size = 2, + }, [GGML_TYPE_IQ4_K] = { .type_name = "iq4_k", .blck_size = QK_K, @@ -4592,6 +4605,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ4_KT: wtype = GGML_TYPE_IQ4_KT; break; case GGML_FTYPE_MOSTLY_IQ3_K: wtype = GGML_TYPE_IQ3_K; break; case GGML_FTYPE_MOSTLY_IQ3_KS: wtype = GGML_TYPE_IQ3_KS; break; + case GGML_FTYPE_MOSTLY_IQ2_KL: wtype = GGML_TYPE_IQ2_KL; break; case GGML_FTYPE_MOSTLY_IQ4_K: wtype = GGML_TYPE_IQ4_K; break; case GGML_FTYPE_MOSTLY_IQ3_K_R4: wtype = GGML_TYPE_IQ3_K_R4; break; case GGML_FTYPE_MOSTLY_IQ4_K_R4: wtype = GGML_TYPE_IQ4_K_R4; break; @@ -11362,6 +11376,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ4_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ3_KS: + case GGML_TYPE_IQ2_KL: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ3_K_R4: case GGML_TYPE_IQ4_K_R4: @@ -11840,6 +11855,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ4_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ3_KS: + case GGML_TYPE_IQ2_KL: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ3_K_R4: case GGML_TYPE_IQ4_K_R4: @@ -12015,6 +12031,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ4_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ3_KS: + case GGML_TYPE_IQ2_KL: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ3_K_R4: case GGML_TYPE_IQ4_K_R4: @@ -15517,6 +15534,7 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ4_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ3_KS: + case GGML_TYPE_IQ2_KL: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ3_K_R4: case GGML_TYPE_IQ4_K_R4: @@ -15932,6 +15950,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ4_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ3_KS: + case GGML_TYPE_IQ2_KL: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ3_K_R4: case GGML_TYPE_IQ4_K_R4: @@ -16253,6 +16272,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ4_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ3_KS: + case GGML_TYPE_IQ2_KL: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ3_K_R4: case GGML_TYPE_IQ4_K_R4: @@ -16891,6 +16911,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ4_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ3_KS: + case GGML_TYPE_IQ2_KL: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ3_K_R4: case GGML_TYPE_IQ4_K_R4: @@ -23965,6 +23986,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ4_KT: result = quantize_iq4_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ3_K: result = quantize_iq3_k (src + start, (char 
*) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ3_KS: result = quantize_iq3_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ2_KL: result = quantize_iq2_kl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_K: result = quantize_iq4_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ3_K_R4:result = quantize_iq3_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_K_R4:result = quantize_iq4_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; diff --git a/ggml/src/iqk/iqk_gemm_iqk_quants.cpp b/ggml/src/iqk/iqk_gemm_iqk_quants.cpp index 57038d0c..ba2fa235 100644 --- a/ggml/src/iqk/iqk_gemm_iqk_quants.cpp +++ b/ggml/src/iqk/iqk_gemm_iqk_quants.cpp @@ -337,6 +337,97 @@ struct DequantizerIQ4KSS final : public BaseDequantizer<block_iq4_kss, true> { }; }; +struct DequantizerIQ2KL final : public BaseDequantizer<block_iq2_kl, true, true> { + DequantizerIQ2KL(const void * vx, size_t bx) : BaseDequantizer(vx, bx) { load_values(); } + inline __m128i make_scales(int i) const { + uint32_t aux32; std::memcpy(&aux32, x[i].scales_l, 4); + auto scl = _mm_cvtepu8_epi16(_mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(aux32), _mm_set_epi32(0, 0, 4, 0)), _mm_set1_epi8(0xf))); + auto sch = _mm_srlv_epi16(_mm_sllv_epi64(_mm_set1_epi16(x[i].scales_h), _mm_set_epi64x(0, 8)), _mm_set1_epi64x(0x000a000800060004)); + auto scales128 = _mm_sub_epi16(_mm_or_si128(scl, _mm_and_si128(sch, _mm_set1_epi16(0x30))), _mm_set1_epi16(32)); + return scales128; + } + template <typename Q8> + inline void compute_block(int i, const Q8& q8, __m512 * acc) { + auto scales128 = make_scales(i); + auto mins128 = _mm_mullo_epi16(scales128, _mm_set1_epi16(-64)); + auto mins = MM256_SET_M128I(_mm_shuffle_epi8(mins128, s8k.shuffles[1]), _mm_shuffle_epi8(mins128, s8k.shuffles[0])); + auto scales256 = MM256_SET_M128I(scales128, scales128); + auto all_scales = _mm512_inserti32x8(_mm512_castsi256_si512(scales256), scales256, 1); + __m512i scales[4]; + for (int k = 0; k < 4; ++k) scales[k] = _mm512_shuffle_epi8(all_scales, shuffles[k]); + prepare(i); + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + auto q8s = q8.load_bsums(iy, i); + auto prod = _mm256_madd_epi16(mins, q8s); + auto sumi = _mm512_inserti32x8(_mm512_setzero_si512(), prod, 0); + for (int k = 0; k < 4; ++k) { + auto p = _mm512_maddubs_epi16(bits.values[k], q8.load_quants64(iy, i, k)); + sumi = _mm512_dpwssd_epi32(sumi, p, scales[k]); + } + acc[iy] = _mm512_fmadd_ps(_mm512_set1_ps(d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), acc[iy]); + } + } + inline void prepare(int i) { + __m512i ql[2], qs[4]; + __mmask64 mask[2]; + auto lbits = _mm512_loadu_si512((const __m512i *)x[i].qs); + ql[0] = _mm512_and_si512(lbits, m4); + ql[1] = _mm512_and_si512(_mm512_srli_epi16(lbits, 4), m4); + auto tmp = _mm512_permutex2var_epi64(ql[0], permute1, ql[1]); + ql[1] = _mm512_permutex2var_epi64(ql[0], permute2, ql[1]); + ql[0] = tmp; + auto h128 = _mm_loadu_si128((const __m128i *)x[i].qh); + auto h256 = MM256_SET_M128I(_mm_srli_epi16(h128, 1), h128); + auto h512 = _mm512_inserti32x8(_mm512_castsi256_si512(h256), _mm256_srli_epi16(h256, 2), 1); + mask[0] = _mm512_cmpeq_epi8_mask(_mm512_and_si512(h512, m01), m01); + mask[1] = _mm512_cmpeq_epi8_mask(_mm512_and_si512(h512, m10), m10); + + for (int k = 0; k < 2; ++k) { + // qs[0]: 
even quants when hbits is not set (so pair index is in 0...15) + // qs[1]: even quants when hbits is set (so pair index is in 16...31) + // qs[2]: odd quants when hbits is not set (so pair index is in 0...15) + // qs[3]: odd quants when hbits is set (so pair index is in 16...31) + // if we blend qs[0] and qs[1] with the hbit mask, we get the correct even quants -> q1 + // if we blend qs[2] and qs[3] with the hbit mask, we get the correct odd quants -> q2 + // If we convert q1 and q2 to int16_t, shift q2 left by 8 bits, and or them, we get the quants in the correct order + for (int l = 0; l < 4; ++l) qs[l] = _mm512_shuffle_epi8(values[l], ql[k]); + auto q1 = _mm512_mask_blend_epi8(mask[k], qs[0], qs[1]); + auto q2 = _mm512_mask_blend_epi8(mask[k], qs[2], qs[3]); + auto t1 = _mm512_unpacklo_epi8(q1, q2); // 0...15, 32...47, 64...79, 96...111 + auto t2 = _mm512_unpackhi_epi8(q1, q2); // 16...31, 48...63, 80...95, 112...127 + bits.values[2*k+0] = _mm512_permutex2var_epi64(t1, permute1, t2); + bits.values[2*k+1] = _mm512_permutex2var_epi64(t1, permute2, t2); + } + } + void load_values() { + static const uint8_t k_values[64] = { + 1, 1, 24, 24, 24, 24, 41, 41, 41, 41, 41, 54, 54, 54, 54, 65, 65, 65, 65, 65, 77, 77, 77, 77, 77, 92, 92, 92, 92, 92, 111, 111, + 41, 77, 1, 54, 77, 111, 24, 41, 65, 77, 92, 1, 65, 77, 111, 41, 54, 65, 77, 92, 24, 41, 54, 65, 77, 1, 41, 65, 92, 111, 41, 77, + }; + for (int k = 0; k < 4; ++k) { + auto v128 = _mm_loadu_si128((const __m128i *)k_values + k); + auto v256 = MM256_SET_M128I(v128, v128); + values[k] = _mm512_inserti32x8(_mm512_castsi256_si512(v256), v256, 1); + } + } + + struct { __m512i values[4]; } bits; + Scales8KBase s8k; + const __m512i m01 = _mm512_set1_epi8(0x01); + const __m512i m10 = _mm512_set1_epi8(0x10); + const __m512i m4 = _mm512_set1_epi8(0xf); + const __m512i permute1 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0); + const __m512i permute2 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4); + __m512i values[4]; + const __m512i shuffles[4] = { + _mm512_inserti32x8(_mm512_set1_epi16(0x0100), _mm256_set1_epi16(0x0302), 1), + _mm512_inserti32x8(_mm512_set1_epi16(0x0504), _mm256_set1_epi16(0x0706), 1), + _mm512_inserti32x8(_mm512_set1_epi16(0x0908), _mm256_set1_epi16(0x0b0a), 1), + _mm512_inserti32x8(_mm512_set1_epi16(0x0d0c), _mm256_set1_epi16(0x0f0e), 1), + }; +}; + + struct DequantizerIQ4KS final : public BaseDequantizer<block_iq4_ks, true> { DequantizerIQ4KS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_iq4nl_values_512()) {} template <typename Q8> @@ -908,6 +999,68 @@ struct DequantizerIQ2KS final : public BaseDequantizer<block_iq2_ks, true, true> const __m128i shift = _mm_set_epi32(0, 0, 4, 0); }; +struct DequantizerIQ2KL final : public BaseDequantizer<block_iq2_kl, true, true> { + DequantizerIQ2KL(const void * vx, size_t bx) : BaseDequantizer(vx, bx) { load_values(); } + template <typename Q8> + inline __m256i new_block(int i, const Q8& q8, __m256 * accm) { + auto hbits128 = _mm_loadu_si128((const __m128i *)x[i].qh); + hbits = MM256_SET_M128I(_mm_srli_epi16(hbits128, 1), hbits128); + auto scales128 = make_scales(i); + auto scales_s = _mm_mullo_epi16(scales128, _mm_set1_epi16(-64)); + s8k.accum_mins(scales_s, q8, i, d, accm); + return MM256_SET_M128I(scales128, scales128); + } + inline void prepare(int i, int j) { + __m256i ql[2], mask[2]; + auto b1 = _mm_loadu_si128((const __m128i *)x[i].qs+2*j+0); + auto b2 = _mm_loadu_si128((const __m128i *)x[i].qs+2*j+1); + ql[0] = _mm256_and_si256(_mm256_set1_epi8(0xf), 
MM256_SET_M128I(_mm_srli_epi16(b1, 4), b1)); + ql[1] = _mm256_and_si256(_mm256_set1_epi8(0xf), MM256_SET_M128I(_mm_srli_epi16(b2, 4), b2)); + mask[0] = _mm256_cmpeq_epi8(_mm256_and_si256(hbits, _mm256_set1_epi8(0x1)), _mm256_set1_epi8(0x1)); + mask[1] = _mm256_cmpeq_epi8(_mm256_and_si256(hbits, _mm256_set1_epi8(0x4)), _mm256_set1_epi8(0x4)); + for (int k = 0; k < 2; ++k) { + auto v0 = _mm256_shuffle_epi8(values[0], ql[k]); + auto v1 = _mm256_shuffle_epi8(values[1], ql[k]); + auto v2 = _mm256_shuffle_epi8(values[2], ql[k]); + auto v3 = _mm256_shuffle_epi8(values[3], ql[k]); + auto q1 = _mm256_or_si256(_mm256_and_si256(mask[k], v1), _mm256_andnot_si256(mask[k], v0)); + auto q2 = _mm256_or_si256(_mm256_and_si256(mask[k], v3), _mm256_andnot_si256(mask[k], v2)); + auto q1l = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(q1)); + auto q1h = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(q1, 1)); + auto q2l = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(q2)); + auto q2h = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(q2, 1)); + bits.values[2*k+0] = _mm256_or_si256(q1l, _mm256_slli_epi16(q2l, 8)); + bits.values[2*k+1] = _mm256_or_si256(q1h, _mm256_slli_epi16(q2h, 8)); + } + hbits = _mm256_srli_epi16(hbits, 4); + } + inline __m128i make_scales(int i) const { + uint32_t aux32; std::memcpy(&aux32, x[i].scales_l, 4); + auto scl = _mm_cvtepu8_epi16(_mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(aux32), shift), _mm_set1_epi8(0xf))); + auto sch = _mm_srlv_epi32(_mm_set1_epi16(x[i].scales_h), _mm_set_epi32(12, 8, 4, 0)); + sch = _mm_and_si128(sch, _mm_set1_epi32(0x000c0003)); + sch = _mm_mullo_epi16(sch, _mm_set1_epi32(0x00040010)); + auto scales128 = _mm_sub_epi16(_mm_or_si128(scl, sch), _mm_set1_epi16(32)); + return scales128; + } + void load_values() { + static const uint8_t k_values[64] = { + 1, 1, 24, 24, 24, 24, 41, 41, 41, 41, 41, 54, 54, 54, 54, 65, 65, 65, 65, 65, 77, 77, 77, 77, 77, 92, 92, 92, 92, 92, 111, 111, + 41, 77, 1, 54, 77, 111, 24, 41, 65, 77, 92, 1, 65, 77, 111, 41, 54, 65, 77, 92, 24, 41, 54, 65, 77, 1, 41, 65, 92, 111, 41, 77, + }; + for (int k = 0; k < 4; ++k) { + auto v128 = _mm_loadu_si128((const __m128i *)k_values + k); + values[k] = MM256_SET_M128I(v128, v128); + } + } + struct { __m256i values[4]; } bits; + Scales8KBase s8k; + + __m256i values[4]; + __m256i hbits; + const __m128i shift = _mm_set_epi32(0, 0, 4, 0); +}; + struct DequantizerIQ2K final : public BaseDequantizer<block_iq2_k> { DequantizerIQ2K(const void * vx, size_t bx) : BaseDequantizer(vx, bx), iqxk(5, -32), values(load_values()) {} template <typename Q8> @@ -2127,6 +2280,7 @@ static void mul_mat_iq5_ks_r4_q8_k(int n, const void * vx, size_t bx, const Data template <typename Dequantizer> void set_functions(std::array<mul_mat_t, IQK_MAX_NY>& funcs) { #ifdef HAVE_FANCY_SIMD if constexpr (std::is_same_v<Dequantizer, DequantizerIQ2KS> || + std::is_same_v<Dequantizer, DequantizerIQ2KL> || std::is_same_v<Dequantizer, DequantizerIQ3KS> || std::is_same_v<Dequantizer, DequantizerIQ4KS> || std::is_same_v<Dequantizer, DequantizerIQ5KS>) { @@ -2267,6 +2421,94 @@ void iqk_convert_iq2_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in } } +void iqk_convert_iq2_kl_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) { + GGML_ASSERT(n%QK_K == 0); + GGML_ASSERT(nrc_x%8 == 0); + + int nb = n/QK_K; + + const block_iq2_kl * x8[8]; + + block_q8_k_r8 * y = (block_q8_k_r8 *)vy; + + __m256i values[4]; + { + static const int8_t k_values[64] = { + -63, -63, -40, -40, -40, -40, -23, -23, -23, -23, -23, -10, -10, -10, -10, 1, 1, 
1, 1, 1, 13, 13, 13, 13, 13, 28, 28, 28, 28, 28, 47, 47, + -23, 13, -63, -10, 13, 47, -40, -23, 1, 13, 28, -63, 1, 13, 47, -23, -10, 1, 13, 28, -40, -23, -10, 1, 13, -63, -23, 1, 28, 47, -23, 13, + }; + for (int k = 0; k < 4; ++k) { + auto v = _mm_loadu_si128((const __m128i *)k_values + k); + values[k] = MM256_SET_M128I(v, v); + } + } + + ggml_half dh[8]; + float dnew[8]; + uint32_t block[8]; + int16_t ls[16]; + + __m256i xv[8]; + __m256i ql[2]; + __m256i mask[2]; + + uint32_t sl32; + const auto sl8 = (const int8_t *)&sl32; + + for (int ix = 0; ix < nrc_x; ix += 8) { + for (int k = 0; k < 8; ++k) { + const ggml_half * dptr = (const ggml_half *)((const char *)vx + (ix+k)*bx); + dh[k] = dptr[0]; + x8[k] = (const block_iq2_kl *)(dptr + 1); + } + for (int i = 0; i < nb; ++i) { + for (int k = 0; k < 8; ++k) { + uint32_t aux32; + std::memcpy(&aux32, x8[k][i].scales_l, 4); + auto sh = x8[k][i].scales_h; + auto hbits128 = _mm_loadu_si128((const __m128i *)x8[k][i].qh); + auto hbits = MM256_SET_M128I(_mm_srli_epi16(hbits128, 1), hbits128); + //auto sl = _mm_and_si128(_mm_cvtepu8_epi16(_mm_srlv_epi32(_mm_set1_epi32(aux32), _mm_set_epi32(0, 0, 4, 0))), _mm_set1_epi16(0xf)); + for (int i128 = 0; i128 < 2; ++i128) { + sl32 = aux32 & 0x0f0f0f0f; + ls[8*i128+0] = ls[8*i128+1] = (sl8[0] | ((sh << 4) & 0x30)) - 32; + ls[8*i128+2] = ls[8*i128+3] = (sl8[1] | ((sh << 2) & 0x30)) - 32; + ls[8*i128+4] = ls[8*i128+5] = (sl8[2] | ((sh >> 0) & 0x30)) - 32; + ls[8*i128+6] = ls[8*i128+7] = (sl8[3] | ((sh >> 2) & 0x30)) - 32; + aux32 >>= 4; sh >>= 8; + { + auto b1 = _mm_loadu_si128((const __m128i *)x8[k][i].qs+2*i128+0); + auto b2 = _mm_loadu_si128((const __m128i *)x8[k][i].qs+2*i128+1); + ql[0] = _mm256_and_si256(_mm256_set1_epi8(0xf), MM256_SET_M128I(_mm_srli_epi16(b1, 4), b1)); + ql[1] = _mm256_and_si256(_mm256_set1_epi8(0xf), MM256_SET_M128I(_mm_srli_epi16(b2, 4), b2)); + } + mask[0] = _mm256_cmpeq_epi8(_mm256_and_si256(hbits, _mm256_set1_epi8(0x1)), _mm256_set1_epi8(0x1)); + mask[1] = _mm256_cmpeq_epi8(_mm256_and_si256(hbits, _mm256_set1_epi8(0x4)), _mm256_set1_epi8(0x4)); + for (int k = 0; k < 2; ++k) { + auto v0 = _mm256_shuffle_epi8(values[0], ql[k]); + auto v1 = _mm256_shuffle_epi8(values[1], ql[k]); + auto v2 = _mm256_shuffle_epi8(values[2], ql[k]); + auto v3 = _mm256_shuffle_epi8(values[3], ql[k]); + auto q1 = _mm256_or_si256(_mm256_and_si256(mask[k], v1), _mm256_andnot_si256(mask[k], v0)); + auto q2 = _mm256_or_si256(_mm256_and_si256(mask[k], v3), _mm256_andnot_si256(mask[k], v2)); + auto q1l = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(q1)); + auto q1h = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(q1, 1)); + auto q2l = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(q2)); + auto q2h = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(q2, 1)); + xv[4*i128+2*k+0] = _mm256_or_si256(q1l, _mm256_slli_epi16(q2l, 8)); + xv[4*i128+2*k+1] = _mm256_or_si256(q1h, _mm256_slli_epi16(q2h, 8)); + } + hbits = _mm256_srli_epi16(hbits, 4); + } + dnew[k] = convert_to_q8_k_r8(k, 1.f/125, xv, ls, block, y[i].qs); + } + auto vd = _mm256_mul_ps(_mm256_loadu_ps(dnew), _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)dh))); + _mm_storeu_si128((__m128i *)y[i].d, _mm256_cvtps_ph(vd, _MM_ROUND_NEAREST)); + } + y += nb; + } +} + void iqk_convert_iq2_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) { GGML_ASSERT(n%QK_K == 0); GGML_ASSERT(nrc_x%8 == 0); @@ -2887,6 +3129,7 @@ bool iqk_convert_iqk_quants_q80_r8(int type, int n, const void * vx, size_t bx, switch (ggml_type(type)) { case GGML_TYPE_IQ2_KS : 
iqk_convert_iq2_ks_q8_k_r8(n, vx, bx, vy, nrc_x); break; case GGML_TYPE_IQ2_K : iqk_convert_iq2_k_q8_k_r8 (n, vx, bx, vy, nrc_x); break; + case GGML_TYPE_IQ2_KL : iqk_convert_iq2_kl_q8_k_r8(n, vx, bx, vy, nrc_x); break; case GGML_TYPE_IQ3_KS : iqk_convert_iq3_ks_q8_k_r8(n, vx, bx, vy, nrc_x); break; case GGML_TYPE_IQ3_K : iqk_convert_iq3_k_q8_k_r8 (n, vx, bx, vy, nrc_x); break; case GGML_TYPE_IQ4_KS : iqk_convert_iq4_ks_q8_k_r8(n, vx, bx, vy, nrc_x); break; @@ -2916,6 +3159,12 @@ bool iqk_set_kernels_iqk_quants(int ne00, int typeA, int typeB, std::array<mul_m case GGML_TYPE_IQ2_K: set_functions<DequantizerIQ2K>(kernels); break; + case GGML_TYPE_IQ2_KL: + set_functions<DequantizerIQ2KL>(kernels); +#ifdef HAVE_FANCY_SIMD + func16 = mul_mat_iqX_k_q8_K_AVX512_new<DequantizerIQ2KL, 16>; +#endif + break; case GGML_TYPE_IQ3_KS: set_functions<DequantizerIQ3KS>(kernels); break; @@ -3469,6 +3718,98 @@ struct DequantizerIQ2KS final : public BaseDequantizer<block_iq2_ks, true, true> }; +struct DequantizerIQ2KL final : public BaseDequantizer<block_iq2_kl, true, true> { + DequantizerIQ2KL(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc), shuff(load_shuffle()), shifts(load_shift()) { load_values(values); } + + constexpr static int num_blocks() { return 8; } + constexpr static bool should_scale_quants() { return false; } + + template <typename Q8> + inline int32x4x2_t new_block(int i, [[maybe_unused]] const Q8& q8, [[maybe_unused]] float32x4_t * acc) { + uint32_t aux32; std::memcpy(&aux32, x[i].scales_l, 4); + auto scl = vand_u8(vdup_n_u8(0xf), vreinterpret_u8_u32(uint32x2_t{aux32, aux32 >> 4})); + auto sch = vandq_u16(vshlq_u16(vdupq_n_u16(x[i].scales_h), shifts), vdupq_n_u16(0x30)); + auto scales16 = vsubq_s16(vreinterpretq_s16_u16(vorrq_u16(sch, vmovl_u8(scl))), vdupq_n_s16(32)); + int32x4x2_t scales = {vmovl_s16(vget_low_s16(scales16)), vmovl_s16(vget_high_s16(scales16))}; + return scales; + } + inline void process_pair(uint8x16_t x, uint8x16_t * val) const { + uint8x16x2_t aux{ vqtbl2q_s8(values[0], x), vqtbl2q_s8(values[1], x) }; + val[0] = vqtbl2q_u8(aux, shuff.val[0]); + val[1] = vqtbl2q_u8(aux, shuff.val[1]); + } + inline void prepare(int i, int j) { + hbits = j == 0 ? vld1q_u8(x[i].qh) : vshrq_n_u8(hbits, 4); + auto lbits = vld1q_u8_x2(x[i].qs+32*j); + + uint8x16x4_t aux; + aux.val[0] = vorrq_u8(vandq_u8(m10, vshlq_n_u8(hbits, 4)), vandq_u8(lbits.val[0], vdupq_n_u8(0xf))); + aux.val[1] = vorrq_u8(vandq_u8(m10, vshlq_n_u8(hbits, 3)), vshrq_n_u8(lbits.val[0], 4)); + aux.val[2] = vorrq_u8(vandq_u8(m10, vshlq_n_u8(hbits, 2)), vandq_u8(lbits.val[1], vdupq_n_u8(0xf))); + aux.val[3] = vorrq_u8(vandq_u8(m10, vshlq_n_u8(hbits, 1)), vshrq_n_u8(lbits.val[1], 4)); + + process_pair(aux.val[0], bits.b1.val+0); + process_pair(aux.val[1], bits.b1.val+2); + process_pair(aux.val[2], bits.b2.val+0); + process_pair(aux.val[3], bits.b2.val+2); + + // The compiler crashes the moment I try to use vzip2q_u8!!! 
+ //bits.b1.val[0] = vandq_u8(lbits.val[0], vdupq_n_u8(0xf)); + //bits.b1.val[2] = vshrq_n_u8(lbits.val[0], 4); + //bits.b2.val[0] = vandq_u8(lbits.val[1], vdupq_n_u8(0xf)); + //bits.b2.val[2] = vshrq_n_u8(lbits.val[1], 4); + //bits.b1.val[0] = vorrq_u8(bits.b1.val[0], vandq_u8(m10, vshlq_n_u8(hbits, 4))); + //bits.b1.val[2] = vorrq_u8(bits.b1.val[2], vandq_u8(m10, vshlq_n_u8(hbits, 3))); + //bits.b2.val[0] = vorrq_u8(bits.b2.val[0], vandq_u8(m10, vshlq_n_u8(hbits, 2))); + //bits.b2.val[2] = vorrq_u8(bits.b2.val[2], vandq_u8(m10, vshlq_n_u8(hbits, 1))); + + //auto t1 = vqtbl2q_s8(values[0], bits.b1.val[0]); + //auto t2 = vqtbl2q_s8(values[1], bits.b1.val[0]); + //bits.b1.val[0] = vzip1q_s8(t1, t2); + ////bits.b1.val[1] = vzip2q_u8(t1, t2); + //t1 = vqtbl2q_s8(values[0], bits.b1.val[2]); + //t2 = vqtbl2q_s8(values[1], bits.b1.val[2]); + //bits.b1.val[2] = vzip1q_s8(t1, t2); + ////bits.b1.val[3] = vzip2q_s8(t1, t2); + + //t1 = vqtbl2q_s8(values[0], bits.b2.val[0]); + //t2 = vqtbl2q_s8(values[1], bits.b2.val[0]); + //bits.b2.val[0] = vzip1q_s8(t1, t2); + ////bits.b2.val[1] = vzip2q_s8(t1, t2); + //t1 = vqtbl2q_s8(values[0], bits.b2.val[2]); + //t2 = vqtbl2q_s8(values[1], bits.b2.val[2]); + //bits.b2.val[2] = vzip1q_s8(t1, t2); + ////bits.b2.val[3] = vzip2q_s8(t1, t2); + } + static inline int16x8_t load_shift() { + static const int16_t k_shift[8] = {4, 2, 0, -2, -4, -6, -8, -10}; + return vld1q_s16(k_shift); + } + static inline void load_values(int8x16x2_t * values) { + static const int8_t k_values[64] = { + -63, -63, -40, -40, -40, -40, -23, -23, -23, -23, -23, -10, -10, -10, -10, 1, 1, 1, 1, 1, 13, 13, 13, 13, 13, 28, 28, 28, 28, 28, 47, 47, + -23, 13, -63, -10, 13, 47, -40, -23, 1, 13, 28, -63, 1, 13, 47, -23, -10, 1, 13, 28, -40, -23, -10, 1, 13, -63, -23, 1, 28, 47, -23, 13, + }; + values[0] = vld1q_s8_x2(k_values+ 0); + values[1] = vld1q_s8_x2(k_values+32); + } + static uint8x16x2_t load_shuffle() { + static const uint8_t k_shuff[32] = { + 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 + }; + return vld1q_u8_x2(k_shuff); + } + + struct { uint8x16x4_t b1, b2; } bits; + uint8x16_t hbits; + const uint8x16x2_t shuff; + const int16x8_t shifts; + const uint8x16_t m10 = vdupq_n_u8(0x10); + int8x16x2_t values[2]; + +}; + template <int nrc_y> void mul_mat_iq4_ks_r4_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { GGML_ASSERT(nrc_x%4 == 0); @@ -4241,6 +4582,142 @@ void iqk_convert_iq2_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in } } +//struct DequantizerIQ2KL final : public BaseDequantizer<block_iq2_kl, true, true> { +// DequantizerIQ2KL(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc), shuff(load_shuffle()), shifts(load_shift()) { load_values(values); } +// +// constexpr static int num_blocks() { return 8; } +// constexpr static bool should_scale_quants() { return false; } +// +// template <typename Q8> +// inline int32x4x2_t new_block(int i, [[maybe_unused]] const Q8& q8, [[maybe_unused]] float32x4_t * acc) { +// uint32_t aux32; std::memcpy(&aux32, x[i].scales_l, 4); +// auto scl = vand_u8(vdup_n_u8(0xf), vreinterpret_u8_u32(uint32x2_t{aux32, aux32 >> 4})); +// auto sch = vandq_u16(vshlq_u16(vdupq_n_u16(x[i].scales_h), shifts), vdupq_n_u16(0x30)); +// auto scales16 = vsubq_s16(vreinterpretq_s16_u16(vorrq_u16(sch, vmovl_u8(scl))), vdupq_n_s16(32)); +// int32x4x2_t scales = {vmovl_s16(vget_low_s16(scales16)), vmovl_s16(vget_high_s16(scales16))}; +// return scales; +// 
} +// inline void prepare(int i, int j) { +// hbits = j == 0 ? vld1q_u8(x[i].qh) : vshrq_n_u8(hbits, 4); +// auto lbits = vld1q_u8_x2(x[i].qs+32*j); +// +// uint8x16x4_t aux; +// aux.val[0] = vorrq_u8(vandq_u8(m10, vshlq_n_u8(hbits, 4)), vandq_u8(lbits.val[0], vdupq_n_u8(0xf))); +// aux.val[1] = vorrq_u8(vandq_u8(m10, vshlq_n_u8(hbits, 3)), vshrq_n_u8(lbits.val[0], 4)); +// aux.val[2] = vorrq_u8(vandq_u8(m10, vshlq_n_u8(hbits, 2)), vandq_u8(lbits.val[1], vdupq_n_u8(0xf))); +// aux.val[3] = vorrq_u8(vandq_u8(m10, vshlq_n_u8(hbits, 1)), vshrq_n_u8(lbits.val[1], 4)); +// +// process_pair(aux.val[0], bits.b1.val+0); +// process_pair(aux.val[1], bits.b1.val+2); +// process_pair(aux.val[2], bits.b2.val+0); +// process_pair(aux.val[3], bits.b2.val+2); +// +// } +// static inline int16x8_t load_shift() { +// } +// static inline void load_values(int8x16x2_t * values) { +// } +// static uint8x16x2_t load_shuffle() { +// return vld1q_u8_x2(k_shuff); +// } +// +// struct { uint8x16x4_t b1, b2; } bits; +// uint8x16_t hbits; +// const uint8x16x2_t shuff; +// const int16x8_t shifts; +// const uint8x16_t m10 = vdupq_n_u8(0x10); +// int8x16x2_t values[2]; +// +//}; + +void iqk_convert_iq2_kl_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) { + GGML_ASSERT(n%QK_K == 0); + GGML_ASSERT(nrc_x%8 == 0); + + int nb = n/QK_K; + + const block_iq2_kl * x8[8]; + + block_q8_k_r8 * y = (block_q8_k_r8 *)vy; + + ggml_half dh[8]; + float dnew[8]; + uint32_t block[8]; + int8_t ls[16]; + + int8x16x2_t xv[8]; + + const uint8x16_t m10 = vdupq_n_u8(0x10); + static const uint8_t k_shuff[32] = { + 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 + }; + auto shuff = vld1q_u8_x2(k_shuff); + + int8x16x2_t values[2]; + static const int8_t k_values[64] = { + -63, -63, -40, -40, -40, -40, -23, -23, -23, -23, -23, -10, -10, -10, -10, 1, 1, 1, 1, 1, 13, 13, 13, 13, 13, 28, 28, 28, 28, 28, 47, 47, + -23, 13, -63, -10, 13, 47, -40, -23, 1, 13, 28, -63, 1, 13, 47, -23, -10, 1, 13, 28, -40, -23, -10, 1, 13, -63, -23, 1, 28, 47, -23, 13, + }; + values[0] = vld1q_s8_x2(k_values+ 0); + values[1] = vld1q_s8_x2(k_values+32); + + auto process_pair = [&values, &shuff] (uint8x16_t x, int8x16_t * val) { + int8x16x2_t aux{ vqtbl2q_s8(values[0], x), vqtbl2q_s8(values[1], x) }; + val[0] = vqtbl2q_s8(aux, shuff.val[0]); + val[1] = vqtbl2q_s8(aux, shuff.val[1]); + }; + + uint32_t sl32; + auto s8 = (const int8_t *)&sl32; + + for (int ix = 0; ix < nrc_x; ix += 8) { + for (int k = 0; k < 8; ++k) { + const ggml_half * dptr = (const ggml_half *)((const char *)vx + (ix+k)*bx); + dh[k] = dptr[0]; + x8[k] = (const block_iq2_kl *)(dptr + 1); + } + float32x4x2_t vd{vcvt_f32_f16(vld1_f16((const float16_t *)dh+0)), vcvt_f32_f16(vld1_f16((const float16_t *)dh+4))}; + for (int i = 0; i < nb; ++i) { + for (int k = 0; k < 8; ++k) { + uint32_t aux32; std::memcpy(&aux32, x8[k][i].scales_l, 4); + auto sh = x8[k][i].scales_h; + auto hbits = vld1q_u8(x8[k][i].qh); + for (int i128 = 0; i128 < 2; ++i128) { + + sl32 = aux32 & 0x0f0f0f0f; + ls[8*i128+0] = ls[8*i128+1] = (s8[0] | ((sh << 4) & 0x30)) - 32; + ls[8*i128+2] = ls[8*i128+3] = (s8[1] | ((sh << 2) & 0x30)) - 32; + ls[8*i128+4] = ls[8*i128+5] = (s8[2] | ((sh >> 0) & 0x30)) - 32; + ls[8*i128+6] = ls[8*i128+7] = (s8[3] | ((sh >> 2) & 0x30)) - 32; + sh >>= 8; aux32 >>= 4; + + auto lbits = vld1q_u8_x2(x8[k][i].qs+32*i128); + + uint8x16x4_t aux; + aux.val[0] = vorrq_u8(vandq_u8(m10, vshlq_n_u8(hbits, 4)), vandq_u8(lbits.val[0], 
vdupq_n_u8(0xf))); + aux.val[1] = vorrq_u8(vandq_u8(m10, vshlq_n_u8(hbits, 3)), vshrq_n_u8(lbits.val[0], 4)); + aux.val[2] = vorrq_u8(vandq_u8(m10, vshlq_n_u8(hbits, 2)), vandq_u8(lbits.val[1], vdupq_n_u8(0xf))); + aux.val[3] = vorrq_u8(vandq_u8(m10, vshlq_n_u8(hbits, 1)), vshrq_n_u8(lbits.val[1], 4)); + hbits = vshrq_n_u8(hbits, 4); + + process_pair(aux.val[0], xv[4*i128+0].val); + process_pair(aux.val[1], xv[4*i128+1].val); + process_pair(aux.val[2], xv[4*i128+2].val); + process_pair(aux.val[3], xv[4*i128+3].val); + } + dnew[k] = convert_to_q8_k_r8(1.f/125, xv, ls, block, (uint32_t *)y[i].qs + k); + } + auto d = vld1q_f32_x2(dnew); + d.val[0] = vmulq_f32(d.val[0], vd.val[0]); + d.val[1] = vmulq_f32(d.val[1], vd.val[1]); + vst1_f16((float16_t *)y[i].d + 0,vcvt_f16_f32(d.val[0])); + vst1_f16((float16_t *)y[i].d + 4,vcvt_f16_f32(d.val[1])); + } + y += nb; + } +} + void iqk_convert_iq4_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) { GGML_ASSERT(n%QK_K == 0); GGML_ASSERT(nrc_x%8 == 0); @@ -4683,6 +5160,7 @@ bool iqk_convert_iqk_quants_q80_r8(int type, int n, const void * vx, size_t bx, switch (ggml_type(type)) { case GGML_TYPE_IQ2_KS : iqk_convert_iq2_ks_q8_k_r8(n, vx, bx, vy, nrc_x); break; case GGML_TYPE_IQ2_K : iqk_convert_iq2_k_q8_k_r8 (n, vx, bx, vy, nrc_x); break; + case GGML_TYPE_IQ2_KL : iqk_convert_iq2_kl_q8_k_r8(n, vx, bx, vy, nrc_x); break; case GGML_TYPE_IQ3_KS : iqk_convert_iq3_ks_q8_k_r8(n, vx, bx, vy, nrc_x); break; case GGML_TYPE_IQ3_K : iqk_convert_iq3_k_q8_k_r8 (n, vx, bx, vy, nrc_x); break; case GGML_TYPE_IQ4_KS : iqk_convert_iq4_ks_q8_k_r8(n, vx, bx, vy, nrc_x); break; @@ -4710,6 +5188,9 @@ bool iqk_set_kernels_iqk_quants(int ne00, int typeA, int typeB, std::array<mul_m case GGML_TYPE_IQ2_K: IQK_SET_MUL_MAT_FUNCTIONS_T(mul_mat_qX_K_q8_K_T, DequantizerIQ2K, kernels); break; + case GGML_TYPE_IQ2_KL: + IQK_SET_MUL_MAT_FUNCTIONS_T(mul_mat_qX_K_q8_K_T, DequantizerIQ2KL, kernels); + break; case GGML_TYPE_IQ3_KS: IQK_SET_MUL_MAT_FUNCTIONS_T(mul_mat_qX_K_q8_K_T, DequantizerIQ3KS, kernels); break; diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp index 0054f6cb..fb951e70 100644 --- a/ggml/src/iqk/iqk_mul_mat.cpp +++ b/ggml/src/iqk/iqk_mul_mat.cpp @@ -251,6 +251,7 @@ struct MulMat { case GGML_TYPE_Q6_K : return nrc_y >= 64 ? GGML_TYPE_Q8_0_R8 : type; case GGML_TYPE_IQ2_KS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type; case GGML_TYPE_IQ2_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type; + case GGML_TYPE_IQ2_KL : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type; case GGML_TYPE_IQ3_KS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type; case GGML_TYPE_IQ3_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type; case GGML_TYPE_IQ4_KS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type; @@ -296,6 +297,7 @@ struct MulMat { case GGML_TYPE_IQ3_KT : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type; case GGML_TYPE_IQ4_KT : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type; case GGML_TYPE_IQ2_KS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type; + case GGML_TYPE_IQ2_KL : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type; case GGML_TYPE_IQ3_KS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type; case GGML_TYPE_IQ4_KS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type; case GGML_TYPE_IQ5_KS : return nrc_y >= 32 ? 
GGML_TYPE_Q8_K_R8 : type; @@ -424,6 +426,7 @@ bool iqk_convert_repack(int typeA, int n, const void * vx, size_t bx, void * vy, return iqk_convert_iquants_q80_r8(typeA, n, vx, bx, vy, nrc_x); case GGML_TYPE_IQ2_KS: case GGML_TYPE_IQ2_K: + case GGML_TYPE_IQ2_KL: case GGML_TYPE_IQ3_KS: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_KSS: @@ -827,14 +830,15 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) { case GGML_TYPE_IQ3_XXS_R4: case GGML_TYPE_IQ3_S_R4: return iqk_set_kernels_iquants(ne00, typeA, typeB, mm.funcs, mm.func16); - case GGML_TYPE_IQ3_KS: - case GGML_TYPE_IQ4_KS: - case GGML_TYPE_IQ5_KS: - case GGML_TYPE_IQ4_KSS: - case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ2_KS: + case GGML_TYPE_IQ2_K: + case GGML_TYPE_IQ2_KL: + case GGML_TYPE_IQ3_KS: case GGML_TYPE_IQ3_K: + case GGML_TYPE_IQ4_KSS: + case GGML_TYPE_IQ4_KS: case GGML_TYPE_IQ4_K: + case GGML_TYPE_IQ5_KS: case GGML_TYPE_IQ5_K: case GGML_TYPE_IQ6_K: case GGML_TYPE_IQ2_K_R4: @@ -909,6 +913,7 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& m, int /*Ny*/) { return iqk_set_kernels_kquants(ne00, typeA, typeB, m.funcs, m.func16); case GGML_TYPE_IQ2_KS: case GGML_TYPE_IQ2_K: + case GGML_TYPE_IQ2_KL: case GGML_TYPE_IQ3_KS: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_KSS: diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index 9095cda4..b38cc51f 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -1515,9 +1515,10 @@ void vec_dot_iq2_ks_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx } // -// ============================================== iq3_k +// ======================================== iq2_kl // namespace { + const int8_t iq3nl_index[111] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 10, 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 11, 11, 4, 4, 4, 4, @@ -1531,6 +1532,317 @@ inline int best_index_iq3nl(const int8_t * values, float x) { return ix < 8 ? ix : x - values[ix-8] < values[ix-7] - x ? 
ix-8 : ix-7; } +void quantize_row_iq2_kl_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, float * all_scales) { + constexpr int kBlockSize = 32; + constexpr float kSigmaFactor = 2.25f; + constexpr int ntry = 5; + static const int k_index[64] = {-1, -2, 0, -3, -4, 1, -5, -6, 2, -7, -8, 3, -9, 4, -10, 5, -11, 6, 7, -12, 8, 9, 10, -13, 11, -14, -15, -16, 12, 13, -17, + 14, -18, -19, 15, 16, 17, 18, 19, -20, -21, 20, 21, 22, 23, 24, -22, -23, 25, -24, 26, -25, 27, -26, 28, 29, -27, -28, 30, -29, -30, 31, -31, -32}; + static const std::vector<std::vector<int>> k_neighbours = { + { 2, 0, 6, 11, 7, 3, 8, 15, }, + { 0, 2, 3, 6, 7, 1, 8, 4, }, + { 0, 1, 3, 4, 8, 7, 9, 6, }, + { 1, 0, 3, 4, 8, 9, 7, 10, }, + { 1, 4, 5, 10, 9, 3, 8, 0, }, + { 5, 1, 4, 10, 9, 14, 8, 3, }, + { 6, 2, 7, 0, 3, 11, 8, 15, }, + { 3, 7, 0, 6, 8, 4, 12, 9, }, + { 3, 4, 8, 9, 1, 7, 12, 10, }, + { 4, 10, 5, 9, 1, 8, 13, 14, }, + { 11, 2, 6, 7, 20, 15, 25, 21, }, + { 8, 7, 3, 12, 9, 16, 17, 13, }, + { 14, 5, 10, 19, 9, 13, 4, 18, }, + { 6, 15, 7, 11, 20, 21, 16, 2, }, + { 15, 7, 16, 6, 21, 12, 17, 22, }, + { 12, 16, 17, 8, 15, 7, 13, 22, }, + { 19, 10, 13, 18, 14, 9, 12, 24, }, + { 11, 20, 25, 6, 15, 2, 21, 7, }, + { 20, 15, 21, 6, 11, 7, 16, 26, }, + { 14, 19, 29, 10, 28, 18, 13, 24, }, + { 25, 11, 20, 21, 15, 6, 26, 30, }, + { 19, 24, 28, 18, 29, 23, 13, 17, }, + { 29, 19, 14, 28, 24, 18, 10, 13, }, + { 20, 26, 21, 25, 30, 15, 22, 16, }, + { 27, 26, 22, 23, 21, 30, 16, 24, }, + { 27, 24, 28, 31, 23, 18, 22, 17, }, + { 25, 30, 20, 26, 21, 11, 15, 22, }, + { 30, 26, 25, 20, 21, 27, 22, 15, }, + { 30, 27, 31, 26, 22, 23, 21, 24, }, + { 31, 27, 30, 26, 28, 23, 22, 24, }, + { 31, 28, 29, 27, 24, 23, 19, 18, }, + { 29, 28, 31, 24, 19, 27, 14, 18, }, + }; + auto values = iq3nl_values; + std::pair<int8_t, int8_t> grid[32]; + for (int j = 0; j < 64; ++j) { + if (int i = k_index[j]; i >= 0) { + int i1 = j/8, i2 = j%8; + grid[i] = {values[i1], values[i2]}; + } + } + + ggml_half * dptr = (ggml_half *)vy; + auto y = (block_iq2_kl *)(dptr + 1); + + float weight[kBlockSize]; + + auto index = [&grid, values] (float id, float x1, float x2, float w1, float w2) { + float sx1 = id*x1; + float sx2 = id*x2; + int l1 = best_index_iq3nl(values, sx1); + int l2 = best_index_iq3nl(values, sx2); + int i = k_index[8*l1 + l2]; + if (i >= 0) return i; + auto& neigh = k_neighbours[-i-1]; + float best = std::numeric_limits<float>::max(); + int ibest = -1; + for (auto& n : neigh) { + float diff1 = grid[n].first - sx1; + float diff2 = grid[n].second - sx2; + float score = w1*diff1*diff1 + w2*diff2*diff2; + if (score < best) { + best = score; ibest = n; + } + } + GGML_ASSERT(ibest >= 0); + return ibest; + }; + + float max_scale = 0, max_abs_scale = 0; + + for (int ibl = 0; ibl < n_per_row/QK_K; ++ibl) { + std::memset(&y[ibl], 0, sizeof(block_iq2_kl)); + auto scales = all_scales + ibl*(QK_K/kBlockSize); + auto xbl = x + ibl*QK_K; + float sigma2 = 0; + for (int j = 0; j < QK_K; ++j) sigma2 += xbl[j]*xbl[j]; + sigma2 *= kSigmaFactor/QK_K; + for (int ib = 0; ib < QK_K/kBlockSize; ++ib) { + auto xb = xbl + ib*kBlockSize; + if (quant_weights) { + auto qw = quant_weights + ibl*QK_K + ib*kBlockSize; + for (int j = 0; j < kBlockSize; ++j) weight[j] = qw[j]*sqrt(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < kBlockSize; ++j) weight[j] = std::abs(xb[j]); //xb[j]*xb[j]; + } + float amax = 0, max = 0; + for (int j = 0; j < kBlockSize; ++j) { + float ax = std::abs(xb[j]); + if (ax > amax) { + amax = ax; max = xb[j]; + } + } + if (!amax) { 
+ scales[ib] = 0; + continue; + } + float d = ntry > 0 ? -max/values[0] : max/values[0]; + float id = 1/d; + float sumqx_p = 0, sumq2_p = 0; + float sumqx_m = 0, sumq2_m = 0; + for (int j = 0; j < kBlockSize; j += 2) { + float w1 = weight[j+0]; + float w2 = weight[j+1]; + int idx = index(id, xb[j+0], xb[j+1], w1, w2); + float q1 = grid[idx].first ; + float q2 = grid[idx].second; + sumqx_p += w1*q1*xb[j] + w2*q2*xb[j+1]; + sumq2_p += w1*q1*q1 + w2*q2*q2; + idx = index(-id, xb[j+0], xb[j+1], w1, w2); + q1 = grid[idx].first ; + q2 = grid[idx].second; + sumqx_m += w1*q1*xb[j] + w2*q2*xb[j+1]; + sumq2_m += w1*q1*q1 + w2*q2*q2; + } + d = sumqx_p/sumq2_p; + float best = d*sumqx_p; + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d*sumqx_m; + } + for (int itry = -ntry; itry <= ntry; ++itry) { + id = (itry + values[0])/max; + sumqx_p = sumq2_p = 0; + sumqx_m = sumq2_m = 0; + for (int j = 0; j < kBlockSize; j += 2) { + float w1 = weight[j+0]; + float w2 = weight[j+1]; + int idx = index(id, xb[j+0], xb[j+1], w1, w2); + float q1 = grid[idx].first ; + float q2 = grid[idx].second; + sumqx_p += w1*q1*xb[j] + w2*q2*xb[j+1]; + sumq2_p += w1*q1*q1 + w2*q2*q2; + idx = index(-id, xb[j+0], xb[j+1], w1, w2); + q1 = grid[idx].first ; + q2 = grid[idx].second; + sumqx_m += w1*q1*xb[j] + w2*q2*xb[j+1]; + sumq2_m += w1*q1*q1 + w2*q2*q2; + } + if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) { + d = sumqx_p/sumq2_p; best = d * sumqx_p; + } + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d * sumqx_m; + } + } + scales[ib] = d; + float ad = std::abs(d); + if (ad > max_abs_scale) { + max_abs_scale = ad; max_scale = d; + } + } + } + + if (!max_abs_scale) { + dptr[0] = GGML_FP32_TO_FP16(0.f); + return; + } + + float d = -max_scale/32; + float id = 1/d; + + float sumqx = 0, sumq2 = 0; + for (int ibl = 0; ibl < n_per_row/QK_K; ++ibl) { + auto scales = all_scales + ibl*(QK_K/kBlockSize); + auto xbl = x + ibl*QK_K; + float sigma2 = 0; + for (int j = 0; j < QK_K; ++j) sigma2 += xbl[j]*xbl[j]; + sigma2 *= kSigmaFactor/QK_K; + for (int ib = 0; ib < QK_K/kBlockSize; ++ib) { + auto xb = xbl + ib*kBlockSize; + if (quant_weights) { + auto qw = quant_weights + ibl*QK_K + ib*kBlockSize; + for (int j = 0; j < kBlockSize; ++j) weight[j] = qw[j]*sqrt(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < kBlockSize; ++j) weight[j] = std::abs(xb[j]); //xb[j]*xb[j]; + } + int ls = nearest_int(id*scales[ib]); + ls = std::max(-32, std::min(31, ls)); + int lsmin = std::max(-32, ls-1); + int lsmax = std::min( 31, ls+1); + float best_score = std::numeric_limits<float>::max(); + int best_ls = ls; + for (int ils = lsmin; ils <= lsmax; ++ils) { + float dl = d*ils; + float idl = dl ? 
1/dl : 0.f; + float score = 0; + for (int j = 0; j < kBlockSize/2; ++j) { + float w1 = weight[2*j+0]; + float w2 = weight[2*j+1]; + int idx = index(idl, xb[2*j+0], xb[2*j+1], w1, w2); + float diff1 = dl*grid[idx].first - xb[2*j+0]; + float diff2 = dl*grid[idx].second - xb[2*j+1]; + score += w1*diff1*diff1 + w2*diff2*diff2; + } + if (score < best_score) { + best_score = score; + best_ls = ils; + } + } + ls = best_ls; + int uls = ls + 32; + y[ibl].scales_l[ib%4] |= ((uls & 0xf) << 4*(ib/4)); + y[ibl].scales_h |= ((uls >> 4) << 2*ib); + if (ls == 0) continue; + float dl = d*ls; + float idl = 1/dl; + for (int j = 0; j < kBlockSize/2; ++j) { + float w1 = weight[2*j+0]; + float w2 = weight[2*j+1]; + int idx = index(idl, xb[2*j+0], xb[2*j+1], w1, w2); + y[ibl].qs[16*(ib/2) + j] |= ((idx & 0xf) << 4*(ib%2)); + y[ibl].qh[j] |= ((idx >> 4) << ib); + float q1 = ls*grid[idx].first ; + float q2 = ls*grid[idx].second; + sumqx += w1*q1*xb[2*j] + w2*q2*xb[2*j+1]; + sumq2 += w1*q1*q1 + w2*q2*q2; + } + } + } + if (sumq2 > 0) d = sumqx/sumq2; + + dptr[0] = GGML_FP32_TO_FP16(1.025f * d); + +} +} + +void quantize_row_iq2_kl_ref(const float * x, block_iq2_kl * y, int64_t k) { + assert(k % QK_K == 0); + quantize_iq2_kl(x, (void *)y, 1, k, nullptr); +} + +void quantize_row_iq2_kl(const float * x, void * vy, int64_t k) { + assert(k % QK_K == 0); + block_iq2_kl * y = (block_iq2_kl *)vy; + quantize_row_iq2_kl_ref(x, y, k); +} + +size_t quantize_iq2_kl(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { + constexpr int kBlockSize = 32; + GGML_ASSERT(n_per_row%QK_K == 0); + auto row_size = ggml_row_size(GGML_TYPE_IQ2_KL, n_per_row); + int nblock = n_per_row/QK_K; + std::vector<float> all_scales(nblock*(QK_K/kBlockSize)); + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrows; ++row) { + quantize_row_iq2_kl_impl(src, (void *)qrow, n_per_row, imatrix, all_scales.data()); + src += n_per_row; + qrow += row_size; + } + return nrows * row_size; +} + +void dequantize_row_iq2_kl(const block_iq2_kl * x, float * y, int64_t k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + const ggml_half * dptr = (const ggml_half *)x; + const float d = GGML_FP16_TO_FP32(*dptr); + x = (const block_iq2_kl *)(dptr + 1); + + for (int i = 0; i < nb; i++) { + + auto qs = x[i].qs; + auto qh = x[i].qh; + auto scales_h = x[i].scales_h; + + for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { + float dl1 = d * (int(((x[i].scales_l[(2*ib64+0)%4] >> 4*(ib64/2)) & 0xf) | (((scales_h >> (4*ib64+0)) & 3) << 4)) - 32); + float dl2 = d * (int(((x[i].scales_l[(2*ib64+1)%4] >> 4*(ib64/2)) & 0xf) | (((scales_h >> (4*ib64+2)) & 3) << 4)) - 32); + for (int j = 0; j < 16; ++j) { + const int8_t * val1 = (const int8_t *)(iq2kl_values + ((qs[j] & 0xf) | (((qh[j] >> (2*ib64+0)) & 1) << 4))); + const int8_t * val2 = (const int8_t *)(iq2kl_values + ((qs[j] >> 4) | (((qh[j] >> (2*ib64+1)) & 1) << 4))); + y[2*j+ 0] = dl1 * val1[0]; + y[2*j+ 1] = dl1 * val1[1]; + y[2*j+32] = dl2 * val2[0]; + y[2*j+33] = dl2 * val2[1]; + } + y += 64; + qs += 16; + } + + } +} + +void vec_dot_iq2_kl_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + GGML_UNUSED(nrc); + GGML_UNUSED(bx); + GGML_UNUSED(by); + GGML_UNUSED(bs); + +#if GGML_USE_IQK_MULMAT + if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ2_KL, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { + return; + } +#endif +} + +// +// ============================================== iq3_k +// +namespace { + static 
void quantize_row_iq3_k_impl(const float * x, void * vy, int n_per_row, const float * quant_weights) { constexpr int ntry = 3; diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h index 3fc02a5e..75fa9b4e 100644 --- a/ggml/src/iqk/iqk_quantize.h +++ b/ggml/src/iqk/iqk_quantize.h @@ -73,6 +73,12 @@ size_t quantize_iq2_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst void dequantize_row_iq2_ks(const block_iq2_ks * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq2_ks_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void quantize_row_iq2_kl_ref(const float * GGML_RESTRICT x, block_iq2_kl * GGML_RESTRICT y, int64_t k); +void quantize_row_iq2_kl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +size_t quantize_iq2_kl(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +void dequantize_row_iq2_kl(const block_iq2_kl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void vec_dot_iq2_kl_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); + void quantize_row_iq2_kt_ref(const float * GGML_RESTRICT x, block_iq2_kt * GGML_RESTRICT y, int64_t k); void quantize_row_iq2_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); size_t quantize_iq2_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); |
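For reference, the packing that dequantize_row_iq2_kl and quantize_row_iq2_kl_impl work with can be stated in a few scalar lines: every two consecutive weights share one 5-bit pair index (the low 4 bits come from a qs nibble, the 5th bit from qh) that selects one of 32 value pairs, and each 32-weight block carries a 6-bit scale split between scales_l (low 4 bits) and scales_h (top 2 bits), stored with a +32 offset. The sketch below is illustrative only; kPairValues, decode_scale and decode_pair are names introduced here (the table is the same 2x32 signed table used by the NEON and convert-to-q8_k_r8 paths above: the first 32 entries give the first weight of each pair, the last 32 the second weight).

#include <cstdint>
#include <utility>

static const int8_t kPairValues[64] = {
    -63, -63, -40, -40, -40, -40, -23, -23, -23, -23, -23, -10, -10, -10, -10,   1,
      1,   1,   1,   1,  13,  13,  13,  13,  13,  28,  28,  28,  28,  28,  47,  47,
    -23,  13, -63, -10,  13,  47, -40, -23,   1,  13,  28, -63,   1,  13,  47, -23,
    -10,   1,  13,  28, -40, -23, -10,   1,  13, -63, -23,   1,  28,  47, -23,  13,
};

// 6-bit scale of 32-weight block ib (0..7) within a 256-weight superblock:
// low 4 bits packed two-per-byte in scales_l, top 2 bits in scales_h.
static inline int decode_scale(const uint8_t * scales_l, uint16_t scales_h, int ib) {
    int lo = (scales_l[ib % 4] >> (4 * (ib / 4))) & 0xf;
    int hi = (scales_h >> (2 * ib)) & 3;
    return (lo | (hi << 4)) - 32;
}

// Decode one pair of weights from its qs nibble and qh bit; dl is the block
// scale from decode_scale() multiplied by the per-row fp16 scale.
static inline std::pair<float, float> decode_pair(uint8_t qs_nibble, uint8_t h_bit, float dl) {
    int idx = (qs_nibble & 0xf) | ((h_bit & 1) << 4);   // 5-bit pair index, 0..31
    return { dl * kPairValues[idx], dl * kPairValues[32 + idx] };
}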
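The AVX2/AVX-512 dequantizers above use the unsigned variant of this table (every entry offset by +64) so the decoded weights can serve as the unsigned operand of _mm256_maddubs_epi16 / _mm512_maddubs_epi16; the offset is removed again via the activation block sums, which is why compute_block builds its mins as the block scales times -64 and dots them with q8.load_bsums. Below is a small self-contained check of that identity; all concrete numbers are made up for illustration and are not taken from the sources.

#include <cstdio>

int main() {
    const int q[4] = {-63, 13, 28, -10};   // signed codebook values (illustrative)
    const int y[4] = {  7, -3,  5,   2};   // activation quants (illustrative)
    const int s = 5;                       // block scale (illustrative)
    int direct = 0, shifted = 0, ysum = 0;
    for (int j = 0; j < 4; ++j) {
        direct  += s *  q[j]       * y[j];   // the dot product we actually want
        shifted += s * (q[j] + 64) * y[j];   // what maddubs computes on the +64 table
        ysum    += y[j];                     // block sum of the activation quants
    }
    // subtracting (64 * s) * ysum -- the "mins" correction -- recovers the exact result
    std::printf("%d == %d\n", direct, shifted - 64 * s * ysum);
    return 0;
}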