Diffstat (limited to 'ggml')
-rw-r--r--  ggml/include/ggml.h            |   3
-rw-r--r--  ggml/src/ggml-quants.c         |   1
-rw-r--r--  ggml/src/ggml-quants.h         |   7
-rw-r--r--  ggml/src/ggml.c                |  39
-rw-r--r--  ggml/src/iqk/iqk_mul_mat.cpp   | 421
-rw-r--r--  ggml/src/iqk/iqk_quantize.cpp  | 272
-rw-r--r--  ggml/src/iqk/iqk_quantize.h    |  16
7 files changed, 727 insertions, 32 deletions
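
For orientation, here are the two row layouts this change introduces, written out as structs. This is a sketch only: the struct and field names are illustrative (the actual code works on raw byte pointers), but the sizes follow from the type_traits entries and quantize_row_q8_K16 in the diff below.

    /* One group of four interleaved IQ2_BN_R4 rows, n_per_row columns each. */
    struct iq2_bn_r4_row_group {
        float   d[4];   /* per-row scale, one float for each of the 4 packed rows        */
        uint8_t qs[];   /* n_per_row bytes per group: each byte holds four 2-bit values, */
                        /* interleaved in 64-column blocks (see repack_iq2_bn)           */
    };

    /* One Q8_K16 activation row (the vec_dot type used by IQ2_BN_R4). */
    struct q8_K16_row {
        float  d[4];    /* one scale per 16-column slice of every 64-column block        */
        float  row_sum; /* sum of the unquantized activations in the row                 */
        int8_t qs[];    /* quantized activations, one byte per value                     */
    };                  /* hence row_meta_size = 5*sizeof(float) = 20 in ggml.c          */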
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 09f92eb9..962e9688 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -406,6 +406,7 @@ extern "C" {
GGML_TYPE_IQ4_KS = 144,
GGML_TYPE_IQ2_KS = 145,
GGML_TYPE_IQ4_KSS = 146,
+ GGML_TYPE_Q8_K16 = 147,
GGML_TYPE_Q4_0_R4 = 202,
GGML_TYPE_Q5_0_R4 = 206,
@@ -413,6 +414,7 @@ extern "C" {
GGML_TYPE_IQ4_NL_X4 = 220, // TODO: rename GGML_TYPE_IQ4_NL_X4 to GGML_TYPE_IQ4_NL_R4
GGML_TYPE_IQ4_XS_R4 = 223,
GGML_TYPE_Q6_0_R4 = 233,
+ GGML_TYPE_IQ2_BN_R4 = 335,
GGML_TYPE_COUNT,
};
@@ -478,6 +480,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ4_NL_X4 = 219, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ4_XS_R4 = 222, // except 1d tensors
GGML_FTYPE_MOSTLY_Q6_0_R4 = 227, // except 1d tensors
+ GGML_FTYPE_MOSTLY_IQ2_BN_R4 = 329, // except 1d tensors
};
// available tensor operations:
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 4fdd2c36..a9511f00 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -15218,6 +15218,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
case GGML_TYPE_I64:
case GGML_TYPE_IQ1_BN:
case GGML_TYPE_IQ2_BN:
+ case GGML_TYPE_IQ2_BN_R4:
// nothing to validate
break;
default:
diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
index a40a6d37..b6d69011 100644
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -33,7 +33,6 @@ void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_REST
void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_K64_ref(const float * GGML_RESTRICT x, block_q8_K64 * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_xxs_ref(const float * GGML_RESTRICT x, block_iq2_xxs * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_xs_ref (const float * GGML_RESTRICT x, block_iq2_xs * GGML_RESTRICT y, int64_t k);
@@ -43,7 +42,6 @@ void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGM
void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
void quantize_row_iq1_bn_ref (const float * GGML_RESTRICT x, block_iq1_bn * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq2_bn_ref (const float * GGML_RESTRICT x, block_iq2_bn * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -59,7 +57,6 @@ void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_K64(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -69,7 +66,6 @@ void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq1_bn (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq2_bn (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
// Dequantization
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
@@ -97,7 +93,6 @@ void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq1_bn (const block_iq1_bn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq2_bn (const block_iq2_bn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -123,7 +118,6 @@ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_bn_q8_K64(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq2_bn_q8_K64(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
@@ -136,7 +130,6 @@ size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT ds
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq1_bn (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq2_bn (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index f4320e99..12afec05 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1026,11 +1026,24 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.to_float = (ggml_to_float_t) dequantize_row_iq2_bn,
.from_float = quantize_row_iq2_bn,
.from_float_ref = (ggml_from_float_t)quantize_row_iq2_bn_ref,
- .vec_dot = ggml_vec_dot_iq2_bn_q8_K64,
+ .vec_dot = vec_dot_iq2_bn_q8_K64,
.vec_dot_type = GGML_TYPE_Q8_K64,
.nrows = 1,
.row_meta_size = 4,
},
+ [GGML_TYPE_IQ2_BN_R4] = {
+ .type_name = "iq2_bn_r4",
+ .blck_size = QK_IQ1BN,
+ .type_size = sizeof(block_iq2_bn),
+ .is_quantized = true,
+ .to_float = (ggml_to_float_t) dequantize_row_iq2_bn_r4,
+ .from_float = quantize_row_iq2_bn_r4,
+ .from_float_ref = (ggml_from_float_t)quantize_row_iq2_bn_r4_ref,
+ .vec_dot = vec_dot_iq2_bn_r4_q8_K64,
+ .vec_dot_type = GGML_TYPE_Q8_K16,
+ .nrows = 1,
+ .row_meta_size = 4,
+ },
[GGML_TYPE_IQ4_NL] = {
.type_name = "iq4_nl",
.blck_size = QK4_NL,
@@ -1103,6 +1116,14 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float = quantize_row_q8_K64,
.row_meta_size = 0,
},
+ [GGML_TYPE_Q8_K16] = {
+ .type_name = "q8_K16",
+ .blck_size = 64,
+ .type_size = 64,
+ .is_quantized = true,
+ .from_float = quantize_row_q8_K16,
+ .row_meta_size = 20,
+ },
[GGML_TYPE_BF16] = {
.type_name = "bf16",
.blck_size = 1,
@@ -4000,6 +4021,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
case GGML_FTYPE_MOSTLY_IQ1_M: wtype = GGML_TYPE_IQ1_M; break;
case GGML_FTYPE_MOSTLY_IQ1_BN: wtype = GGML_TYPE_IQ1_BN; break;
case GGML_FTYPE_MOSTLY_IQ2_BN: wtype = GGML_TYPE_IQ2_BN; break;
+ case GGML_FTYPE_MOSTLY_IQ2_BN_R4: wtype = GGML_TYPE_IQ2_BN_R4;break;
case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
case GGML_FTYPE_MOSTLY_IQ4_NL_X4: wtype = GGML_TYPE_IQ4_NL_X4;break;
case GGML_FTYPE_MOSTLY_IQ4_XS_R4: wtype = GGML_TYPE_IQ4_XS_R4;break;
@@ -10529,6 +10551,7 @@ static void ggml_compute_forward_add(
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ1_BN:
case GGML_TYPE_IQ2_BN:
+ case GGML_TYPE_IQ2_BN_R4:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_NL_X4:
case GGML_TYPE_IQ4_XS_R4:
@@ -10977,6 +11000,7 @@ static void ggml_compute_forward_add1(
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ1_BN:
case GGML_TYPE_IQ2_BN:
+ case GGML_TYPE_IQ2_BN_R4:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_NL_X4:
case GGML_TYPE_IQ4_XS_R4:
@@ -11122,6 +11146,7 @@ static void ggml_compute_forward_acc(
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ1_BN:
case GGML_TYPE_IQ2_BN:
+ case GGML_TYPE_IQ2_BN_R4:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_NL_X4:
case GGML_TYPE_IQ4_XS_R4:
@@ -14313,6 +14338,7 @@ static void ggml_compute_forward_out_prod(
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ1_BN:
case GGML_TYPE_IQ2_BN:
+ case GGML_TYPE_IQ2_BN_R4:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_NL_X4:
case GGML_TYPE_IQ4_XS_R4:
@@ -14698,6 +14724,7 @@ static void ggml_compute_forward_set(
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ1_BN:
case GGML_TYPE_IQ2_BN:
+ case GGML_TYPE_IQ2_BN_R4:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_NL_X4:
case GGML_TYPE_IQ4_XS_R4:
@@ -14977,6 +15004,7 @@ static void ggml_compute_forward_get_rows(
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ1_BN:
case GGML_TYPE_IQ2_BN:
+ case GGML_TYPE_IQ2_BN_R4:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_NL_X4:
case GGML_TYPE_IQ4_XS_R4:
@@ -15583,6 +15611,7 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ1_BN:
case GGML_TYPE_IQ2_BN:
+ case GGML_TYPE_IQ2_BN_R4:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_NL_X4:
case GGML_TYPE_IQ4_XS_R4:
@@ -15603,6 +15632,7 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q8_K:
case GGML_TYPE_Q8_K64:
+ case GGML_TYPE_Q8_K16:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
@@ -20476,7 +20506,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
if (node->src[1]->type != vec_dot_type) {
- cur_q = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
+ cur_q = ggml_row_size(vec_dot_type, node->src[1]->ne[0]) * ggml_nrows(node->src[1]);
+ //cur_q = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
}
} break;
case GGML_OP_MUL_MAT_ID:
@@ -20486,7 +20517,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
const struct ggml_tensor * src1 = node->src[1];
const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
if (src1->type != vec_dot_type) {
- cur_q += ggml_row_size(vec_dot_type, ggml_nelements(src1));
+ cur_q += ggml_row_size(vec_dot_type, node->src[1]->ne[0]) * ggml_nrows(node->src[1]);
+ //cur_q += ggml_row_size(vec_dot_type, ggml_nelements(src1));
}
const int n_as = src0->ne[2];
cur_q += GGML_PAD(cur, sizeof(int64_t)); // align
@@ -22415,6 +22447,7 @@ size_t ggml_quantize_chunk(
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ1_BN: result = quantize_iq1_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_BN: result = quantize_iq2_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ2_BN_R4:result = quantize_iq2_bn_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_NL_X4: result = quantize_iq4_nl_x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_XS_R4: result = quantize_iq4_xs_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
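
The work-buffer change in ggml_graph_plan above is needed because the new vec_dot type carries per-row metadata: sizing the buffer as (per-row size) x (number of rows) counts that metadata once per row, while the old expression, which passed the total element count to ggml_row_size, counted it only once. A rough illustration with made-up shapes, assuming ggml_row_size adds row_meta_size once per row:

    /* Hypothetical: src1 has ne0 = 4096 columns and 32 rows,                 */
    /* vec_dot_type = GGML_TYPE_Q8_K16 (blck_size 64, type_size 64, meta 20). */
    /* per-row size : 20 + 4096      = 4116 bytes                             */
    /* new sizing   : 32 * 4116      = 131712 bytes                           */
    /* old sizing   : 20 + 32*4096   = 131092 bytes -- the 20 bytes of        */
    /*                per-row metadata are counted once instead of 32 times   */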
diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp
index faa4cab7..b6ff7ab7 100644
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -161,6 +161,17 @@ struct MulMat {
}
}
static bool prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny);
+ static inline int num_rows(ggml_type type) {
+ switch (type) {
+ case GGML_TYPE_Q4_0_R4:
+ case GGML_TYPE_Q5_0_R4:
+ case GGML_TYPE_Q6_0_R4:
+ case GGML_TYPE_Q8_0_R4:
+ case GGML_TYPE_IQ4_NL_X4:
+ case GGML_TYPE_IQ2_BN_R4: return 4;
+ default: return 1;
+ }
+ }
private:
template <typename Dequantizer> static void set_functions(MulMat& m);
};
@@ -181,13 +192,15 @@ bool iqk_mul_mat(long Nx, long Ny, long ne00,
size_t row_size_qy = strideB; //*ggml_type_size(ggml_type(typeB));
//if (ith == 0) printf("%s: ne00 = %d, row_size_qx = %d, strideA = %d\n", __func__, int(ne00), int(row_size_qx), int(strideA));
- auto nrc_x = (Nx + nth - 1)/nth;
+ auto num_rows = MulMat::num_rows(ggml_type(typeA));
+ GGML_ASSERT(Nx%num_rows == 0);
+ auto nrc_x = (Nx/num_rows + nth - 1)/nth;
auto first_x = ith*nrc_x;
- if (first_x + nrc_x > Nx) nrc_x = Nx - first_x;
+ if (first_x + nrc_x > Nx/num_rows) nrc_x = Nx/num_rows - first_x;
- DataInfo info{C + first_x, (const char *)B, (size_t)stride_C, row_size_qy, 0, 1, nullptr, 0};
+ DataInfo info{C + first_x*num_rows, (const char *)B, (size_t)stride_C, row_size_qy, 0, 1, nullptr, 0};
- mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny);
+ mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x*num_rows, row_size_qx, info, nrc_x*num_rows, Ny);
return true;
}
@@ -319,6 +332,30 @@ template <int nrc, typename block_q8 = block_q8_K> struct Q8 {
const block_q8 * y[nrc_y];
};
+template <int nrc> struct Q8_16 {
+
+ constexpr static int nrc_y = nrc;
+
+ Q8_16(const DataInfo& info) {
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto ptr = (const float *)info.src1_row(iy);
+ std::memcpy(d + 5*iy, ptr, 5*sizeof(float));
+ y[iy] = (const int8_t *)(ptr + 5);
+ }
+ }
+
+#ifdef HAVE_FANCY_SIMD
+ inline __m512i load_quants64(int iy, int i) const { return _mm512_loadu_si512((const __m512i*)y[iy] + i); }
+#endif
+ inline __m256i load_quants(int iy, int i) const { return _mm256_loadu_si256((const __m256i*)y[iy] + i); }
+ inline float scale(int iy, int k) const { return d[5*iy+k]; }
+ inline float sum_row(int iy) const { return d[5*iy + 4]; }
+ inline __m128 scale(int iy) const { return _mm_loadu_ps(d + 5*iy); }
+
+ float d[5*nrc_y];
+ const int8_t * y[nrc_y];
+};
+
struct Scales8KBase {
template <typename Q8>
inline void accum_mins(const __m128i& mins128, const Q8& q8, int i, float c, __m256 * accd) const {
@@ -2079,6 +2116,228 @@ static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInf
#endif // Zen4 or vanilla AVX2
+template <int nrc_y>
+static void mul_mat_iq2_bn_r4_q8_k16_avx2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
+ if (nrc_x%4) {
+ printf("%s: %d is not a multiple of 4\n", __func__, nrc_x);
+ GGML_ABORT("fatal error");
+ }
+ Q8_16<nrc_y> q8(info);
+ auto m3 = _mm256_set1_epi8(0x3);
+ auto m1 = _mm256_set1_epi16(1);
+ int nb = n / QK_IQ1BN;
+ __m256i qx[4];
+ if constexpr (nrc_y > 4) {
+ __m256i acc[nrc_y] = {};
+ __m128 sum4[nrc_y];
+ for (int ix = 0; ix < nrc_x; ix += 4) {
+ const float * dptr = (const float *)((const char *)vx + ix*bx);
+ auto dl = _mm_loadu_ps(dptr);
+ const uint8_t * iq2l = (const uint8_t *)(dptr + 4);
+ for (int ib = 0; ib < nb; ++ib) {
+ auto bits = _mm256_loadu_si256((const __m256i *)iq2l + 2*ib+0);
+ qx[0] = _mm256_and_si256(bits, m3);
+ qx[1] = _mm256_and_si256(_mm256_srli_epi16(bits, 2), m3);
+ qx[2] = _mm256_and_si256(_mm256_srli_epi16(bits, 4), m3);
+ qx[3] = _mm256_and_si256(_mm256_srli_epi16(bits, 6), m3);
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto y = q8.load_quants(iy, 2*ib+0);
+ auto sumi1 = _mm256_add_epi16(_mm256_maddubs_epi16(qx[0], _mm256_shuffle_epi32(y, 0x00)),
+ _mm256_maddubs_epi16(qx[1], _mm256_shuffle_epi32(y, 0x55)));
+ auto sumi2 = _mm256_add_epi16(_mm256_maddubs_epi16(qx[2], _mm256_shuffle_epi32(y, 0xaa)),
+ _mm256_maddubs_epi16(qx[3], _mm256_shuffle_epi32(y, 0xff)));
+ acc[iy] = _mm256_add_epi32(acc[iy], _mm256_madd_epi16(m1, _mm256_add_epi16(sumi1, sumi2)));
+ }
+ }
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto dy = q8.scale(iy);
+ auto sumf1 = _mm256_cvtepi32_ps(acc[iy]);
+ auto s4 = _mm_mul_ps(_mm256_extractf128_ps(sumf1, 0), _mm_mul_ps(dl, _mm_shuffle_ps(dy, dy, 0x00)));
+ s4 = _mm_fmadd_ps(_mm256_extractf128_ps(sumf1, 1), _mm_mul_ps(dl, _mm_shuffle_ps(dy, dy, 0x55)), s4);
+ sum4[iy] = _mm_fmadd_ps(dl, _mm_set1_ps(-q8.sum_row(iy)), s4);
+ acc[iy] = _mm256_setzero_si256();
+ }
+ for (int ib = 0; ib < nb; ++ib) {
+ auto bits = _mm256_loadu_si256((const __m256i *)iq2l + 2*ib+1);
+ qx[0] = _mm256_and_si256(bits, m3);
+ qx[1] = _mm256_and_si256(_mm256_srli_epi16(bits, 2), m3);
+ qx[2] = _mm256_and_si256(_mm256_srli_epi16(bits, 4), m3);
+ qx[3] = _mm256_and_si256(_mm256_srli_epi16(bits, 6), m3);
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto y = q8.load_quants(iy, 2*ib+1);
+ auto sumi1 = _mm256_add_epi16(_mm256_maddubs_epi16(qx[0], _mm256_shuffle_epi32(y, 0x00)),
+ _mm256_maddubs_epi16(qx[1], _mm256_shuffle_epi32(y, 0x55)));
+ auto sumi2 = _mm256_add_epi16(_mm256_maddubs_epi16(qx[2], _mm256_shuffle_epi32(y, 0xaa)),
+ _mm256_maddubs_epi16(qx[3], _mm256_shuffle_epi32(y, 0xff)));
+ acc[iy] = _mm256_add_epi32(acc[iy], _mm256_madd_epi16(m1, _mm256_add_epi16(sumi1, sumi2)));
+ }
+ }
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto dy = q8.scale(iy);
+ auto sumf1 = _mm256_cvtepi32_ps(acc[iy]);
+ auto s4 = _mm_fmadd_ps(_mm256_extractf128_ps(sumf1, 0), _mm_mul_ps(dl, _mm_shuffle_ps(dy, dy, 0xaa)), sum4[iy]);
+ s4 = _mm_fmadd_ps(_mm256_extractf128_ps(sumf1, 1), _mm_mul_ps(dl, _mm_shuffle_ps(dy, dy, 0xff)), s4);
+ info.store(ix, iy, s4);
+ acc[iy] = _mm256_setzero_si256();
+ }
+ }
+ } else {
+ __m256i acc[2*nrc_y] = {};
+ for (int ix = 0; ix < nrc_x; ix += 4) {
+ const float * dptr = (const float *)((const char *)vx + ix*bx);
+ auto dl = _mm_loadu_ps(dptr);
+ const uint8_t * iq2l = (const uint8_t *)(dptr + 4);
+ for (int ib = 0; ib < nb; ++ib) {
+ auto bits = _mm256_loadu_si256((const __m256i *)iq2l + 2*ib+0);
+ qx[0] = _mm256_and_si256(bits, m3);
+ qx[1] = _mm256_and_si256(_mm256_srli_epi16(bits, 2), m3);
+ qx[2] = _mm256_and_si256(_mm256_srli_epi16(bits, 4), m3);
+ qx[3] = _mm256_and_si256(_mm256_srli_epi16(bits, 6), m3);
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto y = q8.load_quants(iy, 2*ib+0);
+ auto sumi1 = _mm256_add_epi16(_mm256_maddubs_epi16(qx[0], _mm256_shuffle_epi32(y, 0x00)),
+ _mm256_maddubs_epi16(qx[1], _mm256_shuffle_epi32(y, 0x55)));
+ auto sumi2 = _mm256_add_epi16(_mm256_maddubs_epi16(qx[2], _mm256_shuffle_epi32(y, 0xaa)),
+ _mm256_maddubs_epi16(qx[3], _mm256_shuffle_epi32(y, 0xff)));
+ acc[2*iy+0] = _mm256_add_epi32(acc[2*iy+0], _mm256_madd_epi16(m1, _mm256_add_epi16(sumi1, sumi2)));
+ }
+ bits = _mm256_loadu_si256((const __m256i *)iq2l + 2*ib+1);
+ qx[0] = _mm256_and_si256(bits, m3);
+ qx[1] = _mm256_and_si256(_mm256_srli_epi16(bits, 2), m3);
+ qx[2] = _mm256_and_si256(_mm256_srli_epi16(bits, 4), m3);
+ qx[3] = _mm256_and_si256(_mm256_srli_epi16(bits, 6), m3);
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto y = q8.load_quants(iy, 2*ib+1);
+ auto sumi1 = _mm256_add_epi16(_mm256_maddubs_epi16(qx[0], _mm256_shuffle_epi32(y, 0x00)),
+ _mm256_maddubs_epi16(qx[1], _mm256_shuffle_epi32(y, 0x55)));
+ auto sumi2 = _mm256_add_epi16(_mm256_maddubs_epi16(qx[2], _mm256_shuffle_epi32(y, 0xaa)),
+ _mm256_maddubs_epi16(qx[3], _mm256_shuffle_epi32(y, 0xff)));
+ acc[2*iy+1] = _mm256_add_epi32(acc[2*iy+1], _mm256_madd_epi16(m1, _mm256_add_epi16(sumi1, sumi2)));
+ }
+ }
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto dy = q8.scale(iy);
+ auto sumf1 = _mm256_cvtepi32_ps(acc[2*iy+0]);
+ auto sumf2 = _mm256_cvtepi32_ps(acc[2*iy+1]);
+ auto sum4 = _mm_mul_ps(_mm256_extractf128_ps(sumf1, 0), _mm_mul_ps(dl, _mm_shuffle_ps(dy, dy, 0x00)));
+ sum4 = _mm_fmadd_ps(_mm256_extractf128_ps(sumf1, 1), _mm_mul_ps(dl, _mm_shuffle_ps(dy, dy, 0x55)), sum4);
+ sum4 = _mm_fmadd_ps(_mm256_extractf128_ps(sumf2, 0), _mm_mul_ps(dl, _mm_shuffle_ps(dy, dy, 0xaa)), sum4);
+ sum4 = _mm_fmadd_ps(_mm256_extractf128_ps(sumf2, 1), _mm_mul_ps(dl, _mm_shuffle_ps(dy, dy, 0xff)), sum4);
+ sum4 = _mm_fmadd_ps(dl, _mm_set1_ps(-q8.sum_row(iy)), sum4);
+ info.store(ix, iy, sum4);
+ acc[2*iy+0] = acc[2*iy+1] = _mm256_setzero_si256();
+ }
+ }
+ }
+}
+
+#ifdef HAVE_FANCY_SIMD
+template <int nrc_y>
+static void mul_mat_iq2_bn_r4_q8_k16(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
+ if (nrc_x%4) {
+ printf("%s: %d is not a multiple of 4\n", __func__, nrc_x);
+ GGML_ABORT("fatal error");
+ }
+ if constexpr (nrc_y == 1) {
+ mul_mat_iq2_bn_r4_q8_k16_avx2<1>(n, vx, bx, info, nrc_x);
+ } else {
+ Q8_16<nrc_y> q8(info);
+ auto m3 = _mm512_set1_epi8(0x3);
+ int nb = n / QK_IQ1BN;
+ __m512i acc[2*nrc_y] = {};
+ __m512i qx[8];
+ for (int ix = 0; ix < nrc_x/8; ++ix) {
+ const float * dptr1 = (const float *)((const char *)vx + (8*ix+0)*bx);
+ const float * dptr2 = (const float *)((const char *)vx + (8*ix+4)*bx);
+ auto dl = _mm_loadu_ps(dptr1);
+ auto dh = _mm_loadu_ps(dptr2);
+ const uint8_t * iq2l = (const uint8_t *)(dptr1 + 4);
+ const uint8_t * iq2h = (const uint8_t *)(dptr2 + 4);
+ for (int ib = 0; ib < nb; ++ib) {
+ auto bits_l = _mm512_loadu_si512((const __m512i *)iq2l + ib);
+ auto bits_h = _mm512_loadu_si512((const __m512i *)iq2h + ib);
+ qx[0] = _mm512_and_si512(bits_l, m3);
+ qx[1] = _mm512_and_si512(bits_h, m3);
+ qx[2] = _mm512_and_si512(_mm512_srli_epi16(bits_l, 2), m3);
+ qx[3] = _mm512_and_si512(_mm512_srli_epi16(bits_h, 2), m3);
+ qx[4] = _mm512_and_si512(_mm512_srli_epi16(bits_l, 4), m3);
+ qx[5] = _mm512_and_si512(_mm512_srli_epi16(bits_h, 4), m3);
+ qx[6] = _mm512_and_si512(_mm512_srli_epi16(bits_l, 6), m3);
+ qx[7] = _mm512_and_si512(_mm512_srli_epi16(bits_h, 6), m3);
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto y = q8.load_quants64(iy, ib);
+ auto sy = _mm512_shuffle_epi32(y, _MM_PERM_ENUM(0x00));
+ acc[2*iy+0] = _mm512_dpbusd_epi32(acc[2*iy+0], qx[0], sy);
+ acc[2*iy+1] = _mm512_dpbusd_epi32(acc[2*iy+1], qx[1], sy);
+ sy = _mm512_shuffle_epi32(y, _MM_PERM_ENUM(0x55));
+ acc[2*iy+0] = _mm512_dpbusd_epi32(acc[2*iy+0], qx[2], sy);
+ acc[2*iy+1] = _mm512_dpbusd_epi32(acc[2*iy+1], qx[3], sy);
+ sy = _mm512_shuffle_epi32(y, _MM_PERM_ENUM(0xaa));
+ acc[2*iy+0] = _mm512_dpbusd_epi32(acc[2*iy+0], qx[4], sy);
+ acc[2*iy+1] = _mm512_dpbusd_epi32(acc[2*iy+1], qx[5], sy);
+ sy = _mm512_shuffle_epi32(y, _MM_PERM_ENUM(0xff));
+ acc[2*iy+0] = _mm512_dpbusd_epi32(acc[2*iy+0], qx[6], sy);
+ acc[2*iy+1] = _mm512_dpbusd_epi32(acc[2*iy+1], qx[7], sy);
+ }
+ }
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto dy = q8.scale(iy);
+ __m128 sum4;
+ for (int k = 0; k < 2; ++k) {
+ const auto& dx = k == 0 ? dl : dh;
+ auto sumf = _mm512_cvtepi32_ps(acc[2*iy+k]);
+ sum4 = _mm_mul_ps (_mm512_extractf32x4_ps(sumf, 0), _mm_mul_ps(dx, _mm_shuffle_ps(dy, dy, 0x00)));
+ sum4 = _mm_fmadd_ps(_mm512_extractf32x4_ps(sumf, 1), _mm_mul_ps(dx, _mm_shuffle_ps(dy, dy, 0x55)), sum4);
+ sum4 = _mm_fmadd_ps(_mm512_extractf32x4_ps(sumf, 2), _mm_mul_ps(dx, _mm_shuffle_ps(dy, dy, 0xaa)), sum4);
+ sum4 = _mm_fmadd_ps(_mm512_extractf32x4_ps(sumf, 3), _mm_mul_ps(dx, _mm_shuffle_ps(dy, dy, 0xff)), sum4);
+ sum4 = _mm_fmadd_ps(dx, _mm_set1_ps(-q8.sum_row(iy)), sum4);
+ info.store(8*ix + 4*k, iy, sum4);
+ }
+ acc[2*iy+0] = acc[2*iy+1] = _mm512_setzero_si512();
+ }
+ }
+ if (int ix = 8*(nrc_x/8); ix < nrc_x) {
+ const float * dptr = (const float *)((const char *)vx + ix*bx);
+ auto dl = _mm_loadu_ps(dptr);
+ const uint8_t * iq2l = (const uint8_t *)(dptr + 4);
+ for (int ib = 0; ib < nb; ++ib) {
+ auto bits_l = _mm512_loadu_si512((const __m512i *)iq2l + ib);
+ qx[0] = _mm512_and_si512(bits_l, m3);
+ qx[1] = _mm512_and_si512(_mm512_srli_epi16(bits_l, 2), m3);
+ qx[2] = _mm512_and_si512(_mm512_srli_epi16(bits_l, 4), m3);
+ qx[3] = _mm512_and_si512(_mm512_srli_epi16(bits_l, 6), m3);
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto y = q8.load_quants64(iy, ib);
+ acc[iy] = _mm512_dpbusd_epi32(acc[iy], qx[0], _mm512_shuffle_epi32(y, _MM_PERM_ENUM(0x00)));
+ acc[iy] = _mm512_dpbusd_epi32(acc[iy], qx[1], _mm512_shuffle_epi32(y, _MM_PERM_ENUM(0x55)));
+ acc[iy] = _mm512_dpbusd_epi32(acc[iy], qx[2], _mm512_shuffle_epi32(y, _MM_PERM_ENUM(0xaa)));
+ acc[iy] = _mm512_dpbusd_epi32(acc[iy], qx[3], _mm512_shuffle_epi32(y, _MM_PERM_ENUM(0xff)));
+ }
+ }
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto dy = q8.scale(iy);
+ auto sumf = _mm512_cvtepi32_ps(acc[iy]);
+ auto sum4 = _mm_mul_ps(_mm512_extractf32x4_ps(sumf, 0), _mm_mul_ps(dl, _mm_shuffle_ps(dy, dy, 0x00)));
+ sum4 = _mm_fmadd_ps(_mm512_extractf32x4_ps(sumf, 1), _mm_mul_ps(dl, _mm_shuffle_ps(dy, dy, 0x55)), sum4);
+ sum4 = _mm_fmadd_ps(_mm512_extractf32x4_ps(sumf, 2), _mm_mul_ps(dl, _mm_shuffle_ps(dy, dy, 0xaa)), sum4);
+ sum4 = _mm_fmadd_ps(_mm512_extractf32x4_ps(sumf, 3), _mm_mul_ps(dl, _mm_shuffle_ps(dy, dy, 0xff)), sum4);
+ sum4 = _mm_fmadd_ps(dl, _mm_set1_ps(-q8.sum_row(iy)), sum4);
+ info.store(ix, iy, sum4);
+ }
+ }
+ }
+}
+#else
+template <int nrc_y>
+static void mul_mat_iq2_bn_r4_q8_k16(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
+ if (nrc_x%4) {
+ printf("%s: %d is not a multiple of 4\n", __func__, nrc_x);
+ GGML_ABORT("fatal error");
+ }
+ mul_mat_iq2_bn_r4_q8_k16_avx2<nrc_y>(n, vx, bx, info, nrc_x);
+}
+#endif
+
#ifdef HAVE_FANCY_SIMD
template <int nrc_y>
static void mul_mat_iq4_nl_x4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
@@ -4744,6 +5003,20 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
mm.funcs[7] = mul_mat_iq2bn_q8_K64<8>;
expected_typeB = GGML_TYPE_Q8_K64;
break;
+ case GGML_TYPE_IQ2_BN_R4:
+ assert (ne00 % QK_IQ1BN == 0);
+ mm.funcs[0] = mul_mat_iq2_bn_r4_q8_k16<1>;
+ mm.funcs[1] = mul_mat_iq2_bn_r4_q8_k16<2>;
+ mm.funcs[2] = mul_mat_iq2_bn_r4_q8_k16<3>;
+ mm.funcs[3] = mul_mat_iq2_bn_r4_q8_k16<4>;
+ mm.funcs[4] = mul_mat_iq2_bn_r4_q8_k16<5>;
+ mm.funcs[5] = mul_mat_iq2_bn_r4_q8_k16<6>;
+//#ifdef HAVE_FANCY_SIMD
+ mm.funcs[6] = mul_mat_iq2_bn_r4_q8_k16<7>;
+ mm.funcs[7] = mul_mat_iq2_bn_r4_q8_k16<8>;
+//#endif
+ expected_typeB = GGML_TYPE_Q8_K16;
+ break;
case GGML_TYPE_Q4_0:
assert (ne00 % QK4_0 == 0);
MulMat::set_functions<Q4_0_1_Unpacker>(mm);
@@ -7171,6 +7444,135 @@ static void mul_mat_iq1bn_q8_K64(int n, const void * vx, size_t bx, const DataIn
}
}
+template <int nrc> struct Q8_16 {
+
+ constexpr static int nrc_y = nrc;
+
+ Q8_16(const DataInfo& info) {
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto ptr = (const float *)info.src1_row(iy);
+ std::memcpy(d + 5*iy, ptr, 5*sizeof(float));
+ y[iy] = (const int8_t *)(ptr + 5);
+ }
+ }
+
+ inline int8x16x4_t load_quants(int iy, int i) const { return vld1q_s8_x4(y[iy] + 64*i); }
+ inline int8x16x2_t load_quants_32(int iy, int i) const { return vld1q_s8_x2(y[iy] + 32*i); }
+ inline float scale(int iy, int k) const { return d[5*iy+k]; }
+ inline float sum_row(int iy) const { return d[5*iy + 4]; }
+ inline float32x4_t scale(int iy) const { return vld1q_f32(d + 5*iy); }
+
+ float d[5*nrc_y];
+ const int8_t * y[nrc_y];
+};
+
+template <int nrc_y>
+static IQK_NOINLINE void mul_mat_iq2_bn_r4_q8_k16(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
+ if (nrc_x%4) {
+ printf("%s: %d is not a multiple of 4\n", __func__, nrc_x);
+ GGML_ABORT("fatal error");
+ }
+ Q8_16<nrc_y> q8(info);
+ auto m3 = vdupq_n_u8(0x3);
+ int nb = n / QK_IQ1BN;
+ if constexpr (nrc_y == 1) {
+ auto mc = vdupq_n_u8(0xc);
+ int32x4_t acc[8];
+ for (int ix = 0; ix < nrc_x; ix += 4) {
+ for (int k = 0; k < 8; ++k) acc[k] = vdupq_n_s32(0);
+ const float * dptr = (const float *)((const char *)vx + ix*bx);
+ auto dl = vld1q_f32(dptr);
+ const uint8_t * iq2 = (const uint8_t *)(dptr + 4);
+ for (int ib = 0; ib < nb; ++ib) {
+ auto y = q8.load_quants(0, ib);
+ for (int j = 0; j < 4; ++j) {
+ auto bits1 = vld1q_u8(iq2 + 64*ib + 16*j);
+ auto bits2 = vshrq_n_u8(bits1, 4);
+ acc[2*j+0] = vdotq_laneq_s32(acc[2*j+0], vandq_u8(bits1, m3), y.val[j], 0);
+ acc[2*j+1] = vdotq_laneq_s32(acc[2*j+1], vandq_u8(bits1, mc), y.val[j], 1);
+ acc[2*j+0] = vdotq_laneq_s32(acc[2*j+0], vandq_u8(bits2, m3), y.val[j], 2);
+ acc[2*j+1] = vdotq_laneq_s32(acc[2*j+1], vandq_u8(bits2, mc), y.val[j], 3);
+ }
+ }
+ auto dy = vmulq_f32(dl, vdupq_n_f32(q8.scale(0, 0)));
+ auto sumf1 = vmulq_f32( vcvtq_f32_s32(acc[0]), dy);
+ auto sumf2 = vmulq_f32( vcvtq_f32_s32(acc[1]), dy);
+ dy = vmulq_f32(dl, vdupq_n_f32(q8.scale(0, 1)));
+ sumf1 = vfmaq_f32(sumf1, vcvtq_f32_s32(acc[2]), dy);
+ sumf2 = vfmaq_f32(sumf2, vcvtq_f32_s32(acc[3]), dy);
+ dy = vmulq_f32(dl, vdupq_n_f32(q8.scale(0, 2)));
+ sumf1 = vfmaq_f32(sumf1, vcvtq_f32_s32(acc[4]), dy);
+ sumf2 = vfmaq_f32(sumf2, vcvtq_f32_s32(acc[5]), dy);
+ dy = vmulq_f32(dl, vdupq_n_f32(q8.scale(0, 3)));
+ sumf1 = vfmaq_f32(sumf1, vcvtq_f32_s32(acc[6]), dy);
+ sumf2 = vfmaq_f32(sumf2, vcvtq_f32_s32(acc[7]), dy);
+ auto sumf = vfmaq_f32(sumf1, vdupq_n_f32(0.25f), sumf2);
+ sumf = vfmaq_f32(sumf, dl, vdupq_n_f32(-q8.sum_row(0)));
+ info.store(ix, 0, sumf);
+ }
+ } else {
+ int32x4_t acc[4*nrc_y] = {};
+ uint8x16_t qx[8];
+ for (int ix = 0; ix < nrc_x; ix += 4) {
+ const float * dptr = (const float *)((const char *)vx + ix*bx);
+ auto dl = vld1q_f32(dptr);
+ const uint8_t * iq2 = (const uint8_t *)(dptr + 4);
+ for (int ib = 0; ib < nb; ++ib) {
+ auto bits = vld1q_u8_x2(iq2 + 64*ib);
+ qx[0] = vandq_u8(bits.val[0], m3);
+ qx[1] = vandq_u8(vshrq_n_u8(bits.val[0], 2), m3);
+ qx[2] = vandq_u8(vshrq_n_u8(bits.val[0], 4), m3);
+ qx[3] = vshrq_n_u8(bits.val[0], 6);
+ qx[4] = vandq_u8(bits.val[1], m3);
+ qx[5] = vandq_u8(vshrq_n_u8(bits.val[1], 2), m3);
+ qx[6] = vandq_u8(vshrq_n_u8(bits.val[1], 4), m3);
+ qx[7] = vshrq_n_u8(bits.val[1], 6);
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto y = q8.load_quants_32(iy, 2*ib+0);
+ acc[4*iy + 0] = vdotq_laneq_s32(acc[4*iy + 0], qx[0], y.val[0], 0);
+ acc[4*iy + 0] = vdotq_laneq_s32(acc[4*iy + 0], qx[1], y.val[0], 1);
+ acc[4*iy + 0] = vdotq_laneq_s32(acc[4*iy + 0], qx[2], y.val[0], 2);
+ acc[4*iy + 0] = vdotq_laneq_s32(acc[4*iy + 0], qx[3], y.val[0], 3);
+ acc[4*iy + 1] = vdotq_laneq_s32(acc[4*iy + 1], qx[4], y.val[1], 0);
+ acc[4*iy + 1] = vdotq_laneq_s32(acc[4*iy + 1], qx[5], y.val[1], 1);
+ acc[4*iy + 1] = vdotq_laneq_s32(acc[4*iy + 1], qx[6], y.val[1], 2);
+ acc[4*iy + 1] = vdotq_laneq_s32(acc[4*iy + 1], qx[7], y.val[1], 3);
+ }
+ bits = vld1q_u8_x2(iq2 + 64*ib + 32);
+ qx[0] = vandq_u8(bits.val[0], m3);
+ qx[1] = vandq_u8(vshrq_n_u8(bits.val[0], 2), m3);
+ qx[2] = vandq_u8(vshrq_n_u8(bits.val[0], 4), m3);
+ qx[3] = vshrq_n_u8(bits.val[0], 6);
+ qx[4] = vandq_u8(bits.val[1], m3);
+ qx[5] = vandq_u8(vshrq_n_u8(bits.val[1], 2), m3);
+ qx[6] = vandq_u8(vshrq_n_u8(bits.val[1], 4), m3);
+ qx[7] = vshrq_n_u8(bits.val[1], 6);
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto y = q8.load_quants_32(iy, 2*ib+1);
+ acc[4*iy + 2] = vdotq_laneq_s32(acc[4*iy + 2], qx[0], y.val[0], 0);
+ acc[4*iy + 2] = vdotq_laneq_s32(acc[4*iy + 2], qx[1], y.val[0], 1);
+ acc[4*iy + 2] = vdotq_laneq_s32(acc[4*iy + 2], qx[2], y.val[0], 2);
+ acc[4*iy + 2] = vdotq_laneq_s32(acc[4*iy + 2], qx[3], y.val[0], 3);
+ acc[4*iy + 3] = vdotq_laneq_s32(acc[4*iy + 3], qx[4], y.val[1], 0);
+ acc[4*iy + 3] = vdotq_laneq_s32(acc[4*iy + 3], qx[5], y.val[1], 1);
+ acc[4*iy + 3] = vdotq_laneq_s32(acc[4*iy + 3], qx[6], y.val[1], 2);
+ acc[4*iy + 3] = vdotq_laneq_s32(acc[4*iy + 3], qx[7], y.val[1], 3);
+ }
+ }
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto dy = q8.scale(iy);
+ float32x4_t sumf = vmulq_f32(vcvtq_f32_s32(acc[4*iy+0]), vmulq_laneq_f32(dl, dy, 0));
+ sumf = vfmaq_f32(sumf, vcvtq_f32_s32(acc[4*iy+1]), vmulq_laneq_f32(dl, dy, 1));
+ sumf = vfmaq_f32(sumf, vcvtq_f32_s32(acc[4*iy+2]), vmulq_laneq_f32(dl, dy, 2));
+ sumf = vfmaq_f32(sumf, vcvtq_f32_s32(acc[4*iy+3]), vmulq_laneq_f32(dl, dy, 3));
+ sumf = vfmaq_f32(sumf, dl, vdupq_n_f32(-q8.sum_row(iy)));
+ info.store(ix, iy, sumf);
+ acc[4*iy+0] = acc[4*iy+1] = acc[4*iy+2] = acc[4*iy+3] = vdupq_n_s32(0);
+ }
+ }
+ }
+}
+
template <int nrc_y>
static void mul_mat_iq2bn_q8_K64(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
const int nb = n / QK_IQ1BN;
@@ -7716,6 +8118,17 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& m, int /*Ny*/) {
m.funcs[7] = mul_mat_iq2bn_q8_K64<8>;
expected_Btype = GGML_TYPE_Q8_K64;
break;
+ case GGML_TYPE_IQ2_BN_R4:
+ m.funcs[0] = mul_mat_iq2_bn_r4_q8_k16<1>;
+ m.funcs[1] = mul_mat_iq2_bn_r4_q8_k16<2>;
+ m.funcs[2] = mul_mat_iq2_bn_r4_q8_k16<3>;
+ m.funcs[3] = mul_mat_iq2_bn_r4_q8_k16<4>;
+ m.funcs[4] = mul_mat_iq2_bn_r4_q8_k16<5>;
+ //m.funcs[5] = mul_mat_iq2_bn_r4_q8_k16<6>;
+ //m.funcs[6] = mul_mat_iq2_bn_r4_q8_k16<7>;
+ //m.funcs[7] = mul_mat_iq2_bn_r4_q8_k16<8>;
+ expected_Btype = GGML_TYPE_Q8_K16;
+ break;
case GGML_TYPE_Q4_0:
MulMat::set_functions<DequantizerQ40>(m);
expected_Btype = GGML_TYPE_Q8_0;
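
A note on the -q8.sum_row(iy) term that every kernel above ends with; the reasoning follows from dequantize_row_iq2_bn_r4 further down, which maps each stored 2-bit value q through q - 1:

    /* iq2_bn stores ternary weights as unsigned 2-bit values q in {0,1,2},   */
    /* i.e. w = q - 1 in {-1,0,1}. For one output element:                    */
    /*     sum_i w_i * y_i = sum_i q_i * y_i - sum_i y_i                      */
    /* The integer dot products accumulate sum_i q_i * y_i over the quantized */
    /* activations; sum_i y_i comes from the precomputed float row sum        */
    /* (dptr[4] of the Q8_K16 row), applied once at the end as                */
    /* fmadd(d, -sum_row, acc).                                               */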
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index acef04db..32fe92ef 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -362,7 +362,7 @@ void ggml_vec_dot_iq1_bn_q8_K64(int n, float * s, size_t bs, const void * vx, si
*s = d8[0] * (sumi[0] + sumi[1]) + d8[1] * (sumi[2] + sumi[3]) + d8[2] * (sumi[4] + sumi[5]) + d8[3] * (sumi[6] + sumi[7]);
}
-void ggml_vec_dot_iq2_bn_q8_K64(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
+void vec_dot_iq2_bn_q8_K64(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
GGML_ASSERT(nrc == 1);
GGML_UNUSED(bs);
@@ -520,6 +520,136 @@ void quantize_row_q8_K64(const float * x, void * y, int64_t k) {
quantize_row_q8_K64_ref(x, (block_q8_K64 *)y, k);
}
+#ifdef __AVX2__
+namespace {
+inline float hsum_float_4(__m128 x) {
+ x = _mm_add_ps(x, _mm_movehl_ps(x, x));
+ x = _mm_add_ss(x, _mm_movehdup_ps(x));
+ return _mm_cvtss_f32(x);
+}
+inline float hsum_float_8(__m256 x) {
+ return hsum_float_4(_mm_add_ps(_mm256_castps256_ps128(x), _mm256_extractf128_ps(x, 1)));
+}
+inline int hsum_i32_8(const __m256i a) {
+ const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
+ const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
+ const __m128i sum64 = _mm_add_epi32(hi64, sum128);
+ const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
+ return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
+}
+inline float hmax_f32_8(__m256 x) {
+ __m128 max4 = _mm_max_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x));
+ max4 = _mm_max_ps( max4, _mm_movehl_ps(max4, max4));
+ max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4));
+ return _mm_cvtss_f32(max4);
+}
+}
+#endif
+
+void quantize_row_q8_K16(const float * x, void * vy, int64_t nk) {
+ float * dptr = (float *)vy;
+ int8_t * qy = (int8_t *)(dptr + 5);
+ int n64 = nk / 64;
+#ifdef __AVX2__
+ __m256 sign_bit = _mm256_set1_ps(-0.f);
+ __m256 vmax[4] = {};
+ __m256 vsum[4] = {};
+ for (int i64 = 0; i64 < n64; ++i64) {
+ for (int k = 0; k < 4; ++k) {
+ auto v1 = _mm256_loadu_ps(x + 64*i64 + 16*k + 0);
+ auto v2 = _mm256_loadu_ps(x + 64*i64 + 16*k + 8);
+ vsum[k] = _mm256_add_ps(vsum[k], _mm256_add_ps(v1, v2));
+ v1 = _mm256_andnot_ps(sign_bit, v1);
+ v2 = _mm256_andnot_ps(sign_bit, v2);
+ vmax[k] = _mm256_max_ps(vmax[k], _mm256_max_ps(v1, v2));
+ }
+ }
+ __m256 sum = _mm256_add_ps(_mm256_add_ps(vsum[0], vsum[1]), _mm256_add_ps(vsum[2], vsum[3]));
+ dptr[4] = hsum_float_8(sum);
+ for (int k = 0; k < 4; ++k) {
+ float max = hmax_f32_8(vmax[k]);
+ dptr[k] = max/127;
+ vmax[k] = _mm256_set1_ps(dptr[k] > 0 ? 1/dptr[k] : 0.f);
+ }
+ __m256i ival[8];
+ const __m256i perm = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
+ for (int i64 = 0; i64 < n64; ++i64) {
+ for (int k = 0; k < 4; ++k) {
+ __m256 v0 = _mm256_mul_ps(vmax[k], _mm256_loadu_ps(x + 64*i64 + 16*k + 0));
+ __m256 v1 = _mm256_mul_ps(vmax[k], _mm256_loadu_ps(x + 64*i64 + 16*k + 8));
+ v0 = _mm256_round_ps(v0, _MM_ROUND_NEAREST);
+ v1 = _mm256_round_ps(v1, _MM_ROUND_NEAREST);
+ ival[2*k+0] = _mm256_cvtps_epi32(v0);
+ ival[2*k+1] = _mm256_cvtps_epi32(v1);
+ }
+ for (int k = 0; k < 2; ++k) {
+ auto i0 = _mm256_packs_epi32(ival[4*k+0], ival[4*k+1]);
+ auto i1 = _mm256_packs_epi32(ival[4*k+2], ival[4*k+3]);
+ i0 = _mm256_packs_epi16(i0, i1);
+ i0 = _mm256_permutevar8x32_epi32(i0, perm);
+ _mm256_storeu_si256((__m256i *)qy, i0);
+ qy += 32;
+ }
+ }
+#elif defined __ARM_NEON
+ static const uint8_t k_shuffle[16] = {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60};
+ auto shuffle = vld1q_u8(k_shuffle);
+ float32x4_t vmax[4] = {};
+ float32x4_t vsum[4] = {};
+ for (int i64 = 0; i64 < n64; ++i64) {
+ for (int k = 0; k < 4; ++k) {
+ auto v = vld1q_f32_x4(x + 64*i64 + 16*k);
+ vsum[k] = vaddq_f32(vsum[k], vaddq_f32(v.val[0], v.val[1]));
+ vsum[k] = vaddq_f32(vsum[k], vaddq_f32(v.val[2], v.val[3]));
+ vmax[k] = vmaxq_f32(vmax[k], vmaxq_f32(vabsq_f32(v.val[0]), vabsq_f32(v.val[1])));
+ vmax[k] = vmaxq_f32(vmax[k], vmaxq_f32(vabsq_f32(v.val[2]), vabsq_f32(v.val[3])));
+ }
+ }
+ dptr[4] = vaddvq_f32(vaddq_f32(vaddq_f32(vsum[0], vsum[1]), vaddq_f32(vsum[2], vsum[3])));
+ for (int k = 0; k < 4; ++k) {
+ float max = vmaxvq_f32(vmax[k]);
+ dptr[k] = max/127;
+ vmax[k] = vdupq_n_f32(dptr[k] > 0 ? 1/dptr[k] : 0.f);
+ }
+ int8x16x4_t q;
+ for (int i64 = 0; i64 < n64; ++i64) {
+ for (int k = 0; k < 4; ++k) {
+ auto v = vld1q_f32_x4(x + 64*i64 + 16*k);
+ for (int j = 0; j < 4; ++j) {
+ q.val[j] = vreinterpretq_s8_s32(vcvtnq_s32_f32(vmulq_f32(vmax[k], v.val[j])));
+ }
+ auto qi = vqtbl4q_s8(q, shuffle);
+ vst1q_s8(qy, qi);
+ qy += 16;
+ }
+ }
+#else
+ float amax[4] = {0.f, 0.f, 0.f, 0.f};
+ for (int i64 = 0; i64 < n64; ++i64) {
+ for (int k = 0; k < 4; ++k) {
+ for (int j = 0; j < 16; ++j) {
+ float ax = std::abs(x[64*i64 + 16*k + j]);
+ amax[k] = std::max(amax[k], ax);
+ }
+ }
+ }
+ for (int k = 0; k < 4; ++k) {
+ dptr[k] = amax[k]/127;
+ amax[k] = dptr[k] > 0 ? 1/dptr[k] : 0.f;
+ }
+ double sumf = 0;
+ for (int i64 = 0; i64 < n64; ++i64) {
+ for (int k = 0; k < 4; ++k) {
+ for (int j = 0; j < 16; ++j) {
+ sumf += x[64*i64 + 16*k + j];
+ qy[64*i64 + 16*k + j] = nearest_int(amax[k]*x[64*i64 + 16*k + j]);
+ }
+ }
+ }
+ dptr[4] = sumf;
+#endif
+}
+
//
// ============================================== iq2_K
//
@@ -2339,23 +2469,6 @@ size_t quantize_iq6_k(const float * src, void * dst, int64_t nrows, int64_t n_pe
return nrows * nblock * sizeof(block_iq6_k);
}
-#ifdef __AVX2__
-namespace {
-inline int hsum_i32_8(const __m256i a) {
- const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
- const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
- const __m128i sum64 = _mm_add_epi32(hi64, sum128);
- const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
- return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
-}
-inline float hmax_f32_8(__m256 x) {
- __m128 max4 = _mm_max_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x));
- max4 = _mm_max_ps( max4, _mm_movehl_ps(max4, max4));
- max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4));
- return _mm_cvtss_f32(max4);
-}
-}
-#endif
void iqk_quantize_row_q8_K(const float * x, void * vy, int64_t k) {
assert(k % QK_K == 0);
@@ -3680,3 +3793,126 @@ void vec_dot_iq4_xs_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t
GGML_UNUSED(bx);
GGML_UNUSED(by);
}
+
+//
+// ========================================= iq2_bn_r4
+//
+void quantize_row_iq2_bn_r4_ref(const float * x, block_iq2_bn * y, int64_t k) {
+ quantize_iq2_bn_r4(x, (void *)y, 4, k/4, nullptr);
+}
+
+void quantize_row_iq2_bn_r4(const float * x, void * y, int64_t k) {
+ quantize_iq2_bn_r4(x, y, 4, k/4, nullptr);
+}
+
+namespace {
+void repack_iq2_bn(int nrows, int n_per_row, const char * x, char * y) {
+ GGML_ASSERT(nrows%4 == 0);
+ GGML_ASSERT(n_per_row%QK_IQ1BN == 0);
+ int nblock = n_per_row/QK_IQ1BN;
+ auto row_size = ggml_row_size(GGML_TYPE_IQ2_BN, n_per_row);
+ const uint8_t * x4[4];
+ for (int row = 0; row < nrows; row += 4) {
+ float * dr4 = (float *)(y + 4*row*row_size);
+ for (int k = 0; k < 4; ++k) {
+ const float * dptr = (const float *)(x + (row + k)*row_size);
+ dr4[k] = *dptr;
+ x4[k] = (const uint8_t *)(dptr + 1);
+ }
+ uint8_t * y4 = (uint8_t *)(dr4 + 4);
+ //std::memset(y4, 0, n_per_row);
+ for (int ib = 0; ib < nblock; ++ib) {
+ // 0...3 from rows 0...3 go to 1st 2 bits of 0...15
+ // 16..19 from rows 0...3 go to 1st 2 bits of 16...31
+ // 32..35 from rows 0...3 go to 1st 2 bits of 32...47
+ // 48..51 from rows 0...3 go to 1st 2 bits of 48...63
+ // 4...7 from rows 0...3 go to 2nd 2 bits of 0...15
+ // 20..23 from rows 0...3 go to 2nd 2 bits of 16...31
+ // 36..39 from rows 0...3 go to 2nd 2 bits of 32...47
+ // 52..55 from rows 0...3 go to 2nd 2 bits of 48...63
+ // 8..11 from rows 0...3 go to 3rd 2 bits of 0...15
+ // 24..27 from rows 0...3 go to 3rd 2 bits of 16...31
+ // 40..43 from rows 0...3 go to 3rd 2 bits of 32...47
+ // 56..59 from rows 0...3 go to 3rd 2 bits of 48...63
+ // 12..15 from rows 0...3 go to 4th 2 bits of 0...15
+ // 28..31 from rows 0...3 go to 4th 2 bits of 16...31
+ // 44..47 from rows 0...3 go to 4th 2 bits of 32...47
+ // 60..63 from rows 0...3 go to 4th 2 bits of 48...63
+ for (int k = 0; k < 4; ++k) {
+ for (int l = 0; l < 4; ++l) for (int i = 0; i < 4; ++i) {
+ y4[64*ib + 4*k + i + 16*l] = (((x4[k][16*ib + i + 0] >> 2*l) & 3) << 0) |
+ (((x4[k][16*ib + i + 4] >> 2*l) & 3) << 2) |
+ (((x4[k][16*ib + i + 8] >> 2*l) & 3) << 4) |
+ (((x4[k][16*ib + i + 12] >> 2*l) & 3) << 6);
+ //y4[64*ib + 4*k + i + 0] |= (x4[k][16*ib + i] >> 0) & 3;
+ //y4[64*ib + 4*k + i + 16] |= (x4[k][16*ib + i] >> 2) & 3;
+ //y4[64*ib + 4*k + i + 32] |= (x4[k][16*ib + i] >> 4) & 3;
+ //y4[64*ib + 4*k + i + 48] |= (x4[k][16*ib + i] >> 6) & 3;
+ //y4[64*ib + 4*k + i + 0] |= ((x4[k][16*ib + i + 4] >> 0) & 3) << 2;
+ //y4[64*ib + 4*k + i + 16] |= ((x4[k][16*ib + i + 4] >> 2) & 3) << 2;
+ //y4[64*ib + 4*k + i + 32] |= ((x4[k][16*ib + i + 4] >> 4) & 3) << 2;
+ //y4[64*ib + 4*k + i + 48] |= ((x4[k][16*ib + i + 4] >> 6) & 3) << 2;
+ //y4[64*ib + 4*k + i + 0] |= ((x4[k][16*ib + i + 8] >> 0) & 3) << 4;
+ //y4[64*ib + 4*k + i + 16] |= ((x4[k][16*ib + i + 8] >> 2) & 3) << 4;
+ //y4[64*ib + 4*k + i + 32] |= ((x4[k][16*ib + i + 8] >> 4) & 3) << 4;
+ //y4[64*ib + 4*k + i + 48] |= ((x4[k][16*ib + i + 8] >> 6) & 3) << 4;
+ //y4[64*ib + 4*k + i + 0] |= ((x4[k][16*ib + i + 12] >> 0) & 3) << 6;
+ //y4[64*ib + 4*k + i + 16] |= ((x4[k][16*ib + i + 12] >> 2) & 3) << 6;
+ //y4[64*ib + 4*k + i + 32] |= ((x4[k][16*ib + i + 12] >> 4) & 3) << 6;
+ //y4[64*ib + 4*k + i + 48] |= ((x4[k][16*ib + i + 12] >> 6) & 3) << 6;
+ }
+ }
+ }
+ }
+}
+}
+
+size_t quantize_iq2_bn_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+ GGML_ASSERT(nrows%4 == 0);
+ GGML_ASSERT(n_per_row%QK_IQ1BN == 0);
+ char * qcur = (char *)dst;
+ auto row_size = ggml_row_size(GGML_TYPE_IQ2_BN, n_per_row);
+ std::vector<char> qtmp(4*row_size);
+ for (int row = 0; row < nrows; row += 4) {
+ quantize_iq2_bn(src, (void *)qtmp.data(), 4, n_per_row, imatrix);
+ repack_iq2_bn(4, n_per_row, qtmp.data(), qcur);
+ qcur += 4*row_size;
+ src += 4*n_per_row;
+ }
+ return nrows*row_size;
+}
+
+void dequantize_row_iq2_bn_r4(const block_iq2_bn * x, float * y, int64_t k) {
+ static_assert(QK_IQ1BN == 64);
+ auto n_per_row = k/4;
+ float * y4[4] = {y, y + n_per_row, y + 2*n_per_row, y + 3*n_per_row};
+ const float * d4 = (const float *)x;
+ const uint8_t * qx = (const uint8_t *)(d4 + 4);
+ int nblock = n_per_row/QK_IQ1BN;
+ for (int ib = 0; ib < nblock; ++ib) {
+ for (int k = 0; k < 4; ++k) {
+ for (int l = 0; l < 4; ++l) for (int i = 0; i < 4; ++i) {
+ uint8_t q = qx[4*k + i + 16*l];
+ y4[k][64*ib + 16*l + i + 0] = d4[k] * (((q >> 0) & 3) - 1);
+ y4[k][64*ib + 16*l + i + 4] = d4[k] * (((q >> 2) & 3) - 1);
+ y4[k][64*ib + 16*l + i + 8] = d4[k] * (((q >> 4) & 3) - 1);
+ y4[k][64*ib + 16*l + i + 12] = d4[k] * (((q >> 6) & 3) - 1);
+ }
+ }
+ qx += 64;
+ }
+}
+
+void vec_dot_iq2_bn_r4_q8_K64(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
+#if GGML_USE_IQK_MULMAT
+ if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ2_BN_R4, vx, 0, GGML_TYPE_Q8_K64, vy, 0, s, 0, 0, 1)) {
+ return;
+ }
+#endif
+ GGML_ASSERT(n%QK4_NL == 0);
+ GGML_ASSERT(nrc == 1);
+ GGML_UNUSED(bs);
+ GGML_UNUSED(bx);
+ GGML_UNUSED(by);
+}
+
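
A quick way to sanity-check the repacking, as a sketch (it assumes the declarations from iqk_quantize.h plus <vector>; the helper name is made up): quantize four rows with both the plain and the interleaved type, dequantize both, and compare. Exact equality should hold because repacking only permutes where the 2-bit values are stored.

    // Hypothetical helper: round-trip check for the 4-row interleaving.
    static void check_iq2_bn_r4_roundtrip(const float * src, int n_per_row) {
        const size_t row_size = ggml_row_size(GGML_TYPE_IQ2_BN, n_per_row);
        std::vector<char> plain(4*row_size), packed(4*row_size);
        quantize_iq2_bn   (src, plain.data(),  4, n_per_row, nullptr);
        quantize_iq2_bn_r4(src, packed.data(), 4, n_per_row, nullptr);
        std::vector<float> a(4*n_per_row), b(4*n_per_row);
        for (int r = 0; r < 4; ++r) {  // plain type: one row at a time
            dequantize_row_iq2_bn((const block_iq2_bn *)(plain.data() + r*row_size), a.data() + r*n_per_row, n_per_row);
        }
        dequantize_row_iq2_bn_r4((const block_iq2_bn *)packed.data(), b.data(), 4*n_per_row);  // all 4 rows at once
        for (size_t i = 0; i < a.size(); ++i) GGML_ASSERT(a[i] == b[i]);
    }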
diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h
index ad2294c5..fb436e57 100644
--- a/ggml/src/iqk/iqk_quantize.h
+++ b/ggml/src/iqk/iqk_quantize.h
@@ -99,6 +99,22 @@ size_t quantize_iq4_xs_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT
void dequantize_row_iq4_xs_r4(const block_iq4_xs_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void vec_dot_iq4_xs_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void quantize_row_iq2_bn_ref (const float * GGML_RESTRICT x, block_iq2_bn * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq2_bn (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq2_bn (const block_iq2_bn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+size_t quantize_iq2_bn (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+void vec_dot_iq2_bn_q8_K64(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void quantize_row_iq2_bn_r4_ref (const float * GGML_RESTRICT x, block_iq2_bn * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq2_bn_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq2_bn_r4(const block_iq2_bn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+size_t quantize_iq2_bn_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+void vec_dot_iq2_bn_r4_q8_K64(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void quantize_row_q8_K64_ref(const float * GGML_RESTRICT x, block_q8_K64 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_K64(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_K16(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
#ifdef __cplusplus
}
#endif