author    Kawrakow <iwankawrakow@gmail.com>  2025-02-06 14:08:52 +0200
committer GitHub <noreply@github.com>        2025-02-06 14:08:52 +0200
commit    7f61b3068e18728e5e7e2b95546ff03dd2fd41ac (patch)
tree      f175a942a6ebd2d2d8b08c46fa71d9f6fbad50e7 /ggml
parent    a6f9f2ec9af92b5a13f035db054aac2fd2efaee7 (diff)
IQ1_M_R4: better 1.75 bpw quants (#187)
* iq1_m_r4: basics (quantize/dequantize)

* iq1_m_r4: Zen4 gemm

* iq1_m_r4: neon gemm

* iq1_m_r4: switch to q8_0_x4 also on AVX2/Zen4

  With the deltas being per group of 8, we cannot make use of the q8 sums stored in q8_1, so we get a tiny gain by using q8_0_x4.

* iq1_m_r4: rename mul_mat_iq1_m_r4_q8_1 to mul_mat_iq1_m_r4_q8_0

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
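The q8_0_x4 switch in the fourth bullet comes down to a missing partial sum: with IQ1_S the delta is constant over a 32-quant block, so the delta term factors into delta * sum(y) and the per-block activation sum stored in q8_1 can be reused. For IQ1_M the delta flips per group of 8, so per-8 sums of y must be computed anyway. A minimal scalar sketch of that reasoning (the helper name and signature are invented for illustration, not code from this patch):

// Scalar reference for one 32-quant IQ1_M block: quants decode to
// (g[i] - 1) + delta_g, with delta_g signed per group of 8.
static float iq1m_block_dot_ref(float d,              // block scale
                                const int8_t g[32],   // grid values, here in {0,1,2}
                                const float delta[4], // one signed delta per group of 8
                                const int8_t y[32]) { // q8 activations
    float sum = 0.0f;
    for (int grp = 0; grp < 4; ++grp) {
        int sum_gy = 0, sum_y = 0;
        for (int j = 0; j < 8; ++j) {
            sum_gy += (g[8*grp + j] - 1) * y[8*grp + j];
            sum_y  += y[8*grp + j];
        }
        // With a per-block delta, sum_y could be replaced by the precomputed
        // q8_1 block sum; with a per-8 delta it must be computed here anyway.
        sum += sum_gy + delta[grp] * sum_y;
    }
    return d * sum;
}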
Diffstat (limited to 'ggml')
-rw-r--r--  ggml/include/ggml.h              2
-rw-r--r--  ggml/src/ggml-common.h           8
-rw-r--r--  ggml/src/ggml-quants.c         403
-rw-r--r--  ggml/src/ggml-quants.h           4
-rw-r--r--  ggml/src/ggml.c                 27
-rw-r--r--  ggml/src/iqk/iqk_mul_mat.cpp   197
-rw-r--r--  ggml/src/iqk/iqk_quantize.cpp  117
-rw-r--r--  ggml/src/iqk/iqk_quantize.h      6
8 files changed, 538 insertions, 226 deletions
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 9668dc32..77ac33a9 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -432,6 +432,7 @@ extern "C" {
GGML_TYPE_IQ3_S_R4 = 221,
GGML_TYPE_IQ2_S_R4 = 222,
GGML_TYPE_IQ4_XS_R4 = 223,
+ GGML_TYPE_IQ1_M_R4 = 229,
GGML_TYPE_BF16_R16 = 230,
GGML_TYPE_Q6_0_R4 = 233,
GGML_TYPE_IQ2_BN_R4 = 335,
@@ -516,6 +517,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ3_S_R4 = 220, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ2_S_R4 = 221, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ4_XS_R4 = 222, // except 1d tensors
+ GGML_FTYPE_MOSTLY_IQ1_M_R4 = 223, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16_R16 = 224, // except 1d tensors
GGML_FTYPE_MOSTLY_Q6_0_R4 = 227, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ2_BN_R4 = 329, // except 1d tensors
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index 14813161..679353be 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -499,6 +499,14 @@ typedef struct {
} block_iq1_m;
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
+// 1.75 bpw - blocks of 32 with 4 interleaved rows = 128 quants
+typedef struct {
+ uint8_t qs[16]; // grid index, low 8 bits
+ uint8_t qh[ 8]; // grid index, high 3 bits + grid shift bits (for two groups of 8)
+ uint8_t scales[4]; // 4-bit block scales
+} block_iq1_m_r4;
+static_assert(sizeof(block_iq1_m_r4) == 28, "wrong iq1_m_r4 block size/padding");
+
//
// Bitnet and TriLM - implemented as 1.625 bpw
//
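The density claimed for block_iq1_m_r4 above checks out: 16 B of qs + 8 B of qh + 4 B of scales is 28 B covering 4 interleaved rows x 32 quants = 128 quants, i.e. exactly 1.75 bits per weight; the per-row fp16 super-scales live outside the blocks (row_meta_size = 2 bytes per row in the ggml.c type traits further down). A quick sanity check, as a sketch rather than part of the patch:

#include <assert.h>
int main(void) {
    const int bytes_per_block  = 16 + 8 + 4; // qs + qh + scales
    const int quants_per_block = 4 * 32;     // 4 interleaved rows x 32 quants
    assert(8 * bytes_per_block * 100 == 175 * quants_per_block); // 1.75 bpw
    return 0;
}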
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 3c4711f3..d32a583f 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -14145,85 +14145,6 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
int best_shift;
iq1s_process_1block(block_size, xb, weight, L, &scales[ib], index, &best_shift, pairs, sumx, sumw);
-// float max = fabsf(xb[0]);
-// for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
-// if (max < GROUP_MAX_EPS_IQ1_S) {
-// scales[ib] = 0;
-// memset(L, 1, block_size);
-// continue;
-// }
-// // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
-// // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
-// // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
-// // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
-// // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale
-// // for each possible and score for each split.
-// for (int j = 0; j < block_size; ++j) {
-// pairs[2*j] = xb[j];
-// idx[2*j] = j;
-// }
-// qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
-// {
-// sumx[0] = sumw[0] = 0;
-// for (int j = 0; j < block_size; ++j) {
-// int i = idx[2*j];
-// sumx[j+1] = sumx[j] + weight[i]*xb[i];
-// sumw[j+1] = sumw[j] + weight[i];
-// }
-// }
-// float best_score = -FLT_MIN, scale = max;
-// int besti1 = -1, besti2 = -1, best_shift = 0;
-// for (int i1 = 0; i1 <= block_size; ++i1) {
-// for (int i2 = i1; i2 <= block_size; ++i2) {
-// float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2];
-// float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2];
-// if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
-// scale = sumqx/sumq2; best_score = scale*sumqx;
-// besti1 = i1; besti2 = i2; best_shift = 1;
-// }
-// sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2];
-// sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2];
-// if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
-// scale = sumqx/sumq2; best_score = scale*sumqx;
-// besti1 = i1; besti2 = i2; best_shift = -1;
-// }
-// }
-// }
-// GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0);
-// for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
-// for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
-// for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
-// if (scale < 0) {
-// for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
-// scale = -scale; best_shift = -best_shift;
-// }
-// bool all_on_grid = true;
-// const float * xx = best_shift == 1 ? x_p : x_m;
-// for (int k = 0; k < block_size/8; ++k) {
-// uint16_t u = 0;
-// for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
-// int grid_index = kmap_q2xs[u];
-// if (grid_index < 0) {
-// all_on_grid = false;
-// const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
-// grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
-// GGML_ASSERT(grid_index >= 0);
-// }
-// index[k] = grid_index;
-// }
-// if (!all_on_grid) {
-// float sumqx = 0, sumq2 = 0;
-// for (int k = 0; k < block_size/8; ++k) {
-// const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
-// for (int j = 0; j < 8; ++j) {
-// float w = weight[8*k + j];
-// float q = xx[(pg[j] - 1)/2];
-// sumqx += w*q*xb[8*k+j];
-// sumq2 += w*q*q;
-// }
-// }
-// if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2;
-// }
uint16_t h = 0;
for (int k = 0; k < block_size/8; ++k) {
y[ibl].qs[(block_size/8)*ib + k] = index[k] & 255;
@@ -14232,10 +14153,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
y[ibl].qh[ib] = h;
GGML_ASSERT(scales[ib] >= 0);
max_scale = MAX(max_scale, scales[ib]);
- //GGML_ASSERT(scale >= 0);
- //scales[ib] = scale;
shifts[ib] = best_shift;
- //max_scale = MAX(max_scale, scale);
}
if (!max_scale) {
@@ -14287,6 +14205,166 @@ void quantize_row_iq1_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
quantize_row_iq1_s_ref(x, (block_iq1_s *)y, k);
}
+void iq1m_process_1block(const float * xb, const float * weight, int8_t * L, float * the_scale, uint16_t * the_index, int * the_shift,
+ float * pairs) {
+
+ const int block_size = IQ1M_BLOCK_SIZE;
+
+ const float x_p[3] = {-1 + IQ1M_DELTA, IQ1M_DELTA, 1 + IQ1M_DELTA};
+ const float x_m[3] = {-1 - IQ1M_DELTA, -IQ1M_DELTA, 1 - IQ1M_DELTA};
+
+ float sumqx[4], sumq2[4];
+
+ const int gindex = iq2_data_index(GGML_TYPE_IQ1_M);
+
+ const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
+ const int * kmap_q2xs = iq2_data[gindex].map;
+ const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
+
+ GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
+ GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
+ GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
+
+ // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
+ // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
+ // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
+ // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
+ // Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum scale
+ // and score for each possible split.
+ int * idx = (int *)(pairs + 1);
+ for (int j = 0; j < block_size; ++j) {
+ pairs[2*j] = xb[j];
+ idx[2*j] = j;
+ }
+ qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
+ float best_score = -FLT_MIN, scale = 0.f;
+ int besti1 = -1, besti2 = -1, best_k = -1;
+ // 0: +, +
+ // 1: +, -
+ // 2: -, +
+ // 3: -, -
+ for (int i1 = 0; i1 <= block_size; ++i1) {
+ for (int i2 = i1; i2 <= block_size; ++i2) {
+ memset(sumqx, 0, 4*sizeof(float));
+ memset(sumq2, 0, 4*sizeof(float));
+ for (int j = 0; j < i1; ++j) {
+ int i = idx[2*j];
+ if (i < block_size/2) {
+ sumqx[0] += weight[i]*x_p[0]*xb[i];
+ sumqx[1] += weight[i]*x_p[0]*xb[i];
+ sumqx[2] += weight[i]*x_m[0]*xb[i];
+ sumqx[3] += weight[i]*x_m[0]*xb[i];
+ sumq2[0] += weight[i]*x_p[0]*x_p[0];
+ sumq2[1] += weight[i]*x_p[0]*x_p[0];
+ sumq2[2] += weight[i]*x_m[0]*x_m[0];
+ sumq2[3] += weight[i]*x_m[0]*x_m[0];
+ } else {
+ sumqx[0] += weight[i]*x_p[0]*xb[i];
+ sumqx[2] += weight[i]*x_p[0]*xb[i];
+ sumqx[1] += weight[i]*x_m[0]*xb[i];
+ sumqx[3] += weight[i]*x_m[0]*xb[i];
+ sumq2[0] += weight[i]*x_p[0]*x_p[0];
+ sumq2[2] += weight[i]*x_p[0]*x_p[0];
+ sumq2[1] += weight[i]*x_m[0]*x_m[0];
+ sumq2[3] += weight[i]*x_m[0]*x_m[0];
+ }
+ }
+ for (int j = i1; j < i2; ++j) {
+ int i = idx[2*j];
+ if (i < block_size/2) {
+ sumqx[0] += weight[i]*x_p[1]*xb[i];
+ sumqx[1] += weight[i]*x_p[1]*xb[i];
+ sumqx[2] += weight[i]*x_m[1]*xb[i];
+ sumqx[3] += weight[i]*x_m[1]*xb[i];
+ sumq2[0] += weight[i]*x_p[1]*x_p[1];
+ sumq2[1] += weight[i]*x_p[1]*x_p[1];
+ sumq2[2] += weight[i]*x_m[1]*x_m[1];
+ sumq2[3] += weight[i]*x_m[1]*x_m[1];
+ } else {
+ sumqx[0] += weight[i]*x_p[1]*xb[i];
+ sumqx[2] += weight[i]*x_p[1]*xb[i];
+ sumqx[1] += weight[i]*x_m[1]*xb[i];
+ sumqx[3] += weight[i]*x_m[1]*xb[i];
+ sumq2[0] += weight[i]*x_p[1]*x_p[1];
+ sumq2[2] += weight[i]*x_p[1]*x_p[1];
+ sumq2[1] += weight[i]*x_m[1]*x_m[1];
+ sumq2[3] += weight[i]*x_m[1]*x_m[1];
+ }
+ }
+ for (int j = i2; j < block_size; ++j) {
+ int i = idx[2*j];
+ if (i < block_size/2) {
+ sumqx[0] += weight[i]*x_p[2]*xb[i];
+ sumqx[1] += weight[i]*x_p[2]*xb[i];
+ sumqx[2] += weight[i]*x_m[2]*xb[i];
+ sumqx[3] += weight[i]*x_m[2]*xb[i];
+ sumq2[0] += weight[i]*x_p[2]*x_p[2];
+ sumq2[1] += weight[i]*x_p[2]*x_p[2];
+ sumq2[2] += weight[i]*x_m[2]*x_m[2];
+ sumq2[3] += weight[i]*x_m[2]*x_m[2];
+ } else {
+ sumqx[0] += weight[i]*x_p[2]*xb[i];
+ sumqx[2] += weight[i]*x_p[2]*xb[i];
+ sumqx[1] += weight[i]*x_m[2]*xb[i];
+ sumqx[3] += weight[i]*x_m[2]*xb[i];
+ sumq2[0] += weight[i]*x_p[2]*x_p[2];
+ sumq2[2] += weight[i]*x_p[2]*x_p[2];
+ sumq2[1] += weight[i]*x_m[2]*x_m[2];
+ sumq2[3] += weight[i]*x_m[2]*x_m[2];
+ }
+ }
+ for (int k = 0; k < 4; ++k) {
+ if (sumq2[k] > 0 && sumqx[k]*sumqx[k] > best_score*sumq2[k]) {
+ scale = sumqx[k]/sumq2[k]; best_score = scale*sumqx[k];
+ besti1 = i1; besti2 = i2; best_k = k;
+ }
+ }
+ }
+ }
+ GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_k >= 0);
+ for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
+ for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
+ for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
+ if (scale < 0) {
+ for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
+ scale = -scale;
+ best_k = 3 - best_k;
+ }
+ bool all_on_grid = true;
+ const float * xx;
+ for (int k = 0; k < block_size/8; ++k) {
+ if (k == 0) xx = best_k < 2 ? x_p : x_m;
+ else xx = best_k%2 == 0 ? x_p : x_m;
+ uint16_t u = 0;
+ for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
+ int grid_index = kmap_q2xs[u];
+ if (grid_index < 0) {
+ all_on_grid = false;
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+ grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
+ GGML_ASSERT(grid_index >= 0);
+ }
+ the_index[k] = grid_index;
+ }
+ if (!all_on_grid) {
+ float sumqx_f = 0, sumq2_f = 0;
+ for (int k = 0; k < block_size/8; ++k) {
+ if (k == 0) xx = best_k < 2 ? x_p : x_m;
+ else xx = best_k%2 == 0 ? x_p : x_m;
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + the_index[k]);
+ for (int j = 0; j < 8; ++j) {
+ float w = weight[8*k + j];
+ float q = xx[(pg[j] - 1)/2];
+ sumqx_f += w*q*xb[8*k+j];
+ sumq2_f += w*q*q;
+ }
+ }
+ if (sumqx_f > 0 && sumq2_f > 0) scale = sumqx_f/sumq2_f;
+ }
+ *the_scale = scale;
+ *the_shift = best_k;
+}
+
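The split search in iq1m_process_1block is plain weighted least squares. For a fixed split (i1, i2) and sign pattern k, with sumqx = sum_i w_i q_i x_i and sumq2 = sum_i w_i q_i^2 as accumulated above,

    \min_d \sum_i w_i (x_i - d\,q_i)^2
    \;\Rightarrow\;
    d^* = \frac{\sum_i w_i q_i x_i}{\sum_i w_i q_i^2},
    \qquad
    E(d^*) = \sum_i w_i x_i^2 - \frac{\bigl(\sum_i w_i q_i x_i\bigr)^2}{\sum_i w_i q_i^2},

so maximizing sumqx^2/sumq2 minimizes the error, and the comparison sumqx[k]*sumqx[k] > best_score*sumq2[k] is the division-free form of that test.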
static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
float * scales,
float * weight,
@@ -14301,7 +14379,6 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
const int * kmap_q2xs = iq2_data[gindex].map;
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
- //GGML_ASSERT(quant_weights && "missing quantization weights");
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
@@ -14317,10 +14394,6 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
const float x_m[3] = {-1 - IQ1M_DELTA, -IQ1M_DELTA, 1 - IQ1M_DELTA};
const uint8_t masks[4] = {0x00, 0x80, 0x08, 0x88};
- int * idx = (int *)(pairs + 1);
-
- float sumqx[4], sumq2[4];
-
iq1m_scale_t s;
const float * xx;
@@ -14351,147 +14424,15 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
memset(L, 1, block_size);
continue;
}
- // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
- // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
- // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
- // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
- // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale
- // for each possible and score for each split.
- for (int j = 0; j < block_size; ++j) {
- pairs[2*j] = xb[j];
- idx[2*j] = j;
- }
- qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
- float best_score = -FLT_MIN, scale = max;
- int besti1 = -1, besti2 = -1, best_k = -1;
- // 0: +, +
- // 1: +, -
- // 2: -, +
- // 3: -, -
- for (int i1 = 0; i1 <= block_size; ++i1) {
- for (int i2 = i1; i2 <= block_size; ++i2) {
- memset(sumqx, 0, 4*sizeof(float));
- memset(sumq2, 0, 4*sizeof(float));
- for (int j = 0; j < i1; ++j) {
- int i = idx[2*j];
- if (i < block_size/2) {
- sumqx[0] += weight[i]*x_p[0]*xb[i];
- sumqx[1] += weight[i]*x_p[0]*xb[i];
- sumqx[2] += weight[i]*x_m[0]*xb[i];
- sumqx[3] += weight[i]*x_m[0]*xb[i];
- sumq2[0] += weight[i]*x_p[0]*x_p[0];
- sumq2[1] += weight[i]*x_p[0]*x_p[0];
- sumq2[2] += weight[i]*x_m[0]*x_m[0];
- sumq2[3] += weight[i]*x_m[0]*x_m[0];
- } else {
- sumqx[0] += weight[i]*x_p[0]*xb[i];
- sumqx[2] += weight[i]*x_p[0]*xb[i];
- sumqx[1] += weight[i]*x_m[0]*xb[i];
- sumqx[3] += weight[i]*x_m[0]*xb[i];
- sumq2[0] += weight[i]*x_p[0]*x_p[0];
- sumq2[2] += weight[i]*x_p[0]*x_p[0];
- sumq2[1] += weight[i]*x_m[0]*x_m[0];
- sumq2[3] += weight[i]*x_m[0]*x_m[0];
- }
- }
- for (int j = i1; j < i2; ++j) {
- int i = idx[2*j];
- if (i < block_size/2) {
- sumqx[0] += weight[i]*x_p[1]*xb[i];
- sumqx[1] += weight[i]*x_p[1]*xb[i];
- sumqx[2] += weight[i]*x_m[1]*xb[i];
- sumqx[3] += weight[i]*x_m[1]*xb[i];
- sumq2[0] += weight[i]*x_p[1]*x_p[1];
- sumq2[1] += weight[i]*x_p[1]*x_p[1];
- sumq2[2] += weight[i]*x_m[1]*x_m[1];
- sumq2[3] += weight[i]*x_m[1]*x_m[1];
- } else {
- sumqx[0] += weight[i]*x_p[1]*xb[i];
- sumqx[2] += weight[i]*x_p[1]*xb[i];
- sumqx[1] += weight[i]*x_m[1]*xb[i];
- sumqx[3] += weight[i]*x_m[1]*xb[i];
- sumq2[0] += weight[i]*x_p[1]*x_p[1];
- sumq2[2] += weight[i]*x_p[1]*x_p[1];
- sumq2[1] += weight[i]*x_m[1]*x_m[1];
- sumq2[3] += weight[i]*x_m[1]*x_m[1];
- }
- }
- for (int j = i2; j < block_size; ++j) {
- int i = idx[2*j];
- if (i < block_size/2) {
- sumqx[0] += weight[i]*x_p[2]*xb[i];
- sumqx[1] += weight[i]*x_p[2]*xb[i];
- sumqx[2] += weight[i]*x_m[2]*xb[i];
- sumqx[3] += weight[i]*x_m[2]*xb[i];
- sumq2[0] += weight[i]*x_p[2]*x_p[2];
- sumq2[1] += weight[i]*x_p[2]*x_p[2];
- sumq2[2] += weight[i]*x_m[2]*x_m[2];
- sumq2[3] += weight[i]*x_m[2]*x_m[2];
- } else {
- sumqx[0] += weight[i]*x_p[2]*xb[i];
- sumqx[2] += weight[i]*x_p[2]*xb[i];
- sumqx[1] += weight[i]*x_m[2]*xb[i];
- sumqx[3] += weight[i]*x_m[2]*xb[i];
- sumq2[0] += weight[i]*x_p[2]*x_p[2];
- sumq2[2] += weight[i]*x_p[2]*x_p[2];
- sumq2[1] += weight[i]*x_m[2]*x_m[2];
- sumq2[3] += weight[i]*x_m[2]*x_m[2];
- }
- }
- for (int k = 0; k < 4; ++k) {
- if (sumq2[k] > 0 && sumqx[k]*sumqx[k] > best_score*sumq2[k]) {
- scale = sumqx[k]/sumq2[k]; best_score = scale*sumqx[k];
- besti1 = i1; besti2 = i2; best_k = k;
- }
- }
- }
- }
- GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_k >= 0);
- for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
- for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
- for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
- if (scale < 0) {
- for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
- scale = -scale;
- best_k = best_k == 0 ? 3 : best_k == 1 ? 2 : best_k == 2 ? 1 : 0;
- }
- bool all_on_grid = true;
- for (int k = 0; k < block_size/8; ++k) {
- if (k == 0) xx = best_k < 2 ? x_p : x_m;
- else xx = best_k%2 == 0 ? x_p : x_m;
- uint16_t u = 0;
- for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
- int grid_index = kmap_q2xs[u];
- if (grid_index < 0) {
- all_on_grid = false;
- const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
- grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
- GGML_ASSERT(grid_index >= 0);
- }
- index[k] = grid_index;
- }
- if (!all_on_grid) {
- float sumqx_f = 0, sumq2_f = 0;
- for (int k = 0; k < block_size/8; ++k) {
- if (k == 0) xx = best_k < 2 ? x_p : x_m;
- else xx = best_k%2 == 0 ? x_p : x_m;
- const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
- for (int j = 0; j < 8; ++j) {
- float w = weight[8*k + j];
- float q = xx[(pg[j] - 1)/2];
- sumqx_f += w*q*xb[8*k+j];
- sumq2_f += w*q*q;
- }
- }
- if (sumqx_f > 0 && sumq2_f > 0) scale = sumqx_f/sumq2_f;
- }
+
+ int best_k = -1;
+ iq1m_process_1block(xb, weight, L, &scales[ib], index, &best_k, pairs);
+
y[ibl].qs[2*ib + 0] = index[0] & 255;
y[ibl].qs[2*ib + 1] = index[1] & 255;
y[ibl].qh[ib] = (index[0] >> 8) | ((index[1] >> 8) << 4);
- GGML_ASSERT(scale >= 0);
- scales[ib] = scale;
shifts[ib] = best_k;
- max_scale = MAX(max_scale, scale);
+ max_scale = MAX(max_scale, scales[ib]);
}
if (!max_scale) {
@@ -14553,6 +14494,19 @@ size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t n
return nrow * nblock * sizeof(block_iq1_m);
}
+void quantize_row_iq1_m_ref (const float * GGML_RESTRICT x, block_iq1_m * GGML_RESTRICT y, int64_t k) {
+ int nblock = k/QK_K;
+ float qw[QK_K];
+ for (int j = 0; j < QK_K; ++j) qw[j] = 1;
+ for (int ibl = 0; ibl < nblock; ++ibl) {
+ quantize_iq1_m(x + ibl*QK_K, &y[ibl], 1, QK_K, qw);
+ }
+}
+
+void quantize_row_iq1_m (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
+ quantize_row_iq1_m_ref(x, (block_iq1_m *)y, k);
+}
+
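A hedged usage sketch for the new entry points (not part of the patch; assumes a QK_K-aligned row length and the usual ggml headers):

#include <stdlib.h>
#include "ggml.h"
#include "ggml-quants.h"

// Quantize one k-element row to IQ1_M with unit importance weights.
static block_iq1_m * quantize_one_row_iq1_m(const float * row, int64_t k) {
    ggml_quantize_init(GGML_TYPE_IQ1_M);  // builds the iq1 grid/neighbour tables
    block_iq1_m * q = malloc((k/QK_K) * sizeof(block_iq1_m));
    if (q) quantize_row_iq1_m(row, q, k); // wraps the _ref path above
    return q;                             // caller frees
}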
// ============================ 4-bit non-linear quants
static const int8_t iq4nl_index[241] = {
@@ -15246,6 +15200,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
case GGML_TYPE_IQ3_S_R4: break;
case GGML_TYPE_IQ2_S_R4: break;
case GGML_TYPE_IQ1_S_R4: break;
+ case GGML_TYPE_IQ1_M_R4: break;
case GGML_TYPE_Q4_0_R4: break;
case GGML_TYPE_Q5_0_R4: break;
case GGML_TYPE_Q6_0_R4: break;
diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
index 4753f342..7c8e2110 100644
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -43,6 +43,7 @@ void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGM
void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
void quantize_row_iq1_bn_ref (const float * GGML_RESTRICT x, block_iq1_bn * GGML_RESTRICT y, int64_t k);
void quantize_row_iq1_s_ref (const float * GGML_RESTRICT x, block_iq1_s * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq1_m_ref (const float * GGML_RESTRICT x, block_iq1_m * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -68,6 +69,7 @@ void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq1_bn (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq1_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq1_m (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
// Dequantization
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
@@ -152,6 +154,8 @@ void iq3xs_free_impl(int grid_size);
void iq1s_process_1block(int block_size, const float * xb, const float * weight, int8_t * L,
float * the_scale, uint16_t * the_index, int * the_shift, float * pairs, float * sumx, float * sumw);
+void iq1m_process_1block(const float * xb, const float * weight, int8_t * L,
+ float * the_scale, uint16_t * the_index, int * the_shift, float * pairs);
#if defined(__ARM_FEATURE_SVE)
extern int ggml_sve_cnt_b;
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 64b7d3ce..4199a282 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1202,13 +1202,26 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.type_size = sizeof(block_iq1_m),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq1_m,
- .from_float = NULL,
- .from_float_ref = NULL,
+ .from_float = quantize_row_iq1_m,
+ .from_float_ref = (ggml_from_float_t)quantize_row_iq1_m_ref,
.vec_dot = ggml_vec_dot_iq1_m_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.row_meta_size = 0,
},
+ [GGML_TYPE_IQ1_M_R4] = {
+ .type_name = "iq1_m_r4",
+ .blck_size = 32,
+ .type_size = sizeof(block_iq1_m_r4)/4,
+ .is_quantized = true,
+ .to_float = (ggml_to_float_t) dequantize_row_iq1_m_r4,
+ .from_float = quantize_row_iq1_m_r4,
+ .from_float_ref = (ggml_from_float_t)quantize_row_iq1_m_r4_ref,
+ .vec_dot = vec_dot_iq1_m_r4_q8_k,
+ .vec_dot_type = GGML_TYPE_Q8_0_X4,
+ .nrows = 1,
+ .row_meta_size = 2,
+ },
[GGML_TYPE_IQ1_BN] = {
.type_name = "iq1_bn",
.blck_size = QK_IQ1BN,
@@ -4401,6 +4414,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
case GGML_FTYPE_MOSTLY_IQ2_S_R4: wtype = GGML_TYPE_IQ2_S_R4; break;
case GGML_FTYPE_MOSTLY_IQ1_S_R4: wtype = GGML_TYPE_IQ1_S_R4; break;
+ case GGML_FTYPE_MOSTLY_IQ1_M_R4: wtype = GGML_TYPE_IQ1_M_R4; break;
case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break;
case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break;
case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
@@ -10949,6 +10963,7 @@ static void ggml_compute_forward_add(
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
case GGML_TYPE_IQ1_S_R4:
+ case GGML_TYPE_IQ1_M_R4:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
@@ -11418,6 +11433,7 @@ static void ggml_compute_forward_add1(
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
case GGML_TYPE_IQ1_S_R4:
+ case GGML_TYPE_IQ1_M_R4:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
@@ -11584,6 +11600,7 @@ static void ggml_compute_forward_acc(
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
case GGML_TYPE_IQ1_S_R4:
+ case GGML_TYPE_IQ1_M_R4:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
@@ -14823,6 +14840,7 @@ static void ggml_compute_forward_out_prod(
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
case GGML_TYPE_IQ1_S_R4:
+ case GGML_TYPE_IQ1_M_R4:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
@@ -15229,6 +15247,7 @@ static void ggml_compute_forward_set(
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
case GGML_TYPE_IQ1_S_R4:
+ case GGML_TYPE_IQ1_M_R4:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
@@ -15529,6 +15548,7 @@ static void ggml_compute_forward_get_rows(
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
case GGML_TYPE_IQ1_S_R4:
+ case GGML_TYPE_IQ1_M_R4:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
@@ -16158,6 +16178,7 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
case GGML_TYPE_IQ1_S_R4:
+ case GGML_TYPE_IQ1_M_R4:
case GGML_TYPE_Q8_K:
case GGML_TYPE_Q8_K64:
case GGML_TYPE_Q8_K16:
@@ -22914,6 +22935,7 @@ void ggml_quantize_init(enum ggml_type type) {
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break;
+ case GGML_TYPE_IQ1_M_R4:iq2xs_init_impl(GGML_TYPE_IQ1_M); break;
case GGML_TYPE_IQ1_S_R4:iq2xs_init_impl(GGML_TYPE_IQ1_S); break;
case GGML_TYPE_IQ3_XXS_R4:
case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
@@ -22998,6 +23020,7 @@ size_t ggml_quantize_chunk(
case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_S_R4:result = quantize_iq2_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ1_S_R4:result = quantize_iq1_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ1_M_R4:result = quantize_iq1_m_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ1_BN: result = quantize_iq1_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp
index ea8e8274..57024602 100644
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -260,6 +260,7 @@ struct MulMat {
case GGML_TYPE_IQ2_S_R4:
case GGML_TYPE_IQ3_XXS_R4:
case GGML_TYPE_IQ1_S_R4:
+ case GGML_TYPE_IQ1_M_R4:
case GGML_TYPE_IQ3_S_R4: return 4;
case GGML_TYPE_IQ4_NL_R4:
case GGML_TYPE_Q5_0_R4:
@@ -295,6 +296,7 @@ struct MulMat {
case GGML_TYPE_IQ3_XXS_R4:
case GGML_TYPE_IQ3_S_R4:
case GGML_TYPE_IQ1_S_R4:
+ case GGML_TYPE_IQ1_M_R4:
case GGML_TYPE_IQ2_BN_R4: return 4;
case GGML_TYPE_IQ4_XS_R4:
case GGML_TYPE_Q4_0_R4:
@@ -3609,6 +3611,102 @@ static void mul_mat_iq1_s_r4_q8_1(int n, const void * vx, size_t bx, const DataI
}
}
+template <int nrc_y>
+static void mul_mat_iq1_m_r4_q8_0(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
+ GGML_ASSERT(nrc_x%4 == 0);
+ Q8<nrc_y, block_q8_0_x4> q8(info);
+ int nb = n / 32;
+ GGML_ASSERT(nb%4 == 0);
+ auto shuffle0 = _mm256_set_epi64x(0x0909090909090909, 0x0808080808080808, 0x0101010101010101, 0x0000000000000000);
+ auto step = _mm256_set1_epi8(2);
+#ifndef HAVE_FANCY_SIMD
+ auto m1 = _mm256_set1_epi16(1);
+#endif
+ __m256i qx[4];
+ __m256 acc[nrc_y] = {};
+ auto ms = _mm_set1_epi8(0x08);
+ float d8[4*nrc_y];
+ union { __m256i vec; uint16_t val[16]; } helper;
+ for (int ix= 0; ix < nrc_x; ix += 4) {
+ auto dptr = (const ggml_half *)((const char *)vx + ix*bx);
+ auto d1 = _mm_mul_ps(_mm_set1_ps(0.125f), _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)dptr)));
+ auto x = (const block_iq1_m_r4 *)(dptr + 4);
+ for (int ib = 0; ib < nb/4; ++ib) {
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ _mm_storeu_ps(d8 + 4*iy, _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)q8.y[iy][ib].d)));
+ }
+ for (int k = 0; k < 4; ++k) {
+ auto qh = (const uint32_t *)x[4*ib+k].qh;
+ auto idxh = _mm_set_epi32(qh[1] >> 4, qh[1], qh[0] >> 4, qh[0]);
+ auto scales4 = _mm_set1_epi32(((const uint32_t *)x[4*ib+k].scales)[0]);
+ scales4 = _mm_and_si128(_mm_srlv_epi32(scales4, _mm_set_epi32(4, 0, 4, 0)), _mm_set1_epi8(0xf));
+ scales4 = _mm_cvtepu8_epi16(scales4);
+ auto scales = MM256_SET_M128I(_mm_unpackhi_epi16(scales4, scales4), _mm_unpacklo_epi16(scales4, scales4));
+
+ auto signs128 = _mm_or_si128(_mm_cmpeq_epi8(_mm_and_si128(idxh, ms), ms), _mm_set1_epi8(1));
+ signs128 = _mm_add_epi8(_mm_set1_epi8(-8), signs128);
+ auto signs = MM256_SET_M128I(signs128, signs128);
+ auto idxl = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)x[4*ib+k].qs));
+ idxh = _mm_and_si128(idxh, _mm_set1_epi8(0x07));
+ helper.vec = _mm256_or_si256(idxl, _mm256_slli_epi16(_mm256_cvtepu8_epi16(idxh), 8));
+ qx[0] = _mm256_set_epi64x(iq1s_grid_us[helper.val[ 9]], iq1s_grid_us[helper.val[ 8]],
+ iq1s_grid_us[helper.val[ 1]], iq1s_grid_us[helper.val[ 0]]);
+ qx[1] = _mm256_set_epi64x(iq1s_grid_us[helper.val[13]], iq1s_grid_us[helper.val[12]],
+ iq1s_grid_us[helper.val[ 5]], iq1s_grid_us[helper.val[ 4]]);
+ qx[2] = _mm256_set_epi64x(iq1s_grid_us[helper.val[11]], iq1s_grid_us[helper.val[10]],
+ iq1s_grid_us[helper.val[ 3]], iq1s_grid_us[helper.val[ 2]]);
+ qx[3] = _mm256_set_epi64x(iq1s_grid_us[helper.val[15]], iq1s_grid_us[helper.val[14]],
+ iq1s_grid_us[helper.val[ 7]], iq1s_grid_us[helper.val[ 6]]);
+ qx[0] = _mm256_add_epi8(_mm256_slli_epi16(qx[0], 3), _mm256_shuffle_epi8(signs, shuffle0));
+ auto shuffle = _mm256_add_epi8(shuffle0, step);
+ qx[2] = _mm256_add_epi8(_mm256_slli_epi16(qx[2], 3), _mm256_shuffle_epi8(signs, shuffle));
+ shuffle = _mm256_add_epi8(shuffle, step);
+ qx[1] = _mm256_add_epi8(_mm256_slli_epi16(qx[1], 3), _mm256_shuffle_epi8(signs, shuffle));
+ shuffle = _mm256_add_epi8(shuffle, step);
+ qx[3] = _mm256_add_epi8(_mm256_slli_epi16(qx[3], 3), _mm256_shuffle_epi8(signs, shuffle));
+ auto s0 = _mm256_sign_epi8(qx[0], qx[0]);
+ auto s1 = _mm256_sign_epi8(qx[1], qx[1]);
+ auto s2 = _mm256_sign_epi8(qx[2], qx[2]);
+ auto s3 = _mm256_sign_epi8(qx[3], qx[3]);
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ib].qs + k);
+ auto y1 = _mm256_shuffle_epi32(y, 0x44);
+ auto y2 = _mm256_shuffle_epi32(y, 0xee);
+#ifdef HAVE_FANCY_SIMD
+ // 0,0, 1,1, 0,0, 1,1 as int32_t
+ auto sumi1 = _mm256_dpbusd_epi32(_mm256_dpbusd_epi32(_mm256_setzero_si256(),
+ s0, _mm256_sign_epi8(y1, qx[0])), s1, _mm256_sign_epi8(y2, qx[1]));
+ // 2,2, 3,3, 2,2, 3,3 as int32_t
+ auto sumi2 = _mm256_dpbusd_epi32(_mm256_dpbusd_epi32(_mm256_setzero_si256(),
+ s2, _mm256_sign_epi8(y1, qx[2])), s3, _mm256_sign_epi8(y2, qx[3]));
+ auto sumi = _mm256_packs_epi32(sumi1, sumi2);
+#else
+ // 4 x row 0, 4 x row 1, 4 x row 0, 4 x row 1
+ auto sumi1 = _mm256_add_epi16(_mm256_maddubs_epi16(s0, _mm256_sign_epi8(y1, qx[0])),
+ _mm256_maddubs_epi16(s1, _mm256_sign_epi8(y2, qx[1])));
+ // 4 x row 2, 4 x row 3, 4 x row 2, 4 x row 3
+ auto sumi2 = _mm256_add_epi16(_mm256_maddubs_epi16(s2, _mm256_sign_epi8(y1, qx[2])),
+ _mm256_maddubs_epi16(s3, _mm256_sign_epi8(y2, qx[3])));
+ // 0,0, 1,1, 0,0, 1,1 as int32_t
+ sumi1 = _mm256_madd_epi16(m1, sumi1);
+ // 2,2, 3,3, 2,2, 3,3 as int32_t
+ sumi2 = _mm256_madd_epi16(m1, sumi2);
+ // 0,0, 1,1, 2,2, 3,3, 0,0, 1,1, 2,2, 3,3 as int16_t
+ auto sumi = _mm256_packs_epi32(sumi1, sumi2);
+#endif
+ sumi = _mm256_madd_epi16(scales, sumi);
+ acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d8[4*iy+k]), _mm256_cvtepi32_ps(sumi), acc[iy]);
+ }
+ }
+ }
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto sumf = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1));
+ info.store(ix, iy, _mm_mul_ps(d1, sumf));
+ acc[iy] = _mm256_setzero_ps();
+ }
+ }
+}
+
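The int8 packing in the kernel above, as a hedged scalar model (helper name invented): grid values g in {0,1,2} are shifted left by 3, and a per-group sign byte of -9 (shift bit set) or -7 (clear) is added, so each quant encodes 8*(g - 1) - 1 or 8*(g - 1) + 1 and stays in int8 range; the 0.125f folded into d1 undoes the *8.

#include <stdint.h>
static inline int8_t iq1m_pack_quant(uint8_t g /* 0..2 */, int shift_bit) {
    const int sign = shift_bit ? -9 : -7;  // matches signs128 in the kernel
    return (int8_t)((g << 3) + sign);      // {-9,-1,7} if set, {-7,1,9} if clear
}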
#ifdef HAVE_FANCY_SIMD
template <int nrc_y>
static void mul_mat_q4_0_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
@@ -9081,6 +9179,21 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
#endif
expected_typeB = GGML_TYPE_Q8_1_X4;
break;
+ case GGML_TYPE_IQ1_M_R4:
+ assert (ne00 % QK4_NL == 0);
+ mm.funcs[0] = mul_mat_iq1_m_r4_q8_0<1>;
+ mm.funcs[1] = mul_mat_iq1_m_r4_q8_0<2>;
+ mm.funcs[2] = mul_mat_iq1_m_r4_q8_0<3>;
+ mm.funcs[3] = mul_mat_iq1_m_r4_q8_0<4>;
+ mm.funcs[4] = mul_mat_iq1_m_r4_q8_0<5>;
+ mm.funcs[5] = mul_mat_iq1_m_r4_q8_0<6>;
+ mm.funcs[6] = mul_mat_iq1_m_r4_q8_0<7>;
+ mm.funcs[7] = mul_mat_iq1_m_r4_q8_0<8>;
+#ifdef HAVE_FANCY_SIMD
+ mm.func16 = mul_mat_iq1_m_r4_q8_0<16>;
+#endif
+ expected_typeB = GGML_TYPE_Q8_0_X4;
+ break;
default:
return false;
@@ -12093,6 +12206,85 @@ static void mul_mat_iq1_s_r4_q8_1(int n, const void * vx, size_t bx, const DataI
}
template <int nrc_y>
+static void mul_mat_iq1_m_r4_q8_0(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
+ GGML_ASSERT(nrc_x%4 == 0);
+ Q8<nrc_y, block_q8_0_x4> q8(info);
+ int nb = n / 32;
+ GGML_ASSERT(nb%4 == 0);
+ int8x16_t qx[8];
+ int32x4_t acc[nrc_y] = {};
+ auto shuffle0 = uint32x4_t{0x00000000, 0x01010101, 0x02020202, 0x03030303};
+ auto step = vdupq_n_u8(4);
+ auto ms = vdupq_n_u8(0x08);
+ auto mask = vdupq_n_s8(0x18);
+ float d8[4*nrc_y];
+ for (int ix= 0; ix < nrc_x; ix += 4) {
+ auto dptr = (const ggml_half *)((const char *)vx + ix*bx);
+ auto d1 = vmulq_f32(vdupq_n_f32(0.125f), vcvt_f32_f16(vld1_f16((const float16_t *)dptr)));
+ auto x = (const block_iq1_m_r4 *)(dptr + 4);
+ for (int ib = 0; ib < nb/4; ++ib) {
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto scales = vld1_f16((const float16_t *)q8.y[iy][ib].d);
+ vst1q_f32(d8+4*iy, vcvt_f32_f16(scales));
+ }
+ for (int k = 0; k < 4; ++k) {
+ auto scales4 = vdup_n_u32(((const uint32_t *)x[4*ib+k].scales)[0]);
+ scales4 = vand_u8(vshl_u32(scales4, int32x2_t{0, -4}), vdup_n_u8(0xf));
+ auto scales16 = vmovl_u8(scales4);
+ auto scales1 = vmovl_u16(vget_low_u16(scales16));
+ auto scales2 = vmovl_u16(vget_high_u16(scales16));
+ auto qh = (const uint32_t *)x[4*ib+k].qh;
+ auto idxh = uint32x4_t{qh[0], qh[0] >> 4, qh[1], qh[1] >> 4};
+ auto signs = vreinterpretq_s8_u8(vorrq_u8(vceqq_u8(vandq_u8(idxh, ms), ms), vdupq_n_u8(1)));
+ signs = vaddq_s8(signs, vdupq_n_s8(-8));
+ qx[0] = vreinterpretq_s8_u32(uint32x4_t{iq1s_grid_us[x[4*ib+k].qs[ 0] | ((x[4*ib+k].qh[0] << 8) & 0x0700)],
+ iq1s_grid_us[x[4*ib+k].qs[ 1] | ((x[4*ib+k].qh[1] << 8) & 0x0700)],
+ iq1s_grid_us[x[4*ib+k].qs[ 2] | ((x[4*ib+k].qh[2] << 8) & 0x0700)],
+ iq1s_grid_us[x[4*ib+k].qs[ 3] | ((x[4*ib+k].qh[3] << 8) & 0x0700)]});
+ qx[2] = vreinterpretq_s8_u32(uint32x4_t{iq1s_grid_us[x[4*ib+k].qs[ 4] | ((x[4*ib+k].qh[0] << 4) & 0x0700)],
+ iq1s_grid_us[x[4*ib+k].qs[ 5] | ((x[4*ib+k].qh[1] << 4) & 0x0700)],
+ iq1s_grid_us[x[4*ib+k].qs[ 6] | ((x[4*ib+k].qh[2] << 4) & 0x0700)],
+ iq1s_grid_us[x[4*ib+k].qs[ 7] | ((x[4*ib+k].qh[3] << 4) & 0x0700)]});
+ qx[4] = vreinterpretq_s8_u32(uint32x4_t{iq1s_grid_us[x[4*ib+k].qs[ 8] | ((x[4*ib+k].qh[4] << 8) & 0x0700)],
+ iq1s_grid_us[x[4*ib+k].qs[ 9] | ((x[4*ib+k].qh[5] << 8) & 0x0700)],
+ iq1s_grid_us[x[4*ib+k].qs[10] | ((x[4*ib+k].qh[6] << 8) & 0x0700)],
+ iq1s_grid_us[x[4*ib+k].qs[11] | ((x[4*ib+k].qh[7] << 8) & 0x0700)]});
+ qx[6] = vreinterpretq_s8_u32(uint32x4_t{iq1s_grid_us[x[4*ib+k].qs[12] | ((x[4*ib+k].qh[4] << 4) & 0x0700)],
+ iq1s_grid_us[x[4*ib+k].qs[13] | ((x[4*ib+k].qh[5] << 4) & 0x0700)],
+ iq1s_grid_us[x[4*ib+k].qs[14] | ((x[4*ib+k].qh[6] << 4) & 0x0700)],
+ iq1s_grid_us[x[4*ib+k].qs[15] | ((x[4*ib+k].qh[7] << 4) & 0x0700)]});
+ auto shuffle = shuffle0;
+ for (int j = 0; j < 4; ++j) {
+ auto s = vqtbl1q_s8(signs, shuffle);
+ qx[2*j+1] = vaddq_s8(s, vandq_s8(vshrq_n_s8(qx[2*j+0], 1), mask));
+ qx[2*j+0] = vaddq_s8(s, vandq_s8(vshlq_n_s8(qx[2*j+0], 3), mask));
+ shuffle = vaddq_u8(shuffle, step);
+ }
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto y = vld1q_s8_x2(q8.y[iy][ib].qs + 32*k);
+ auto sumi1 = vdupq_n_s32(0);
+ auto sumi2 = vdupq_n_s32(0);
+ sumi1 = vdotq_laneq_s32(sumi1, vreinterpretq_s8_u8(qx[0]), y.val[0], 0);
+ sumi1 = vdotq_laneq_s32(sumi1, vreinterpretq_s8_u8(qx[1]), y.val[0], 1);
+ sumi1 = vdotq_laneq_s32(sumi1, vreinterpretq_s8_u8(qx[2]), y.val[0], 2);
+ sumi1 = vdotq_laneq_s32(sumi1, vreinterpretq_s8_u8(qx[3]), y.val[0], 3);
+ sumi2 = vdotq_laneq_s32(sumi2, vreinterpretq_s8_u8(qx[4]), y.val[1], 0);
+ sumi2 = vdotq_laneq_s32(sumi2, vreinterpretq_s8_u8(qx[5]), y.val[1], 1);
+ sumi2 = vdotq_laneq_s32(sumi2, vreinterpretq_s8_u8(qx[6]), y.val[1], 2);
+ sumi2 = vdotq_laneq_s32(sumi2, vreinterpretq_s8_u8(qx[7]), y.val[1], 3);
+ auto sumi = vmlaq_s32(vmlaq_s32(vdupq_n_s32(0), sumi1, scales1), sumi2, scales2);
+ acc[iy] = vfmaq_f32(acc[iy], vdupq_n_f32(d8[4*iy+k]), vcvtq_f32_s32(sumi));
+ }
+ }
+ }
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ info.store(ix, iy, vmulq_f32(d1, acc[iy]));
+ acc[iy] = vdupq_n_f32(0.f);
+ }
+ }
+}
+
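The NEON accumulation relies on the lane-indexed dot product: each int32 lane of the accumulator belongs to one of the 4 interleaved rows, and the selected 4-byte lane of y feeds the same 4 activations to every row. A scalar model of vdotq_laneq_s32 as used here (illustration only):

#include <stdint.h>
static inline void dot_lane_ref(int32_t acc[4], const int8_t qx[16],
                                const int8_t y[16], int lane) {
    for (int row = 0; row < 4; ++row)
        for (int j = 0; j < 4; ++j)
            acc[row] += qx[4*row + j] * y[4*lane + j];
}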
+template <int nrc_y>
static void mul_mat_iq2_s_r4_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
GGML_ASSERT(nrc_x%4 == 0);
Q8<nrc_y, block_q8_K> q8(info);
@@ -13717,6 +13909,11 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& m, int /*Ny*/) {
m.func16 = mul_mat_iq1_s_r4_q8_1<16>;
expected_Btype = GGML_TYPE_Q8_1_X4;
break;
+ case GGML_TYPE_IQ1_M_R4:
+ SET_MUL_MAT_FUNCTIONS(m, mul_mat_iq1_m_r4_q8_0);
+ m.func16 = mul_mat_iq1_m_r4_q8_0<16>;
+ expected_Btype = GGML_TYPE_Q8_0_X4;
+ break;
case GGML_TYPE_IQ3_XXS_R4:
SET_MUL_MAT_FUNCTIONS(m, mul_mat_iq3_xxs_r4_q8_k);
m.func16 = mul_mat_iq3_xxs_r4_q8_k<16>;
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index a8553b43..e741a8ea 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -6193,6 +6193,123 @@ void vec_dot_iq1_s_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t
GGML_UNUSED(by);
}
+void quantize_row_iq1_m_r4_ref(const float * x, block_iq1_m_r4 * y, int64_t k) {
+ quantize_iq1_m_r4(x, y, 4, k/4, nullptr);
+}
+
+void quantize_row_iq1_m_r4(const float * x, void * y, int64_t k) {
+ quantize_iq1_m_r4(x, y, 4, k/4, nullptr);
+}
+
+size_t quantize_iq1_m_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+ constexpr int kBlockSize = 32;
+ GGML_ASSERT(nrows%4 == 0);
+ GGML_ASSERT(n_per_row%kBlockSize == 0);
+ int nblock = n_per_row/kBlockSize;
+ float weight[kBlockSize];
+ int8_t L[kBlockSize];
+ float pairs[2*kBlockSize];
+ float max[4];
+ uint16_t index[4];
+ int shift1, shift2;
+ float invd[4];
+ const uint8_t masks[4] = {0x00, 0x80, 0x08, 0x88};
+ std::vector<float> scales(8*nblock);
+ auto row_size = ggml_row_size(GGML_TYPE_IQ1_M_R4, n_per_row);
+ char * cy = (char *)dst;
+ for (int row = 0; row < nrows; row += 4) {
+ ggml_half * dptr = (ggml_half *)cy;
+ auto y = (block_iq1_m_r4 *)(dptr + 4);
+ for (int k = 0; k < 4; ++k) max[k] = 0;
+ for (int ibl = 0; ibl < nblock; ++ibl) {
+ for (int k = 0; k < 4; ++k) {
+ auto xb = src + k*n_per_row + kBlockSize*ibl;
+ float sumx2 = 0;
+ for (int j = 0; j < kBlockSize; ++j) sumx2 += xb[j]*xb[j];
+ if (!sumx2) {
+ scales[8*ibl+2*k+0] = scales[8*ibl+2*k+1] = 0;
+ continue;
+ }
+ float sigma2 = 1.5f*sumx2/kBlockSize;
+ if (imatrix) {
+ for (int j = 0; j < kBlockSize; ++j) weight[j] = imatrix[kBlockSize*ibl + j]*sqrt(sigma2 + xb[j]*xb[j]);
+ } else {
+ for (int j = 0; j < kBlockSize; ++j) weight[j] = sqrt(sigma2 + xb[j]*xb[j]);
+ }
+ iq1m_process_1block(xb+ 0, weight+ 0, L, scales.data() + 8*ibl + 2*k+0, index+0, &shift1, pairs);
+ iq1m_process_1block(xb+16, weight+16, L, scales.data() + 8*ibl + 2*k+1, index+2, &shift2, pairs);
+ max[k] = std::max(max[k], std::max(scales[8*ibl+2*k+0], scales[8*ibl+2*k+1]));
+ for (int i = 0; i < 4; ++i) {
+ y[ibl].qs[4*i + k] = index[i] & 255;
+ }
+ for (int i = 0; i < 2; ++i) {
+ y[ibl].qh[4*i+k] = (index[2*i+0] >> 8) | ((index[2*i+1] >> 8) << 4);
+ }
+ y[ibl].qh[0+k] |= masks[shift1];
+ y[ibl].qh[4+k] |= masks[shift2];
+ }
+ }
+ for (int k = 0; k < 4; ++k) {
+ dptr[k] = GGML_FP32_TO_FP16(1.0625f*max[k]/15);
+ invd[k] = max[k] ? 15/max[k] : 0.f;
+ }
+ for (int ibl = 0; ibl < nblock; ++ibl) {
+ for (int k = 0; k < 4; ++k) {
+ int ls1 = nearest_int(scales[8*ibl+2*k+0]*invd[k]);
+ int ls2 = nearest_int(scales[8*ibl+2*k+1]*invd[k]);
+ ls1 = std::max(0, std::min(15, ls1));
+ ls2 = std::max(0, std::min(15, ls2));
+ y[ibl].scales[k] = ls1 | (ls2 << 4);
+ }
+ }
+ cy += 4*row_size;
+ src += 4*n_per_row;
+ }
+ return nrows*row_size;
+}
+
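The scale coding in quantize_iq1_m_r4 above, written out: with s_b the float block scales from iq1m_process_1block and m = max_b s_b the per-row maximum,

    \ell_b = \mathrm{clamp}\Bigl(\mathrm{round}\bigl(15\,s_b/m\bigr),\,0,\,15\Bigr),
    \qquad
    d_{\mathrm{row}} = 1.0625\,\frac{m}{15},
    \qquad
    \hat s_b = d_{\mathrm{row}}\,\ell_b,

where the 1.0625 inflation of the fp16 super-scale matches the code above and presumably offsets the downward bias from rounding and clamping.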
+void dequantize_row_iq1_m_r4(const block_iq1_m_r4 * x, float * y, int64_t n) {
+ auto dptr = (const ggml_half *)x;
+ x = (const block_iq1_m_r4 *)(dptr + 4);
+ float d[4];
+ for (int k = 0; k < 4; ++k) d[k] = GGML_FP16_TO_FP32(dptr[k]);
+ int n_per_row = n/4;
+ GGML_ASSERT(n_per_row%32 == 0);
+ int nblock = n_per_row/32;
+ float dl[2];
+ float * yk[4];
+ for (int k = 0; k < 4; ++k) yk[k] = y + k*n_per_row;
+ for (int ib = 0; ib < nblock; ++ib) {
+ for (int k = 0; k < 4; ++k) {
+ dl[0] = d[k]*(x[ib].scales[k] & 0xf);
+ dl[1] = d[k]*(x[ib].scales[k] >> 4);
+ for (int i = 0; i < 2; ++i) {
+ auto idx1 = x[ib].qs[8*i+k+0] | ((x[ib].qh[4*i+k] & 0x07) << 8);
+ auto idx2 = x[ib].qs[8*i+k+4] | ((x[ib].qh[4*i+k] & 0x70) << 4);
+ auto grid1 = (const int8_t *)(iq1s_grid + idx1);
+ auto grid2 = (const int8_t *)(iq1s_grid + idx2);
+ auto delta1 = x[ib].qh[4*i+k] & 0x08 ? -IQ1M_DELTA : IQ1M_DELTA;
+ auto delta2 = x[ib].qh[4*i+k] & 0x80 ? -IQ1M_DELTA : IQ1M_DELTA;
+ for (int j = 0; j < 8; ++j) yk[k][32*ib + 16*i + j + 0] = dl[i]*(grid1[j] + delta1);
+ for (int j = 0; j < 8; ++j) yk[k][32*ib + 16*i + j + 8] = dl[i]*(grid2[j] + delta2);
+ }
+ }
+ }
+}
+
+void vec_dot_iq1_m_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
+#if GGML_USE_IQK_MULMAT
+ if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ1_M_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) {
+ return;
+ }
+#endif
+ GGML_ASSERT(n%QK4_NL == 0);
+ GGML_ASSERT(nrc == 1);
+ GGML_UNUSED(bs);
+ GGML_UNUSED(bx);
+ GGML_UNUSED(by);
+}
+
//================================================
namespace {
diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h
index 9a3c5dc6..0dbb88bd 100644
--- a/ggml/src/iqk/iqk_quantize.h
+++ b/ggml/src/iqk/iqk_quantize.h
@@ -205,6 +205,12 @@ size_t quantize_iq1_s_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT d
void dequantize_row_iq1_s_r4(const block_iq1_s_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void vec_dot_iq1_s_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void quantize_row_iq1_m_r4_ref(const float * GGML_RESTRICT x, block_iq1_m_r4 * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq1_m_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+size_t quantize_iq1_m_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+void dequantize_row_iq1_m_r4(const block_iq1_m_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void vec_dot_iq1_m_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
void quantize_row_q8_k_r8_ref(const float * GGML_RESTRICT x, block_q8_k_r8 * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_k_r8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
size_t quantize_q8_k_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);