summaryrefslogtreecommitdiff
path: root/ggml/src/ggml-quants.c
diff options
context:
space:
mode:
Diffstat (limited to 'ggml/src/ggml-quants.c')
-rw-r--r--ggml/src/ggml-quants.c403
1 files changed, 179 insertions, 224 deletions
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 3c4711f3..d32a583f 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -14145,85 +14145,6 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
int best_shift;
iq1s_process_1block(block_size, xb, weight, L, &scales[ib], index, &best_shift, pairs, sumx, sumw);
-// float max = fabsf(xb[0]);
-// for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
-// if (max < GROUP_MAX_EPS_IQ1_S) {
-// scales[ib] = 0;
-// memset(L, 1, block_size);
-// continue;
-// }
-// // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
-// // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
-// // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
-// // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
-// // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale
-// // for each possible and score for each split.
-// for (int j = 0; j < block_size; ++j) {
-// pairs[2*j] = xb[j];
-// idx[2*j] = j;
-// }
-// qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
-// {
-// sumx[0] = sumw[0] = 0;
-// for (int j = 0; j < block_size; ++j) {
-// int i = idx[2*j];
-// sumx[j+1] = sumx[j] + weight[i]*xb[i];
-// sumw[j+1] = sumw[j] + weight[i];
-// }
-// }
-// float best_score = -FLT_MIN, scale = max;
-// int besti1 = -1, besti2 = -1, best_shift = 0;
-// for (int i1 = 0; i1 <= block_size; ++i1) {
-// for (int i2 = i1; i2 <= block_size; ++i2) {
-// float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2];
-// float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2];
-// if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
-// scale = sumqx/sumq2; best_score = scale*sumqx;
-// besti1 = i1; besti2 = i2; best_shift = 1;
-// }
-// sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2];
-// sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2];
-// if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
-// scale = sumqx/sumq2; best_score = scale*sumqx;
-// besti1 = i1; besti2 = i2; best_shift = -1;
-// }
-// }
-// }
-// GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0);
-// for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
-// for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
-// for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
-// if (scale < 0) {
-// for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
-// scale = -scale; best_shift = -best_shift;
-// }
-// bool all_on_grid = true;
-// const float * xx = best_shift == 1 ? x_p : x_m;
-// for (int k = 0; k < block_size/8; ++k) {
-// uint16_t u = 0;
-// for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
-// int grid_index = kmap_q2xs[u];
-// if (grid_index < 0) {
-// all_on_grid = false;
-// const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
-// grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
-// GGML_ASSERT(grid_index >= 0);
-// }
-// index[k] = grid_index;
-// }
-// if (!all_on_grid) {
-// float sumqx = 0, sumq2 = 0;
-// for (int k = 0; k < block_size/8; ++k) {
-// const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
-// for (int j = 0; j < 8; ++j) {
-// float w = weight[8*k + j];
-// float q = xx[(pg[j] - 1)/2];
-// sumqx += w*q*xb[8*k+j];
-// sumq2 += w*q*q;
-// }
-// }
-// if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2;
-// }
uint16_t h = 0;
for (int k = 0; k < block_size/8; ++k) {
y[ibl].qs[(block_size/8)*ib + k] = index[k] & 255;
@@ -14232,10 +14153,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
y[ibl].qh[ib] = h;
GGML_ASSERT(scales[ib] >= 0);
max_scale = MAX(max_scale, scales[ib]);
- //GGML_ASSERT(scale >= 0);
- //scales[ib] = scale;
shifts[ib] = best_shift;
- //max_scale = MAX(max_scale, scale);
}
if (!max_scale) {
@@ -14287,6 +14205,166 @@ void quantize_row_iq1_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
quantize_row_iq1_s_ref(x, (block_iq1_s *)y, k);
}
+void iq1m_process_1block(const float * xb, const float * weight, int8_t * L, float * the_scale, uint16_t * the_index, int * the_shift,
+ float * pairs) {
+
+ const int block_size = IQ1M_BLOCK_SIZE;
+
+ const float x_p[3] = {-1 + IQ1M_DELTA, IQ1M_DELTA, 1 + IQ1M_DELTA};
+ const float x_m[3] = {-1 - IQ1M_DELTA, -IQ1M_DELTA, 1 - IQ1M_DELTA};
+
+ float sumqx[4], sumq2[4];
+
+ const int gindex = iq2_data_index(GGML_TYPE_IQ1_M);
+
+ const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
+ const int * kmap_q2xs = iq2_data[gindex].map;
+ const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
+
+ GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
+ GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
+ GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
+
+ // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
+ // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
+ // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
+ // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
+ // Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum scale
+ // and score for each possible split.
+ int * idx = (int *)(pairs + 1);
+ for (int j = 0; j < block_size; ++j) {
+ pairs[2*j] = xb[j];
+ idx[2*j] = j;
+ }
+ qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
+ float best_score = -FLT_MIN, scale = 0.f;
+ int besti1 = -1, besti2 = -1, best_k = -1;
+ // 0: +, + (sign of the IQ1M_DELTA shift used for the 1st half, 2nd half of the block)
+ // 1: +, -
+ // 2: -, +
+ // 3: -, -
+ for (int i1 = 0; i1 <= block_size; ++i1) {
+ for (int i2 = i1; i2 <= block_size; ++i2) {
+ memset(sumqx, 0, 4*sizeof(float));
+ memset(sumq2, 0, 4*sizeof(float));
+ for (int j = 0; j < i1; ++j) {
+ int i = idx[2*j];
+ if (i < block_size/2) {
+ sumqx[0] += weight[i]*x_p[0]*xb[i];
+ sumqx[1] += weight[i]*x_p[0]*xb[i];
+ sumqx[2] += weight[i]*x_m[0]*xb[i];
+ sumqx[3] += weight[i]*x_m[0]*xb[i];
+ sumq2[0] += weight[i]*x_p[0]*x_p[0];
+ sumq2[1] += weight[i]*x_p[0]*x_p[0];
+ sumq2[2] += weight[i]*x_m[0]*x_m[0];
+ sumq2[3] += weight[i]*x_m[0]*x_m[0];
+ } else {
+ sumqx[0] += weight[i]*x_p[0]*xb[i];
+ sumqx[2] += weight[i]*x_p[0]*xb[i];
+ sumqx[1] += weight[i]*x_m[0]*xb[i];
+ sumqx[3] += weight[i]*x_m[0]*xb[i];
+ sumq2[0] += weight[i]*x_p[0]*x_p[0];
+ sumq2[2] += weight[i]*x_p[0]*x_p[0];
+ sumq2[1] += weight[i]*x_m[0]*x_m[0];
+ sumq2[3] += weight[i]*x_m[0]*x_m[0];
+ }
+ }
+ for (int j = i1; j < i2; ++j) {
+ int i = idx[2*j];
+ if (i < block_size/2) {
+ sumqx[0] += weight[i]*x_p[1]*xb[i];
+ sumqx[1] += weight[i]*x_p[1]*xb[i];
+ sumqx[2] += weight[i]*x_m[1]*xb[i];
+ sumqx[3] += weight[i]*x_m[1]*xb[i];
+ sumq2[0] += weight[i]*x_p[1]*x_p[1];
+ sumq2[1] += weight[i]*x_p[1]*x_p[1];
+ sumq2[2] += weight[i]*x_m[1]*x_m[1];
+ sumq2[3] += weight[i]*x_m[1]*x_m[1];
+ } else {
+ sumqx[0] += weight[i]*x_p[1]*xb[i];
+ sumqx[2] += weight[i]*x_p[1]*xb[i];
+ sumqx[1] += weight[i]*x_m[1]*xb[i];
+ sumqx[3] += weight[i]*x_m[1]*xb[i];
+ sumq2[0] += weight[i]*x_p[1]*x_p[1];
+ sumq2[2] += weight[i]*x_p[1]*x_p[1];
+ sumq2[1] += weight[i]*x_m[1]*x_m[1];
+ sumq2[3] += weight[i]*x_m[1]*x_m[1];
+ }
+ }
+ for (int j = i2; j < block_size; ++j) {
+ int i = idx[2*j];
+ if (i < block_size/2) {
+ sumqx[0] += weight[i]*x_p[2]*xb[i];
+ sumqx[1] += weight[i]*x_p[2]*xb[i];
+ sumqx[2] += weight[i]*x_m[2]*xb[i];
+ sumqx[3] += weight[i]*x_m[2]*xb[i];
+ sumq2[0] += weight[i]*x_p[2]*x_p[2];
+ sumq2[1] += weight[i]*x_p[2]*x_p[2];
+ sumq2[2] += weight[i]*x_m[2]*x_m[2];
+ sumq2[3] += weight[i]*x_m[2]*x_m[2];
+ } else {
+ sumqx[0] += weight[i]*x_p[2]*xb[i];
+ sumqx[2] += weight[i]*x_p[2]*xb[i];
+ sumqx[1] += weight[i]*x_m[2]*xb[i];
+ sumqx[3] += weight[i]*x_m[2]*xb[i];
+ sumq2[0] += weight[i]*x_p[2]*x_p[2];
+ sumq2[2] += weight[i]*x_p[2]*x_p[2];
+ sumq2[1] += weight[i]*x_m[2]*x_m[2];
+ sumq2[3] += weight[i]*x_m[2]*x_m[2];
+ }
+ }
+ for (int k = 0; k < 4; ++k) {
+ if (sumq2[k] > 0 && sumqx[k]*sumqx[k] > best_score*sumq2[k]) {
+ scale = sumqx[k]/sumq2[k]; best_score = scale*sumqx[k];
+ besti1 = i1; besti2 = i2; best_k = k;
+ }
+ }
+ }
+ }
+ GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_k >= 0);
+ for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
+ for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
+ for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
+ if (scale < 0) {
+ for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
+ scale = -scale;
+ best_k = 3 - best_k;
+ }
+ bool all_on_grid = true;
+ const float * xx;
+ for (int k = 0; k < block_size/8; ++k) {
+ if (k == 0) xx = best_k < 2 ? x_p : x_m;
+ else xx = best_k%2 == 0 ? x_p : x_m;
+ uint16_t u = 0;
+ for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
+ int grid_index = kmap_q2xs[u];
+ if (grid_index < 0) {
+ all_on_grid = false;
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+ grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
+ GGML_ASSERT(grid_index >= 0);
+ }
+ the_index[k] = grid_index;
+ }
+ if (!all_on_grid) {
+ float sumqx_f = 0, sumq2_f = 0;
+ for (int k = 0; k < block_size/8; ++k) {
+ if (k == 0) xx = best_k < 2 ? x_p : x_m;
+ else xx = best_k%2 == 0 ? x_p : x_m;
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + the_index[k]);
+ for (int j = 0; j < 8; ++j) {
+ float w = weight[8*k + j];
+ float q = xx[(pg[j] - 1)/2];
+ sumqx_f += w*q*xb[8*k+j];
+ sumq2_f += w*q*q;
+ }
+ }
+ if (sumqx_f > 0 && sumq2_f > 0) scale = sumqx_f/sumq2_f;
+ }
+ *the_scale = scale;
+ *the_shift = best_k;
+}
+
static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
float * scales,
float * weight,
@@ -14301,7 +14379,6 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
const int * kmap_q2xs = iq2_data[gindex].map;
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
- //GGML_ASSERT(quant_weights && "missing quantization weights");
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
@@ -14317,10 +14394,6 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
const float x_m[3] = {-1 - IQ1M_DELTA, -IQ1M_DELTA, 1 - IQ1M_DELTA};
const uint8_t masks[4] = {0x00, 0x80, 0x08, 0x88};
- int * idx = (int *)(pairs + 1);
-
- float sumqx[4], sumq2[4];
-
iq1m_scale_t s;
const float * xx;
@@ -14351,147 +14424,15 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
memset(L, 1, block_size);
continue;
}
- // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
- // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
- // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
- // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
- // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale
- // for each possible and score for each split.
- for (int j = 0; j < block_size; ++j) {
- pairs[2*j] = xb[j];
- idx[2*j] = j;
- }
- qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
- float best_score = -FLT_MIN, scale = max;
- int besti1 = -1, besti2 = -1, best_k = -1;
- // 0: +, +
- // 1: +, -
- // 2: -, +
- // 3: -, -
- for (int i1 = 0; i1 <= block_size; ++i1) {
- for (int i2 = i1; i2 <= block_size; ++i2) {
- memset(sumqx, 0, 4*sizeof(float));
- memset(sumq2, 0, 4*sizeof(float));
- for (int j = 0; j < i1; ++j) {
- int i = idx[2*j];
- if (i < block_size/2) {
- sumqx[0] += weight[i]*x_p[0]*xb[i];
- sumqx[1] += weight[i]*x_p[0]*xb[i];
- sumqx[2] += weight[i]*x_m[0]*xb[i];
- sumqx[3] += weight[i]*x_m[0]*xb[i];
- sumq2[0] += weight[i]*x_p[0]*x_p[0];
- sumq2[1] += weight[i]*x_p[0]*x_p[0];
- sumq2[2] += weight[i]*x_m[0]*x_m[0];
- sumq2[3] += weight[i]*x_m[0]*x_m[0];
- } else {
- sumqx[0] += weight[i]*x_p[0]*xb[i];
- sumqx[2] += weight[i]*x_p[0]*xb[i];
- sumqx[1] += weight[i]*x_m[0]*xb[i];
- sumqx[3] += weight[i]*x_m[0]*xb[i];
- sumq2[0] += weight[i]*x_p[0]*x_p[0];
- sumq2[2] += weight[i]*x_p[0]*x_p[0];
- sumq2[1] += weight[i]*x_m[0]*x_m[0];
- sumq2[3] += weight[i]*x_m[0]*x_m[0];
- }
- }
- for (int j = i1; j < i2; ++j) {
- int i = idx[2*j];
- if (i < block_size/2) {
- sumqx[0] += weight[i]*x_p[1]*xb[i];
- sumqx[1] += weight[i]*x_p[1]*xb[i];
- sumqx[2] += weight[i]*x_m[1]*xb[i];
- sumqx[3] += weight[i]*x_m[1]*xb[i];
- sumq2[0] += weight[i]*x_p[1]*x_p[1];
- sumq2[1] += weight[i]*x_p[1]*x_p[1];
- sumq2[2] += weight[i]*x_m[1]*x_m[1];
- sumq2[3] += weight[i]*x_m[1]*x_m[1];
- } else {
- sumqx[0] += weight[i]*x_p[1]*xb[i];
- sumqx[2] += weight[i]*x_p[1]*xb[i];
- sumqx[1] += weight[i]*x_m[1]*xb[i];
- sumqx[3] += weight[i]*x_m[1]*xb[i];
- sumq2[0] += weight[i]*x_p[1]*x_p[1];
- sumq2[2] += weight[i]*x_p[1]*x_p[1];
- sumq2[1] += weight[i]*x_m[1]*x_m[1];
- sumq2[3] += weight[i]*x_m[1]*x_m[1];
- }
- }
- for (int j = i2; j < block_size; ++j) {
- int i = idx[2*j];
- if (i < block_size/2) {
- sumqx[0] += weight[i]*x_p[2]*xb[i];
- sumqx[1] += weight[i]*x_p[2]*xb[i];
- sumqx[2] += weight[i]*x_m[2]*xb[i];
- sumqx[3] += weight[i]*x_m[2]*xb[i];
- sumq2[0] += weight[i]*x_p[2]*x_p[2];
- sumq2[1] += weight[i]*x_p[2]*x_p[2];
- sumq2[2] += weight[i]*x_m[2]*x_m[2];
- sumq2[3] += weight[i]*x_m[2]*x_m[2];
- } else {
- sumqx[0] += weight[i]*x_p[2]*xb[i];
- sumqx[2] += weight[i]*x_p[2]*xb[i];
- sumqx[1] += weight[i]*x_m[2]*xb[i];
- sumqx[3] += weight[i]*x_m[2]*xb[i];
- sumq2[0] += weight[i]*x_p[2]*x_p[2];
- sumq2[2] += weight[i]*x_p[2]*x_p[2];
- sumq2[1] += weight[i]*x_m[2]*x_m[2];
- sumq2[3] += weight[i]*x_m[2]*x_m[2];
- }
- }
- for (int k = 0; k < 4; ++k) {
- if (sumq2[k] > 0 && sumqx[k]*sumqx[k] > best_score*sumq2[k]) {
- scale = sumqx[k]/sumq2[k]; best_score = scale*sumqx[k];
- besti1 = i1; besti2 = i2; best_k = k;
- }
- }
- }
- }
- GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_k >= 0);
- for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
- for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
- for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
- if (scale < 0) {
- for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
- scale = -scale;
- best_k = best_k == 0 ? 3 : best_k == 1 ? 2 : best_k == 2 ? 1 : 0;
- }
- bool all_on_grid = true;
- for (int k = 0; k < block_size/8; ++k) {
- if (k == 0) xx = best_k < 2 ? x_p : x_m;
- else xx = best_k%2 == 0 ? x_p : x_m;
- uint16_t u = 0;
- for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
- int grid_index = kmap_q2xs[u];
- if (grid_index < 0) {
- all_on_grid = false;
- const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
- grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
- GGML_ASSERT(grid_index >= 0);
- }
- index[k] = grid_index;
- }
- if (!all_on_grid) {
- float sumqx_f = 0, sumq2_f = 0;
- for (int k = 0; k < block_size/8; ++k) {
- if (k == 0) xx = best_k < 2 ? x_p : x_m;
- else xx = best_k%2 == 0 ? x_p : x_m;
- const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
- for (int j = 0; j < 8; ++j) {
- float w = weight[8*k + j];
- float q = xx[(pg[j] - 1)/2];
- sumqx_f += w*q*xb[8*k+j];
- sumq2_f += w*q*q;
- }
- }
- if (sumqx_f > 0 && sumq2_f > 0) scale = sumqx_f/sumq2_f;
- }
+
+ int best_k = -1;
+ iq1m_process_1block(xb, weight, L, &scales[ib], index, &best_k, pairs);
+
y[ibl].qs[2*ib + 0] = index[0] & 255;
y[ibl].qs[2*ib + 1] = index[1] & 255;
y[ibl].qh[ib] = (index[0] >> 8) | ((index[1] >> 8) << 4);
- GGML_ASSERT(scale >= 0);
- scales[ib] = scale;
shifts[ib] = best_k;
- max_scale = MAX(max_scale, scale);
+ max_scale = MAX(max_scale, scales[ib]);
}
if (!max_scale) {
@@ -14553,6 +14494,19 @@ size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t n
return nrow * nblock * sizeof(block_iq1_m);
}
+void quantize_row_iq1_m_ref (const float * GGML_RESTRICT x, block_iq1_m * GGML_RESTRICT y, int64_t k) {
+ int nblock = k/QK_K;
+ float qw[QK_K];
+ for (int j = 0; j < QK_K; ++j) qw[j] = 1;
+ for (int ibl = 0; ibl < nblock; ++ibl) {
+ quantize_iq1_m(x + ibl*QK_K, &y[ibl], 1, QK_K, qw);
+ }
+}
+
+void quantize_row_iq1_m (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
+ quantize_row_iq1_m_ref(x, (block_iq1_m *)y, k);
+}
+
// ============================ 4-bit non-linear quants
static const int8_t iq4nl_index[241] = {
@@ -15246,6 +15200,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
case GGML_TYPE_IQ3_S_R4: break;
case GGML_TYPE_IQ2_S_R4: break;
case GGML_TYPE_IQ1_S_R4: break;
+ case GGML_TYPE_IQ1_M_R4: break;
case GGML_TYPE_Q4_0_R4: break;
case GGML_TYPE_Q5_0_R4: break;
case GGML_TYPE_Q6_0_R4: break;