author    Kawrakow <iwankawrakow@gmail.com>    2025-02-05 13:49:39 +0200
committer GitHub <noreply@github.com>          2025-02-05 13:49:39 +0200
commit    8b7536bda8b65107794c4df710f14ddfde430160 (patch)
tree      97a9dea70458bddcef51c734e22026ac51b51ed7
parent    ecf111a11ca56ff0731308f94bd6c5e96658b6ef (diff)
IQ1_S_R4: better 1.5 bpw quants (#185)
* iq1_s_r4: basics - quantize/dequantize

* iq1_s_r4: gemm/gemv works on AVX2/Zen4

* Don't forget to make sure we have a multiple of 4 rows per thread

* iq1_s_r4: this is better

* iq1_s_r4: fix Zen4 after AVX2 changes

* iq1_s_r4: NEON gemm/gemv

* iq1_s_r4: more bits for shared experts

  With this mix we arrive at PPL(512) = 9.4140 for Deepseek-Lite using
  1.766 bpw for the repeating layers. On the Ryzen-7950X we get
  PP-512 = 494 t/s and TG-128 = 52 t/s @ 16 threads.

* Forgotten counter increment

* iq1_s_r4: slightly faster AVX2/Zen4 gemm/gemv

* Compiler warnings

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
-rw-r--r--  examples/quantize/quantize.cpp      2
-rw-r--r--  ggml/include/ggml.h                 2
-rw-r--r--  ggml/src/ggml-common.h              6
-rw-r--r--  ggml/src/ggml-quants.c            289
-rw-r--r--  ggml/src/ggml-quants.h              5
-rw-r--r--  ggml/src/ggml.c                    27
-rw-r--r--  ggml/src/iqk/iqk_mul_mat.cpp      703
-rw-r--r--  ggml/src/iqk/iqk_quantize.cpp     106
-rw-r--r--  ggml/src/iqk/iqk_quantize.h         6
-rw-r--r--  include/llama.h                     1
-rw-r--r--  src/llama.cpp                      50
11 files changed, 1104 insertions, 93 deletions
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 5ffdbc84..1c847e6b 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -29,6 +29,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
{ "IQ2_M_R4", LLAMA_FTYPE_MOSTLY_IQ2_M_R4, " 2.7 bpw quantization", },
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
+ { "IQ1_S_R4", LLAMA_FTYPE_MOSTLY_IQ1_S_R4, " 1.5 bpw quantization", },
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
{ "IQ1_BN", LLAMA_FTYPE_MOSTLY_IQ1_BN, " 1.62 bpw quantization (Bitnet)", },
{ "IQ2_BN", LLAMA_FTYPE_MOSTLY_IQ2_BN, " 2.00 bpw quantization (Bitnet)", },
@@ -510,6 +511,7 @@ int main(int argc, char ** argv) {
params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 ||
params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4 ||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+ params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4 ||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M)) {
fprintf(stderr, "\n==========================================================================================================\n");
fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
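For reference, a minimal sketch (not part of the commit) of requesting the new type through the public API; the file names are placeholders, and per the warning above an imatrix should be supplied (the quantize tool takes one via its --imatrix option):

    #include "llama.h"

    int main(void) {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype = LLAMA_FTYPE_MOSTLY_IQ1_S_R4; // added by this commit in include/llama.h
        // input/output paths below are placeholders
        return llama_model_quantize("model-f16.gguf", "model-iq1_s_r4.gguf", &params);
    }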
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 5eea7dcd..9668dc32 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -427,6 +427,7 @@ extern "C" {
GGML_TYPE_IQ2_XXS_R4= 216,
GGML_TYPE_IQ2_XS_R4 = 217,
GGML_TYPE_IQ3_XXS_R4= 218,
+ GGML_TYPE_IQ1_S_R4 = 219,
GGML_TYPE_IQ4_NL_R4 = 220,
GGML_TYPE_IQ3_S_R4 = 221,
GGML_TYPE_IQ2_S_R4 = 222,
@@ -510,6 +511,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ2_XXS_R4= 215, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ2_XS_R4 = 216, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ3_XXS_R4= 217, // except 1d tensors
+ GGML_FTYPE_MOSTLY_IQ1_S_R4 = 218, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ4_NL_R4 = 219, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ3_S_R4 = 220, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ2_S_R4 = 221, // except 1d tensors
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index 023b0b63..14813161 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -485,6 +485,12 @@ typedef struct {
} block_iq1_s;
static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
+typedef struct {
+ uint8_t qs[16];
+ uint16_t qh[4];
+} block_iq1_s_r4;
+static_assert(sizeof(block_iq1_s_r4) == 24, "wrong iq1_s_r4 block size/padding");
+
// 1.75 bpw
typedef struct {
uint8_t qs[QK_K/8]; // grid index, low 8 bits
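A quick sanity check on the advertised 1.5 bpw, assuming (per the 4-row interleave reported in the MulMat tables below) that each block_iq1_s_r4 packs 4 rows of 32 weights, with qs holding the low 8 bits of four grid indices per row and qh one uint16_t per row for the high bits plus shift:

    #include <stdio.h>

    int main(void) {
        const int bytes   = 24;     // sizeof(block_iq1_s_r4)
        const int weights = 4 * 32; // 4 interleaved rows x 32 quants
        printf("%.2f bpw\n", 8.0 * bytes / weights); // prints 1.50
        // The 2-byte per-row meta (row_meta_size in ggml.c) adds only
        // ~0.004 bpw at 4096 columns, keeping the total at ~1.5 bpw.
        return 0;
    }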
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 391d9e2e..3c4711f3 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -13991,6 +13991,105 @@ static int iq1_sort_helper(const void * left, const void * right) {
return *l < *r ? -1 : *l > *r ? 1 : 0;
}
+void iq1s_process_1block(int block_size, const float * xb, const float * weight, int8_t * L, float * the_scale, uint16_t * the_index, int * the_shift,
+ float * pairs, float * sumx, float * sumw) {
+ float max = fabsf(xb[0]);
+ for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
+ if (max < GROUP_MAX_EPS_IQ1_S) {
+ *the_scale = 0;
+ *the_shift = 1;
+ for (int k = 0; k < block_size/8; ++k) the_index[k] = 0;
+ return;
+ }
+ const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
+ const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
+ const int * kmap_q2xs = iq2_data[gindex].map;
+ const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
+
+ GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
+ GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
+ GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
+
+ const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA};
+ const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA};
+
+ // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
+ // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
+ // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
+ // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
+ // Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum scale
+ // and score for each possible split.
+ int * idx = (int *)(pairs + 1);
+ for (int j = 0; j < block_size; ++j) {
+ pairs[2*j] = xb[j];
+ idx[2*j] = j;
+ }
+ qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
+ {
+ sumx[0] = sumw[0] = 0;
+ for (int j = 0; j < block_size; ++j) {
+ int i = idx[2*j];
+ sumx[j+1] = sumx[j] + weight[i]*xb[i];
+ sumw[j+1] = sumw[j] + weight[i];
+ }
+ }
+ float best_score = -FLT_MIN, scale = max;
+ int besti1 = -1, besti2 = -1, best_shift = 0;
+ for (int i1 = 0; i1 <= block_size; ++i1) {
+ for (int i2 = i1; i2 <= block_size; ++i2) {
+ float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2];
+ float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2];
+ if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
+ scale = sumqx/sumq2; best_score = scale*sumqx;
+ besti1 = i1; besti2 = i2; best_shift = 1;
+ }
+ sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2];
+ sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2];
+ if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
+ scale = sumqx/sumq2; best_score = scale*sumqx;
+ besti1 = i1; besti2 = i2; best_shift = -1;
+ }
+ }
+ }
+ GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0);
+ for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
+ for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
+ for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
+ if (scale < 0) {
+ for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
+ scale = -scale; best_shift = -best_shift;
+ }
+ bool all_on_grid = true;
+ const float * xx = best_shift == 1 ? x_p : x_m;
+ for (int k = 0; k < block_size/8; ++k) {
+ uint16_t u = 0;
+ for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
+ int grid_index = kmap_q2xs[u];
+ if (grid_index < 0) {
+ all_on_grid = false;
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+ grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
+ GGML_ASSERT(grid_index >= 0);
+ }
+ the_index[k] = grid_index;
+ }
+ if (!all_on_grid) {
+ float sumqx = 0, sumq2 = 0;
+ for (int k = 0; k < block_size/8; ++k) {
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + the_index[k]);
+ for (int j = 0; j < 8; ++j) {
+ float w = weight[8*k + j];
+ float q = xx[(pg[j] - 1)/2];
+ sumqx += w*q*xb[8*k+j];
+ sumq2 += w*q*q;
+ }
+ }
+ if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2;
+ }
+ *the_scale = scale;
+ *the_shift = best_shift;
+}
+
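The closed-form scale used above follows from the weighted least-squares problem: for a fixed split assigning quant values q_i to the sorted weights, minimizing sum_i w_i*(x_i - s*q_i)^2 over s gives s = sumqx/sumq2, with sumqx = sum_i w_i*q_i*x_i and sumq2 = sum_i w_i*q_i^2, and the residual error is sum_i w_i*x_i^2 - sumqx^2/sumq2. Maximizing sumqx^2/sumq2 therefore selects the best split; the code tracks best_score = s*sumqx and compares sumqx*sumqx > best_score*sumq2 so the inner loop avoids the division.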
#define IQ1S_BLOCK_SIZE 32
#define IQ1M_BLOCK_SIZE 16
static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
@@ -14021,11 +14120,10 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
const int block_size = IQ1S_BLOCK_SIZE;
- const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA};
- const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA};
-
+ //const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA};
+ //const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA};
- int * idx = (int *)(pairs + 1);
+ //int * idx = (int *)(pairs + 1);
for (int ibl = 0; ibl < nbl; ++ibl) {
@@ -14044,95 +14142,100 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
const float * xb = xbl + block_size*ib;
const float * qw = quant_weights + QK_K*ibl + block_size*ib;
for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
- float max = fabsf(xb[0]);
- for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
- if (max < GROUP_MAX_EPS_IQ1_S) {
- scales[ib] = 0;
- memset(L, 1, block_size);
- continue;
- }
- // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
- // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
- // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
- // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
- // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale
- // for each possible and score for each split.
- for (int j = 0; j < block_size; ++j) {
- pairs[2*j] = xb[j];
- idx[2*j] = j;
- }
- qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
- {
- sumx[0] = sumw[0] = 0;
- for (int j = 0; j < block_size; ++j) {
- int i = idx[2*j];
- sumx[j+1] = sumx[j] + weight[i]*xb[i];
- sumw[j+1] = sumw[j] + weight[i];
- }
- }
- float best_score = -FLT_MIN, scale = max;
- int besti1 = -1, besti2 = -1, best_shift = 0;
- for (int i1 = 0; i1 <= block_size; ++i1) {
- for (int i2 = i1; i2 <= block_size; ++i2) {
- float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2];
- float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2];
- if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
- scale = sumqx/sumq2; best_score = scale*sumqx;
- besti1 = i1; besti2 = i2; best_shift = 1;
- }
- sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2];
- sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2];
- if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
- scale = sumqx/sumq2; best_score = scale*sumqx;
- besti1 = i1; besti2 = i2; best_shift = -1;
- }
- }
- }
- GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0);
- for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
- for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
- for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
- if (scale < 0) {
- for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
- scale = -scale; best_shift = -best_shift;
- }
- bool all_on_grid = true;
- const float * xx = best_shift == 1 ? x_p : x_m;
- for (int k = 0; k < block_size/8; ++k) {
- uint16_t u = 0;
- for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
- int grid_index = kmap_q2xs[u];
- if (grid_index < 0) {
- all_on_grid = false;
- const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
- grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
- GGML_ASSERT(grid_index >= 0);
- }
- index[k] = grid_index;
- }
- if (!all_on_grid) {
- float sumqx = 0, sumq2 = 0;
- for (int k = 0; k < block_size/8; ++k) {
- const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
- for (int j = 0; j < 8; ++j) {
- float w = weight[8*k + j];
- float q = xx[(pg[j] - 1)/2];
- sumqx += w*q*xb[8*k+j];
- sumq2 += w*q*q;
- }
- }
- if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2;
- }
+ int best_shift;
+ iq1s_process_1block(block_size, xb, weight, L, &scales[ib], index, &best_shift, pairs, sumx, sumw);
+
+// float max = fabsf(xb[0]);
+// for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
+// if (max < GROUP_MAX_EPS_IQ1_S) {
+// scales[ib] = 0;
+// memset(L, 1, block_size);
+// continue;
+// }
+// // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
+// // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
+// // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
+// // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
+// // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale
+// // for each possible and score for each split.
+// for (int j = 0; j < block_size; ++j) {
+// pairs[2*j] = xb[j];
+// idx[2*j] = j;
+// }
+// qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
+// {
+// sumx[0] = sumw[0] = 0;
+// for (int j = 0; j < block_size; ++j) {
+// int i = idx[2*j];
+// sumx[j+1] = sumx[j] + weight[i]*xb[i];
+// sumw[j+1] = sumw[j] + weight[i];
+// }
+// }
+// float best_score = -FLT_MIN, scale = max;
+// int besti1 = -1, besti2 = -1, best_shift = 0;
+// for (int i1 = 0; i1 <= block_size; ++i1) {
+// for (int i2 = i1; i2 <= block_size; ++i2) {
+// float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2];
+// float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2];
+// if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
+// scale = sumqx/sumq2; best_score = scale*sumqx;
+// besti1 = i1; besti2 = i2; best_shift = 1;
+// }
+// sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2];
+// sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2];
+// if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
+// scale = sumqx/sumq2; best_score = scale*sumqx;
+// besti1 = i1; besti2 = i2; best_shift = -1;
+// }
+// }
+// }
+// GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0);
+// for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
+// for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
+// for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
+// if (scale < 0) {
+// for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
+// scale = -scale; best_shift = -best_shift;
+// }
+// bool all_on_grid = true;
+// const float * xx = best_shift == 1 ? x_p : x_m;
+// for (int k = 0; k < block_size/8; ++k) {
+// uint16_t u = 0;
+// for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
+// int grid_index = kmap_q2xs[u];
+// if (grid_index < 0) {
+// all_on_grid = false;
+// const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+// grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
+// GGML_ASSERT(grid_index >= 0);
+// }
+// index[k] = grid_index;
+// }
+// if (!all_on_grid) {
+// float sumqx = 0, sumq2 = 0;
+// for (int k = 0; k < block_size/8; ++k) {
+// const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
+// for (int j = 0; j < 8; ++j) {
+// float w = weight[8*k + j];
+// float q = xx[(pg[j] - 1)/2];
+// sumqx += w*q*xb[8*k+j];
+// sumq2 += w*q*q;
+// }
+// }
+// if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2;
+// }
uint16_t h = 0;
for (int k = 0; k < block_size/8; ++k) {
y[ibl].qs[(block_size/8)*ib + k] = index[k] & 255;
h |= (index[k] >> 8) << 3*k;
}
y[ibl].qh[ib] = h;
- GGML_ASSERT(scale >= 0);
- scales[ib] = scale;
+ GGML_ASSERT(scales[ib] >= 0);
+ max_scale = MAX(max_scale, scales[ib]);
+ //GGML_ASSERT(scale >= 0);
+ //scales[ib] = scale;
shifts[ib] = best_shift;
- max_scale = MAX(max_scale, scale);
+ //max_scale = MAX(max_scale, scale);
}
if (!max_scale) {
@@ -14171,6 +14274,19 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t n
return nrow * nblock * sizeof(block_iq1_s);
}
+void quantize_row_iq1_s_ref (const float * GGML_RESTRICT x, block_iq1_s * GGML_RESTRICT y, int64_t k) {
+ int nblock = k/QK_K;
+ float qw[QK_K];
+ for (int j = 0; j < QK_K; ++j) qw[j] = 1;
+ for (int ibl = 0; ibl < nblock; ++ibl) {
+ quantize_iq1_s(x + ibl*QK_K, &y[ibl], 1, QK_K, qw);
+ }
+}
+
+void quantize_row_iq1_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
+ quantize_row_iq1_s_ref(x, (block_iq1_s *)y, k);
+}
+
static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
float * scales,
float * weight,
@@ -15129,6 +15245,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
case GGML_TYPE_IQ3_XXS_R4: break;
case GGML_TYPE_IQ3_S_R4: break;
case GGML_TYPE_IQ2_S_R4: break;
+ case GGML_TYPE_IQ1_S_R4: break;
case GGML_TYPE_Q4_0_R4: break;
case GGML_TYPE_Q5_0_R4: break;
case GGML_TYPE_Q6_0_R4: break;
diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
index b6d69011..4753f342 100644
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -42,6 +42,7 @@ void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGM
void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
void quantize_row_iq1_bn_ref (const float * GGML_RESTRICT x, block_iq1_bn * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq1_s_ref (const float * GGML_RESTRICT x, block_iq1_s * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -66,6 +67,7 @@ void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq1_bn (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq1_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
// Dequantization
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
@@ -148,6 +150,9 @@ void iq2xs_free_impl(enum ggml_type type);
void iq3xs_init_impl(int grid_size);
void iq3xs_free_impl(int grid_size);
+void iq1s_process_1block(int block_size, const float * xb, const float * weight, int8_t * L,
+ float * the_scale, uint16_t * the_index, int * the_shift, float * pairs, float * sumx, float * sumw);
+
#if defined(__ARM_FEATURE_SVE)
extern int ggml_sve_cnt_b;
#endif
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index b3c8a951..64b7d3ce 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1176,13 +1176,26 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.type_size = sizeof(block_iq1_s),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq1_s,
- .from_float = NULL,
- .from_float_ref = NULL,
+ .from_float = quantize_row_iq1_s,
+ .from_float_ref = (ggml_from_float_t)quantize_row_iq1_s_ref,
.vec_dot = ggml_vec_dot_iq1_s_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.row_meta_size = 0,
},
+ [GGML_TYPE_IQ1_S_R4] = {
+ .type_name = "iq1_s_r4",
+ .blck_size = 32,
+ .type_size = sizeof(block_iq1_s_r4)/4,
+ .is_quantized = true,
+ .to_float = (ggml_to_float_t) dequantize_row_iq1_s_r4,
+ .from_float = quantize_row_iq1_s_r4,
+ .from_float_ref = (ggml_from_float_t)quantize_row_iq1_s_r4_ref,
+ .vec_dot = vec_dot_iq1_s_r4_q8_k,
+ .vec_dot_type = GGML_TYPE_Q8_1_X4,
+ .nrows = 1,
+ .row_meta_size = 2,
+ },
[GGML_TYPE_IQ1_M] = {
.type_name = "iq1_m",
.blck_size = QK_K,
@@ -4387,6 +4400,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
case GGML_FTYPE_MOSTLY_IQ3_S_R4: wtype = GGML_TYPE_IQ3_S_R4; break;
case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
case GGML_FTYPE_MOSTLY_IQ2_S_R4: wtype = GGML_TYPE_IQ2_S_R4; break;
+ case GGML_FTYPE_MOSTLY_IQ1_S_R4: wtype = GGML_TYPE_IQ1_S_R4; break;
case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break;
case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break;
case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
@@ -10934,6 +10948,7 @@ static void ggml_compute_forward_add(
case GGML_TYPE_IQ3_S_R4:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
+ case GGML_TYPE_IQ1_S_R4:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
@@ -11402,6 +11417,7 @@ static void ggml_compute_forward_add1(
case GGML_TYPE_IQ3_S_R4:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
+ case GGML_TYPE_IQ1_S_R4:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
@@ -11567,6 +11583,7 @@ static void ggml_compute_forward_acc(
case GGML_TYPE_IQ3_S_R4:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
+ case GGML_TYPE_IQ1_S_R4:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
@@ -14805,6 +14822,7 @@ static void ggml_compute_forward_out_prod(
case GGML_TYPE_IQ3_S_R4:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
+ case GGML_TYPE_IQ1_S_R4:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
@@ -15210,6 +15228,7 @@ static void ggml_compute_forward_set(
case GGML_TYPE_IQ3_S_R4:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
+ case GGML_TYPE_IQ1_S_R4:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
@@ -15509,6 +15528,7 @@ static void ggml_compute_forward_get_rows(
case GGML_TYPE_IQ3_S_R4:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
+ case GGML_TYPE_IQ1_S_R4:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
@@ -16137,6 +16157,7 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_IQ3_S_R4:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_S_R4:
+ case GGML_TYPE_IQ1_S_R4:
case GGML_TYPE_Q8_K:
case GGML_TYPE_Q8_K64:
case GGML_TYPE_Q8_K16:
@@ -22893,6 +22914,7 @@ void ggml_quantize_init(enum ggml_type type) {
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break;
+ case GGML_TYPE_IQ1_S_R4:iq2xs_init_impl(GGML_TYPE_IQ1_S); break;
case GGML_TYPE_IQ3_XXS_R4:
case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
case GGML_TYPE_IQ3_S_R4:
@@ -22975,6 +22997,7 @@ size_t ggml_quantize_chunk(
case GGML_TYPE_IQ3_S_R4:result = quantize_iq3_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_S_R4:result = quantize_iq2_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ1_S_R4:result = quantize_iq1_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ1_BN: result = quantize_iq1_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
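The type-traits entry above implies the per-row storage, as this sketch (helper name is a placeholder) spells out: blck_size = 32 with type_size = sizeof(block_iq1_s_r4)/4 = 6 bytes means 6 bytes per 32 weights per row once the 4-row interleave is divided out, plus the 2-byte row_meta_size header:

    #include <stddef.h>
    #include <stdint.h>

    static size_t iq1_s_r4_row_bytes(int64_t n_per_row) {
        return 2 + (size_t)(n_per_row / 32) * 6; // row meta + 6 B per 32 weights
    }
    // e.g. n_per_row = 4096 -> 770 bytes -> 770*8/4096 ~= 1.504 bpw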
diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp
index f633229d..559cff05 100644
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -259,6 +259,7 @@ struct MulMat {
case GGML_TYPE_IQ2_XS_R4:
case GGML_TYPE_IQ2_S_R4:
case GGML_TYPE_IQ3_XXS_R4:
+ case GGML_TYPE_IQ1_S_R4:
case GGML_TYPE_IQ3_S_R4: return 4;
case GGML_TYPE_IQ4_NL_R4:
case GGML_TYPE_Q5_0_R4:
@@ -293,6 +294,7 @@ struct MulMat {
case GGML_TYPE_IQ2_S_R4:
case GGML_TYPE_IQ3_XXS_R4:
case GGML_TYPE_IQ3_S_R4:
+ case GGML_TYPE_IQ1_S_R4:
case GGML_TYPE_IQ2_BN_R4: return 4;
case GGML_TYPE_IQ4_XS_R4:
case GGML_TYPE_Q4_0_R4:
@@ -375,6 +377,523 @@ inline void make_q4_scales(const uint8_t * scales8, uint32_t * aux32) {
aux32[0] = a0 & 0x3f3f3f3f;
}
+#ifdef __AVX2__
+static const uint64_t iq1s_grid_us[2048] = {
+ 0x0000000000000000, 0x0000000000000002, 0x0000000000000101, 0x0000000000000200,
+ 0x0000000000000202, 0x0000000000010001, 0x0000000000010101, 0x0000000000020000,
+ 0x0000000000020002, 0x0000000000020200, 0x0000000000020202, 0x0000000001000101,
+ 0x0000000001010001, 0x0000000001010100, 0x0000000001010102, 0x0000000001020101,
+ 0x0000000002000000, 0x0000000002000002, 0x0000000002000200, 0x0000000002000202,
+ 0x0000000002010101, 0x0000000002020000, 0x0000000002020002, 0x0000000002020200,
+ 0x0000000002020202, 0x0000000100000100, 0x0000000100000101, 0x0000000100010001,
+ 0x0000000100010100, 0x0000000100010102, 0x0000000100010201, 0x0000000100010202,
+ 0x0000000100020101, 0x0000000101000001, 0x0000000101000102, 0x0000000101000201,
+ 0x0000000101010002, 0x0000000101010101, 0x0000000101010202, 0x0000000101020001,
+ 0x0000000101020100, 0x0000000101020102, 0x0000000101020200, 0x0000000102000101,
+ 0x0000000102010001, 0x0000000102010100, 0x0000000102010102, 0x0000000102020101,
+ 0x0000000200000000, 0x0000000200000002, 0x0000000200000200, 0x0000000200000202,
+ 0x0000000200010101, 0x0000000200020000, 0x0000000200020002, 0x0000000200020200,
+ 0x0000000200020202, 0x0000000201000101, 0x0000000201010001, 0x0000000201010201,
+ 0x0000000201020100, 0x0000000201020201, 0x0000000202000000, 0x0000000202000002,
+ 0x0000000202000200, 0x0000000202000202, 0x0000000202010001, 0x0000000202010101,
+ 0x0000000202010201, 0x0000000202020000, 0x0000000202020002, 0x0000000202020200,
+ 0x0000000202020202, 0x0000010000010001, 0x0000010000010100, 0x0000010000010102,
+ 0x0000010000020101, 0x0000010001000001, 0x0000010001000201, 0x0000010001010101,
+ 0x0000010001010202, 0x0000010001020100, 0x0000010001020101, 0x0000010002010001,
+ 0x0000010002010201, 0x0000010002020101, 0x0000010100000001, 0x0000010100000100,
+ 0x0000010100000101, 0x0000010100000102, 0x0000010100010101, 0x0000010100010200,
+ 0x0000010100010202, 0x0000010100020201, 0x0000010101000000, 0x0000010101000101,
+ 0x0000010101000202, 0x0000010101010000, 0x0000010101010001, 0x0000010101010100,
+ 0x0000010101010101, 0x0000010101010102, 0x0000010101010201, 0x0000010101020000,
+ 0x0000010101020002, 0x0000010101020101, 0x0000010101020200, 0x0000010101020202,
+ 0x0000010102000001, 0x0000010102010001, 0x0000010102010101, 0x0000010102010200,
+ 0x0000010102010202, 0x0000010102020001, 0x0000010102020100, 0x0000010102020101,
+ 0x0000010102020102, 0x0000010102020201, 0x0000010200010100, 0x0000010200010201,
+ 0x0000010201000001, 0x0000010201000100, 0x0000010201010000, 0x0000010201010002,
+ 0x0000010201010101, 0x0000010201010200, 0x0000010201020000, 0x0000010201020001,
+ 0x0000010201020102, 0x0000010201020201, 0x0000010202000101, 0x0000010202010001,
+ 0x0000010202010100, 0x0000010202010201, 0x0000020000000000, 0x0000020000000002,
+ 0x0000020000000200, 0x0000020000000202, 0x0000020000010101, 0x0000020000020000,
+ 0x0000020000020002, 0x0000020000020200, 0x0000020000020202, 0x0000020001000101,
+ 0x0000020001010001, 0x0000020001010102, 0x0000020001020101, 0x0000020002000000,
+ 0x0000020002000002, 0x0000020002000200, 0x0000020002000202, 0x0000020002010101,
+ 0x0000020002020000, 0x0000020002020002, 0x0000020002020200, 0x0000020002020202,
+ 0x0000020100000101, 0x0000020100010001, 0x0000020100010100, 0x0000020100010201,
+ 0x0000020100020100, 0x0000020100020101, 0x0000020101000001, 0x0000020101010000,
+ 0x0000020101010001, 0x0000020101010101, 0x0000020101020001, 0x0000020101020100,
+ 0x0000020101020201, 0x0000020102010001, 0x0000020102010100, 0x0000020102010102,
+ 0x0000020102010201, 0x0000020102020101, 0x0000020200000000, 0x0000020200000002,
+ 0x0000020200000200, 0x0000020200000202, 0x0000020200010101, 0x0000020200020000,
+ 0x0000020200020002, 0x0000020200020200, 0x0000020200020202, 0x0000020201000101,
+ 0x0000020201010001, 0x0000020201010201, 0x0000020201020001, 0x0000020201020101,
+ 0x0000020202000000, 0x0000020202000002, 0x0000020202000101, 0x0000020202000200,
+ 0x0000020202000202, 0x0000020202010101, 0x0000020202020000, 0x0000020202020002,
+ 0x0000020202020200, 0x0000020202020202, 0x0001000000010000, 0x0001000000010001,
+ 0x0001000000010100, 0x0001000000010201, 0x0001000000020100, 0x0001000000020101,
+ 0x0001000001000001, 0x0001000001000100, 0x0001000001010000, 0x0001000001010101,
+ 0x0001000001010200, 0x0001000001020001, 0x0001000001020100, 0x0001000001020101,
+ 0x0001000001020201, 0x0001000002010001, 0x0001000002010100, 0x0001000002010102,
+ 0x0001000002020001, 0x0001000002020101, 0x0001000100000001, 0x0001000100000100,
+ 0x0001000100000102, 0x0001000100000201, 0x0001000100010000, 0x0001000100010002,
+ 0x0001000100010101, 0x0001000100010200, 0x0001000100020001, 0x0001000100020100,
+ 0x0001000100020201, 0x0001000101000101, 0x0001000101000202, 0x0001000101010000,
+ 0x0001000101010001, 0x0001000101010002, 0x0001000101010100, 0x0001000101010101,
+ 0x0001000101010102, 0x0001000101010201, 0x0001000101020000, 0x0001000101020101,
+ 0x0001000102000100, 0x0001000102010002, 0x0001000102010101, 0x0001000102020001,
+ 0x0001000102020100, 0x0001000200010001, 0x0001000200010100, 0x0001000200010102,
+ 0x0001000200020101, 0x0001000201000000, 0x0001000201000102, 0x0001000201000201,
+ 0x0001000201010002, 0x0001000201010101, 0x0001000201010200, 0x0001000201010202,
+ 0x0001000201020100, 0x0001000201020102, 0x0001000202000101, 0x0001000202010001,
+ 0x0001000202010100, 0x0001000202010102, 0x0001000202020101, 0x0001010000000001,
+ 0x0001010000000102, 0x0001010000000201, 0x0001010000010100, 0x0001010000010101,
+ 0x0001010000010200, 0x0001010000010201, 0x0001010000020001, 0x0001010000020102,
+ 0x0001010001000001, 0x0001010001000101, 0x0001010001000102, 0x0001010001000200,
+ 0x0001010001000202, 0x0001010001010001, 0x0001010001010100, 0x0001010001010101,
+ 0x0001010001010102, 0x0001010001010201, 0x0001010001020002, 0x0001010001020101,
+ 0x0001010001020200, 0x0001010002000100, 0x0001010002000201, 0x0001010002010000,
+ 0x0001010002010100, 0x0001010002010101, 0x0001010002010200, 0x0001010002010201,
+ 0x0001010002010202, 0x0001010002020001, 0x0001010002020100, 0x0001010002020101,
+ 0x0001010002020201, 0x0001010100000002, 0x0001010100000101, 0x0001010100000202,
+ 0x0001010100010001, 0x0001010100010100, 0x0001010100010101, 0x0001010100010102,
+ 0x0001010100010201, 0x0001010100020000, 0x0001010100020002, 0x0001010100020101,
+ 0x0001010100020200, 0x0001010100020202, 0x0001010101000001, 0x0001010101000100,
+ 0x0001010101000101, 0x0001010101000102, 0x0001010101010001, 0x0001010101010002,
+ 0x0001010101010100, 0x0001010101010101, 0x0001010101010102, 0x0001010101010201,
+ 0x0001010101010202, 0x0001010101020001, 0x0001010101020100, 0x0001010101020101,
+ 0x0001010101020102, 0x0001010101020201, 0x0001010102000000, 0x0001010102000002,
+ 0x0001010102000100, 0x0001010102000101, 0x0001010102000200, 0x0001010102000202,
+ 0x0001010102010000, 0x0001010102010001, 0x0001010102010100, 0x0001010102010101,
+ 0x0001010102010102, 0x0001010102010201, 0x0001010102010202, 0x0001010102020000,
+ 0x0001010102020002, 0x0001010102020101, 0x0001010200000001, 0x0001010200000100,
+ 0x0001010200000101, 0x0001010200000102, 0x0001010200010101, 0x0001010200010102,
+ 0x0001010200010200, 0x0001010200010202, 0x0001010200020001, 0x0001010200020102,
+ 0x0001010201000000, 0x0001010201000002, 0x0001010201000100, 0x0001010201000101,
+ 0x0001010201000200, 0x0001010201000202, 0x0001010201010001, 0x0001010201010101,
+ 0x0001010201010102, 0x0001010201010200, 0x0001010201010201, 0x0001010201020001,
+ 0x0001010201020100, 0x0001010201020101, 0x0001010201020200, 0x0001010201020201,
+ 0x0001010201020202, 0x0001010202000102, 0x0001010202000202, 0x0001010202010002,
+ 0x0001010202010101, 0x0001010202020100, 0x0001010202020201, 0x0001020000010001,
+ 0x0001020000010102, 0x0001020000020101, 0x0001020001000001, 0x0001020001000100,
+ 0x0001020001000102, 0x0001020001000201, 0x0001020001010000, 0x0001020001010101,
+ 0x0001020001010200, 0x0001020001010202, 0x0001020001020000, 0x0001020001020001,
+ 0x0001020001020100, 0x0001020001020102, 0x0001020001020201, 0x0001020002000101,
+ 0x0001020002010001, 0x0001020002010100, 0x0001020002020101, 0x0001020100010000,
+ 0x0001020100010002, 0x0001020100010101, 0x0001020100010202, 0x0001020100020001,
+ 0x0001020100020101, 0x0001020101000002, 0x0001020101000100, 0x0001020101000101,
+ 0x0001020101000200, 0x0001020101010001, 0x0001020101010100, 0x0001020101010101,
+ 0x0001020101010102, 0x0001020101010201, 0x0001020101010202, 0x0001020101020000,
+ 0x0001020101020101, 0x0001020101020202, 0x0001020102000201, 0x0001020102010001,
+ 0x0001020102010002, 0x0001020102010101, 0x0001020102010200, 0x0001020102020001,
+ 0x0001020102020102, 0x0001020102020201, 0x0001020200000201, 0x0001020200010102,
+ 0x0001020200020100, 0x0001020200020102, 0x0001020201000100, 0x0001020201000102,
+ 0x0001020201000201, 0x0001020201010000, 0x0001020201010002, 0x0001020201010101,
+ 0x0001020201010200, 0x0001020201020001, 0x0001020201020102, 0x0001020201020201,
+ 0x0001020202000101, 0x0001020202010001, 0x0001020202010102, 0x0001020202010202,
+ 0x0002000000000000, 0x0002000000000002, 0x0002000000000200, 0x0002000000000202,
+ 0x0002000000010101, 0x0002000000020000, 0x0002000000020002, 0x0002000000020101,
+ 0x0002000000020200, 0x0002000000020202, 0x0002000001000101, 0x0002000001010001,
+ 0x0002000001010201, 0x0002000001020001, 0x0002000001020101, 0x0002000002000000,
+ 0x0002000002000002, 0x0002000002000200, 0x0002000002000202, 0x0002000002010101,
+ 0x0002000002020000, 0x0002000002020002, 0x0002000002020101, 0x0002000002020200,
+ 0x0002000002020202, 0x0002000100000101, 0x0002000100010001, 0x0002000100010100,
+ 0x0002000100010201, 0x0002000100020101, 0x0002000101000002, 0x0002000101000100,
+ 0x0002000101000201, 0x0002000101010101, 0x0002000101010200, 0x0002000101010202,
+ 0x0002000101020001, 0x0002000101020100, 0x0002000101020101, 0x0002000101020102,
+ 0x0002000102000101, 0x0002000102010000, 0x0002000102010102, 0x0002000102010201,
+ 0x0002000102020101, 0x0002000200000001, 0x0002000200000200, 0x0002000200000202,
+ 0x0002000200010001, 0x0002000200010101, 0x0002000200020000, 0x0002000200020002,
+ 0x0002000200020200, 0x0002000200020202, 0x0002000201000101, 0x0002000201010001,
+ 0x0002000201010102, 0x0002000201010201, 0x0002000201020101, 0x0002000202000001,
+ 0x0002000202000200, 0x0002000202000202, 0x0002000202010001, 0x0002000202010101,
+ 0x0002000202020000, 0x0002000202020002, 0x0002000202020200, 0x0002000202020202,
+ 0x0002010000000101, 0x0002010000010100, 0x0002010000010102, 0x0002010000010201,
+ 0x0002010000020101, 0x0002010001000100, 0x0002010001000101, 0x0002010001000102,
+ 0x0002010001000201, 0x0002010001010002, 0x0002010001010101, 0x0002010001010200,
+ 0x0002010001010202, 0x0002010001020102, 0x0002010002000101, 0x0002010002010001,
+ 0x0002010002010100, 0x0002010002010201, 0x0002010002020001, 0x0002010002020101,
+ 0x0002010100000201, 0x0002010100010101, 0x0002010100020001, 0x0002010100020201,
+ 0x0002010101000000, 0x0002010101000101, 0x0002010101000200, 0x0002010101010001,
+ 0x0002010101010100, 0x0002010101010101, 0x0002010101010201, 0x0002010101020002,
+ 0x0002010101020101, 0x0002010101020200, 0x0002010102000201, 0x0002010102010000,
+ 0x0002010102010100, 0x0002010102010101, 0x0002010102010200, 0x0002010102010202,
+ 0x0002010102020001, 0x0002010102020100, 0x0002010102020102, 0x0002010102020201,
+ 0x0002010200000101, 0x0002010200010000, 0x0002010200010002, 0x0002010200010201,
+ 0x0002010200020101, 0x0002010201000001, 0x0002010201000201, 0x0002010201010101,
+ 0x0002010201020000, 0x0002010201020001, 0x0002010201020201, 0x0002010202000100,
+ 0x0002010202000102, 0x0002010202010000, 0x0002010202010202, 0x0002020000000000,
+ 0x0002020000000002, 0x0002020000000200, 0x0002020000000202, 0x0002020000010101,
+ 0x0002020000020000, 0x0002020000020002, 0x0002020000020200, 0x0002020000020202,
+ 0x0002020001000101, 0x0002020001010001, 0x0002020001010100, 0x0002020001020101,
+ 0x0002020002000000, 0x0002020002000002, 0x0002020002000200, 0x0002020002000202,
+ 0x0002020002020000, 0x0002020002020002, 0x0002020002020200, 0x0002020002020202,
+ 0x0002020100000201, 0x0002020100010001, 0x0002020100010100, 0x0002020100010201,
+ 0x0002020100020101, 0x0002020101000102, 0x0002020101000201, 0x0002020101010002,
+ 0x0002020101010101, 0x0002020101020001, 0x0002020101020100, 0x0002020101020102,
+ 0x0002020101020201, 0x0002020102000101, 0x0002020102010000, 0x0002020102010102,
+ 0x0002020102010201, 0x0002020102020100, 0x0002020102020101, 0x0002020200000000,
+ 0x0002020200000002, 0x0002020200000200, 0x0002020200000202, 0x0002020200020000,
+ 0x0002020200020002, 0x0002020200020200, 0x0002020200020202, 0x0002020201000101,
+ 0x0002020201010001, 0x0002020201010102, 0x0002020201010201, 0x0002020201020101,
+ 0x0002020202000000, 0x0002020202000002, 0x0002020202000200, 0x0002020202000202,
+ 0x0002020202010101, 0x0002020202020000, 0x0002020202020002, 0x0002020202020200,
+ 0x0002020202020202, 0x0100000000000101, 0x0100000000010001, 0x0100000000010102,
+ 0x0100000000020101, 0x0100000001000201, 0x0100000001010002, 0x0100000001010101,
+ 0x0100000001010200, 0x0100000001010202, 0x0100000001020001, 0x0100000001020100,
+ 0x0100000001020102, 0x0100000002010100, 0x0100000002010201, 0x0100000002020001,
+ 0x0100000002020102, 0x0100000100000000, 0x0100000100000001, 0x0100000100000100,
+ 0x0100000100000102, 0x0100000100000201, 0x0100000100010002, 0x0100000100010101,
+ 0x0100000100010102, 0x0100000100010200, 0x0100000100010202, 0x0100000100020001,
+ 0x0100000100020102, 0x0100000100020201, 0x0100000101000101, 0x0100000101000200,
+ 0x0100000101000202, 0x0100000101010001, 0x0100000101010100, 0x0100000101010101,
+ 0x0100000101010102, 0x0100000101010201, 0x0100000101010202, 0x0100000101020101,
+ 0x0100000101020200, 0x0100000101020202, 0x0100000102000001, 0x0100000102000100,
+ 0x0100000102000102, 0x0100000102010000, 0x0100000102010002, 0x0100000102010101,
+ 0x0100000102020000, 0x0100000102020001, 0x0100000102020002, 0x0100000200000101,
+ 0x0100000200010001, 0x0100000200010100, 0x0100000200010102, 0x0100000200020101,
+ 0x0100000201000001, 0x0100000201010002, 0x0100000201010101, 0x0100000201010202,
+ 0x0100000201020100, 0x0100000201020201, 0x0100000202000201, 0x0100000202010100,
+ 0x0100000202020101, 0x0100010000000001, 0x0100010000010101, 0x0100010000010201,
+ 0x0100010000020201, 0x0100010001000101, 0x0100010001000200, 0x0100010001000202,
+ 0x0100010001010001, 0x0100010001010100, 0x0100010001010101, 0x0100010001010102,
+ 0x0100010001020001, 0x0100010001020002, 0x0100010001020101, 0x0100010001020200,
+ 0x0100010001020202, 0x0100010002000001, 0x0100010002000102, 0x0100010002000201,
+ 0x0100010002010000, 0x0100010002010002, 0x0100010002010101, 0x0100010002020000,
+ 0x0100010002020001, 0x0100010002020201, 0x0100010100000001, 0x0100010100000002,
+ 0x0100010100000101, 0x0100010100000202, 0x0100010100010001, 0x0100010100010100,
+ 0x0100010100010101, 0x0100010100010102, 0x0100010100010201, 0x0100010100020000,
+ 0x0100010100020101, 0x0100010100020202, 0x0100010101000001, 0x0100010101000100,
+ 0x0100010101000101, 0x0100010101000102, 0x0100010101000201, 0x0100010101010000,
+ 0x0100010101010001, 0x0100010101010100, 0x0100010101010101, 0x0100010101010102,
+ 0x0100010101010200, 0x0100010101010201, 0x0100010101020001, 0x0100010101020100,
+ 0x0100010101020101, 0x0100010101020102, 0x0100010101020201, 0x0100010102000002,
+ 0x0100010102000100, 0x0100010102000101, 0x0100010102000200, 0x0100010102010001,
+ 0x0100010102010100, 0x0100010102010101, 0x0100010102010102, 0x0100010102010201,
+ 0x0100010102010202, 0x0100010102020101, 0x0100010102020200, 0x0100010102020202,
+ 0x0100010200000001, 0x0100010200000101, 0x0100010200000201, 0x0100010200010100,
+ 0x0100010200010101, 0x0100010200010200, 0x0100010200010202, 0x0100010200020001,
+ 0x0100010200020100, 0x0100010200020201, 0x0100010201000000, 0x0100010201000002,
+ 0x0100010201000101, 0x0100010201000200, 0x0100010201010000, 0x0100010201010001,
+ 0x0100010201010002, 0x0100010201010101, 0x0100010201010102, 0x0100010201010201,
+ 0x0100010201020002, 0x0100010201020101, 0x0100010201020200, 0x0100010202000001,
+ 0x0100010202000101, 0x0100010202000202, 0x0100010202010100, 0x0100010202010101,
+ 0x0100010202020001, 0x0100010202020100, 0x0100010202020102, 0x0100020000000101,
+ 0x0100020000010001, 0x0100020000010101, 0x0100020000010202, 0x0100020000020101,
+ 0x0100020001000002, 0x0100020001000201, 0x0100020001010000, 0x0100020001010101,
+ 0x0100020001010200, 0x0100020001020001, 0x0100020001020100, 0x0100020001020102,
+ 0x0100020001020201, 0x0100020002000101, 0x0100020002010001, 0x0100020002010100,
+ 0x0100020002010102, 0x0100020002010201, 0x0100020002020101, 0x0100020100000001,
+ 0x0100020100000101, 0x0100020100000102, 0x0100020100000202, 0x0100020100010000,
+ 0x0100020100010100, 0x0100020100010101, 0x0100020100010200, 0x0100020100020001,
+ 0x0100020100020100, 0x0100020100020102, 0x0100020101000000, 0x0100020101000101,
+ 0x0100020101000202, 0x0100020101010001, 0x0100020101010002, 0x0100020101010100,
+ 0x0100020101010101, 0x0100020101010102, 0x0100020101010201, 0x0100020101020000,
+ 0x0100020101020002, 0x0100020101020101, 0x0100020101020102, 0x0100020101020202,
+ 0x0100020102000102, 0x0100020102000201, 0x0100020102010002, 0x0100020102010101,
+ 0x0100020102010102, 0x0100020102010200, 0x0100020102020001, 0x0100020102020100,
+ 0x0100020102020102, 0x0100020102020201, 0x0100020200010102, 0x0100020201000100,
+ 0x0100020201000102, 0x0100020201000201, 0x0100020201010101, 0x0100020201010200,
+ 0x0100020201010202, 0x0100020201020100, 0x0100020201020201, 0x0100020202010100,
+ 0x0100020202020101, 0x0101000000000001, 0x0101000000000100, 0x0101000000000101,
+ 0x0101000000000102, 0x0101000000000201, 0x0101000000010002, 0x0101000000010101,
+ 0x0101000000010202, 0x0101000000020001, 0x0101000000020100, 0x0101000000020201,
+ 0x0101000001000000, 0x0101000001000101, 0x0101000001000200, 0x0101000001010001,
+ 0x0101000001010100, 0x0101000001010101, 0x0101000001010102, 0x0101000001010201,
+ 0x0101000001020101, 0x0101000001020200, 0x0101000002000102, 0x0101000002000201,
+ 0x0101000002010101, 0x0101000002010200, 0x0101000002020000, 0x0101000002020001,
+ 0x0101000002020102, 0x0101000002020201, 0x0101000100000101, 0x0101000100000200,
+ 0x0101000100000201, 0x0101000100000202, 0x0101000100010001, 0x0101000100010100,
+ 0x0101000100010101, 0x0101000100010102, 0x0101000100010200, 0x0101000100010201,
+ 0x0101000100020000, 0x0101000100020101, 0x0101000100020102, 0x0101000100020200,
+ 0x0101000100020202, 0x0101000101000001, 0x0101000101000100, 0x0101000101000101,
+ 0x0101000101000102, 0x0101000101000201, 0x0101000101010000, 0x0101000101010001,
+ 0x0101000101010002, 0x0101000101010100, 0x0101000101010101, 0x0101000101010102,
+ 0x0101000101010200, 0x0101000101010201, 0x0101000101010202, 0x0101000101020001,
+ 0x0101000101020100, 0x0101000101020101, 0x0101000101020102, 0x0101000101020201,
+ 0x0101000102000002, 0x0101000102000101, 0x0101000102010001, 0x0101000102010100,
+ 0x0101000102010101, 0x0101000102010102, 0x0101000102010201, 0x0101000102020000,
+ 0x0101000102020101, 0x0101000102020202, 0x0101000200000001, 0x0101000200000102,
+ 0x0101000200010002, 0x0101000200010101, 0x0101000200010202, 0x0101000200020001,
+ 0x0101000200020100, 0x0101000201000002, 0x0101000201000101, 0x0101000201000202,
+ 0x0101000201010001, 0x0101000201010100, 0x0101000201010101, 0x0101000201010102,
+ 0x0101000201010201, 0x0101000201020002, 0x0101000201020101, 0x0101000202000101,
+ 0x0101000202010000, 0x0101000202010002, 0x0101000202010101, 0x0101000202010201,
+ 0x0101000202010202, 0x0101000202020100, 0x0101010000000100, 0x0101010000000101,
+ 0x0101010000010001, 0x0101010000010100, 0x0101010000010101, 0x0101010000010102,
+ 0x0101010000010200, 0x0101010000010201, 0x0101010000020001, 0x0101010000020101,
+ 0x0101010000020200, 0x0101010000020202, 0x0101010001000001, 0x0101010001000100,
+ 0x0101010001000101, 0x0101010001000102, 0x0101010001000201, 0x0101010001000202,
+ 0x0101010001010000, 0x0101010001010001, 0x0101010001010100, 0x0101010001010101,
+ 0x0101010001010102, 0x0101010001010200, 0x0101010001010201, 0x0101010001010202,
+ 0x0101010001020001, 0x0101010001020002, 0x0101010001020100, 0x0101010001020101,
+ 0x0101010001020102, 0x0101010001020201, 0x0101010002000000, 0x0101010002000200,
+ 0x0101010002000202, 0x0101010002010001, 0x0101010002010100, 0x0101010002010101,
+ 0x0101010002010102, 0x0101010002010201, 0x0101010002020001, 0x0101010002020100,
+ 0x0101010002020101, 0x0101010002020202, 0x0101010100000001, 0x0101010100000002,
+ 0x0101010100000100, 0x0101010100000101, 0x0101010100000102, 0x0101010100000201,
+ 0x0101010100010000, 0x0101010100010001, 0x0101010100010002, 0x0101010100010100,
+ 0x0101010100010101, 0x0101010100010102, 0x0101010100010201, 0x0101010100010202,
+ 0x0101010100020001, 0x0101010100020100, 0x0101010100020101, 0x0101010100020102,
+ 0x0101010100020201, 0x0101010101000000, 0x0101010101000001, 0x0101010101000002,
+ 0x0101010101000100, 0x0101010101000101, 0x0101010101000102, 0x0101010101000200,
+ 0x0101010101000201, 0x0101010101010000, 0x0101010101010001, 0x0101010101010002,
+ 0x0101010101010100, 0x0101010101010101, 0x0101010101010102, 0x0101010101010200,
+ 0x0101010101010201, 0x0101010101010202, 0x0101010101020000, 0x0101010101020001,
+ 0x0101010101020100, 0x0101010101020101, 0x0101010101020102, 0x0101010101020200,
+ 0x0101010101020201, 0x0101010101020202, 0x0101010102000001, 0x0101010102000100,
+ 0x0101010102000101, 0x0101010102000201, 0x0101010102000202, 0x0101010102010000,
+ 0x0101010102010001, 0x0101010102010100, 0x0101010102010101, 0x0101010102010102,
+ 0x0101010102010200, 0x0101010102010201, 0x0101010102020001, 0x0101010102020100,
+ 0x0101010102020101, 0x0101010102020102, 0x0101010102020201, 0x0101010200000000,
+ 0x0101010200000001, 0x0101010200000002, 0x0101010200000100, 0x0101010200000102,
+ 0x0101010200000200, 0x0101010200000201, 0x0101010200010001, 0x0101010200010100,
+ 0x0101010200010101, 0x0101010200010200, 0x0101010200010201, 0x0101010200020000,
+ 0x0101010200020001, 0x0101010200020002, 0x0101010200020100, 0x0101010200020101,
+ 0x0101010200020102, 0x0101010200020200, 0x0101010200020201, 0x0101010201000001,
+ 0x0101010201000101, 0x0101010201000102, 0x0101010201000200, 0x0101010201000201,
+ 0x0101010201000202, 0x0101010201010000, 0x0101010201010001, 0x0101010201010002,
+ 0x0101010201010100, 0x0101010201010101, 0x0101010201010102, 0x0101010201010200,
+ 0x0101010201010201, 0x0101010201010202, 0x0101010201020001, 0x0101010201020100,
+ 0x0101010201020101, 0x0101010201020201, 0x0101010202000002, 0x0101010202000101,
+ 0x0101010202000102, 0x0101010202000200, 0x0101010202000201, 0x0101010202000202,
+ 0x0101010202010001, 0x0101010202010101, 0x0101010202010202, 0x0101010202020002,
+ 0x0101010202020101, 0x0101010202020102, 0x0101010202020200, 0x0101010202020201,
+ 0x0101020000000100, 0x0101020000000101, 0x0101020000000102, 0x0101020000000201,
+ 0x0101020000010000, 0x0101020000010101, 0x0101020000010200, 0x0101020000020001,
+ 0x0101020000020202, 0x0101020001000101, 0x0101020001000200, 0x0101020001000202,
+ 0x0101020001010001, 0x0101020001010100, 0x0101020001010101, 0x0101020001010102,
+ 0x0101020001010200, 0x0101020001010201, 0x0101020001020000, 0x0101020001020002,
+ 0x0101020001020100, 0x0101020001020101, 0x0101020002000002, 0x0101020002000201,
+ 0x0101020002010000, 0x0101020002010002, 0x0101020002010101, 0x0101020002010200,
+ 0x0101020002020001, 0x0101020002020201, 0x0101020100000001, 0x0101020100000002,
+ 0x0101020100000101, 0x0101020100000202, 0x0101020100010001, 0x0101020100010100,
+ 0x0101020100010101, 0x0101020100010102, 0x0101020100010201, 0x0101020100020101,
+ 0x0101020101000001, 0x0101020101000100, 0x0101020101000101, 0x0101020101000102,
+ 0x0101020101000201, 0x0101020101010000, 0x0101020101010001, 0x0101020101010002,
+ 0x0101020101010100, 0x0101020101010101, 0x0101020101010102, 0x0101020101010200,
+ 0x0101020101010201, 0x0101020101010202, 0x0101020101020001, 0x0101020101020100,
+ 0x0101020101020101, 0x0101020101020102, 0x0101020101020201, 0x0101020102000001,
+ 0x0101020102000101, 0x0101020102000201, 0x0101020102010001, 0x0101020102010100,
+ 0x0101020102010101, 0x0101020102010102, 0x0101020102010200, 0x0101020102010201,
+ 0x0101020102020101, 0x0101020200000100, 0x0101020200000200, 0x0101020200010101,
+ 0x0101020200010202, 0x0101020200020000, 0x0101020200020101, 0x0101020200020102,
+ 0x0101020200020201, 0x0101020201000101, 0x0101020201000200, 0x0101020201000201,
+ 0x0101020201010001, 0x0101020201010101, 0x0101020201010102, 0x0101020201010200,
+ 0x0101020201010201, 0x0101020201020002, 0x0101020201020101, 0x0101020201020200,
+ 0x0101020201020202, 0x0101020202000001, 0x0101020202000202, 0x0101020202010002,
+ 0x0101020202010101, 0x0101020202010102, 0x0101020202010200, 0x0101020202010202,
+ 0x0101020202020001, 0x0102000000000101, 0x0102000000010100, 0x0102000000010102,
+ 0x0102000000010201, 0x0102000000020101, 0x0102000001000100, 0x0102000001010000,
+ 0x0102000001010101, 0x0102000001010102, 0x0102000001010200, 0x0102000001010202,
+ 0x0102000001020001, 0x0102000001020100, 0x0102000001020102, 0x0102000001020201,
+ 0x0102000002000001, 0x0102000002010102, 0x0102000002020101, 0x0102000100000001,
+ 0x0102000100000100, 0x0102000100000102, 0x0102000100000201, 0x0102000100010002,
+ 0x0102000100010101, 0x0102000100020001, 0x0102000100020002, 0x0102000100020102,
+ 0x0102000100020201, 0x0102000101000101, 0x0102000101000201, 0x0102000101010001,
+ 0x0102000101010101, 0x0102000101010102, 0x0102000101010201, 0x0102000101020101,
+ 0x0102000101020102, 0x0102000101020202, 0x0102000102000100, 0x0102000102000202,
+ 0x0102000102010002, 0x0102000102010101, 0x0102000102020001, 0x0102000102020102,
+ 0x0102000102020201, 0x0102000200010001, 0x0102000200010102, 0x0102000200010201,
+ 0x0102000201000000, 0x0102000201000001, 0x0102000201000102, 0x0102000201010101,
+ 0x0102000201010102, 0x0102000201010200, 0x0102000201020000, 0x0102000202000101,
+ 0x0102000202010001, 0x0102000202010102, 0x0102000202020101, 0x0102010000010001,
+ 0x0102010000010002, 0x0102010000010101, 0x0102010000010102, 0x0102010000010202,
+ 0x0102010000020001, 0x0102010000020102, 0x0102010000020201, 0x0102010001000000,
+ 0x0102010001000002, 0x0102010001000101, 0x0102010001000200, 0x0102010001000202,
+ 0x0102010001010001, 0x0102010001010100, 0x0102010001010101, 0x0102010001010102,
+ 0x0102010001010201, 0x0102010001010202, 0x0102010001020000, 0x0102010001020002,
+ 0x0102010001020101, 0x0102010002000100, 0x0102010002000101, 0x0102010002000201,
+ 0x0102010002010000, 0x0102010002010002, 0x0102010002010100, 0x0102010002010101,
+ 0x0102010002010102, 0x0102010002010200, 0x0102010002010202, 0x0102010002020001,
+ 0x0102010002020100, 0x0102010002020201, 0x0102010100000101, 0x0102010100000200,
+ 0x0102010100000202, 0x0102010100010001, 0x0102010100010101, 0x0102010100010102,
+ 0x0102010100010201, 0x0102010101000100, 0x0102010101000101, 0x0102010101000102,
+ 0x0102010101000201, 0x0102010101010000, 0x0102010101010001, 0x0102010101010100,
+ 0x0102010101010101, 0x0102010101010102, 0x0102010101010201, 0x0102010101020001,
+ 0x0102010101020100, 0x0102010101020101, 0x0102010101020102, 0x0102010101020201,
+ 0x0102010102000102, 0x0102010102000201, 0x0102010102000202, 0x0102010102010001,
+ 0x0102010102010101, 0x0102010102010102, 0x0102010102010201, 0x0102010102010202,
+ 0x0102010102020002, 0x0102010102020101, 0x0102010102020102, 0x0102010102020200,
+ 0x0102010200000002, 0x0102010200000201, 0x0102010200010101, 0x0102010200020000,
+ 0x0102010200020102, 0x0102010200020200, 0x0102010200020201, 0x0102010201000000,
+ 0x0102010201000101, 0x0102010201000200, 0x0102010201000202, 0x0102010201010001,
+ 0x0102010201010100, 0x0102010201010101, 0x0102010201010102, 0x0102010201010200,
+ 0x0102010201010202, 0x0102010201020000, 0x0102010201020101, 0x0102010201020200,
+ 0x0102010202000000, 0x0102010202000002, 0x0102010202000101, 0x0102010202000202,
+ 0x0102010202010100, 0x0102010202010102, 0x0102010202010200, 0x0102010202010201,
+ 0x0102010202020000, 0x0102010202020100, 0x0102010202020102, 0x0102010202020202,
+ 0x0102020000010102, 0x0102020000010201, 0x0102020000020101, 0x0102020001000001,
+ 0x0102020001010002, 0x0102020001010101, 0x0102020001010202, 0x0102020001020001,
+ 0x0102020001020201, 0x0102020002000101, 0x0102020002010001, 0x0102020002010200,
+ 0x0102020002020102, 0x0102020100000001, 0x0102020100000100, 0x0102020100010000,
+ 0x0102020100010101, 0x0102020100020001, 0x0102020100020100, 0x0102020100020102,
+ 0x0102020100020201, 0x0102020101000000, 0x0102020101000001, 0x0102020101000101,
+ 0x0102020101000102, 0x0102020101000200, 0x0102020101010001, 0x0102020101010100,
+ 0x0102020101010101, 0x0102020101010102, 0x0102020101010201, 0x0102020101020000,
+ 0x0102020101020101, 0x0102020101020202, 0x0102020102000002, 0x0102020102000100,
+ 0x0102020102000202, 0x0102020102010101, 0x0102020102020001, 0x0102020102020100,
+ 0x0102020102020101, 0x0102020102020201, 0x0102020200010001, 0x0102020200010102,
+ 0x0102020200010200, 0x0102020201000001, 0x0102020201000100, 0x0102020201000201,
+ 0x0102020201010000, 0x0102020201010101, 0x0102020201010200, 0x0102020201010202,
+ 0x0102020201020100, 0x0102020201020101, 0x0102020201020201, 0x0102020202000102,
+ 0x0102020202010100, 0x0102020202010200, 0x0102020202010202, 0x0102020202020102,
+ 0x0200000000000000, 0x0200000000000002, 0x0200000000000200, 0x0200000000000202,
+ 0x0200000000020000, 0x0200000000020002, 0x0200000000020200, 0x0200000000020202,
+ 0x0200000001000101, 0x0200000001010000, 0x0200000001010001, 0x0200000001010100,
+ 0x0200000001010102, 0x0200000001010201, 0x0200000001020101, 0x0200000002000000,
+ 0x0200000002000002, 0x0200000002000200, 0x0200000002000202, 0x0200000002010101,
+ 0x0200000002020000, 0x0200000002020002, 0x0200000002020200, 0x0200000002020202,
+ 0x0200000100000101, 0x0200000100010001, 0x0200000100010100, 0x0200000100010102,
+ 0x0200000100010201, 0x0200000100020101, 0x0200000101000001, 0x0200000101000100,
+ 0x0200000101000201, 0x0200000101010000, 0x0200000101010002, 0x0200000101010101,
+ 0x0200000101010102, 0x0200000101010200, 0x0200000101010201, 0x0200000101020100,
+ 0x0200000101020102, 0x0200000101020201, 0x0200000102000101, 0x0200000102000201,
+ 0x0200000102010100, 0x0200000102010102, 0x0200000102010201, 0x0200000102020101,
+ 0x0200000200000000, 0x0200000200000002, 0x0200000200000200, 0x0200000200000202,
+ 0x0200000200010101, 0x0200000200020000, 0x0200000200020002, 0x0200000200020200,
+ 0x0200000200020202, 0x0200000201010001, 0x0200000201010100, 0x0200000201010201,
+ 0x0200000201020101, 0x0200000202000000, 0x0200000202000002, 0x0200000202000200,
+ 0x0200000202000202, 0x0200000202010101, 0x0200000202020000, 0x0200000202020002,
+ 0x0200000202020200, 0x0200000202020202, 0x0200010000010100, 0x0200010000010201,
+ 0x0200010001000001, 0x0200010001000100, 0x0200010001010001, 0x0200010001010101,
+ 0x0200010001010202, 0x0200010001020001, 0x0200010001020100, 0x0200010001020201,
+ 0x0200010002010100, 0x0200010002010201, 0x0200010100000001, 0x0200010100000201,
+ 0x0200010100010002, 0x0200010100010101, 0x0200010100010202, 0x0200010100020102,
+ 0x0200010100020201, 0x0200010101000000, 0x0200010101000001, 0x0200010101000101,
+ 0x0200010101000200, 0x0200010101010001, 0x0200010101010100, 0x0200010101010101,
+ 0x0200010101010102, 0x0200010101010201, 0x0200010101010202, 0x0200010101020101,
+ 0x0200010101020102, 0x0200010101020200, 0x0200010101020202, 0x0200010102000001,
+ 0x0200010102000100, 0x0200010102000102, 0x0200010102000201, 0x0200010102010000,
+ 0x0200010102010002, 0x0200010102010101, 0x0200010102010200, 0x0200010102020102,
+ 0x0200010200010001, 0x0200010200010102, 0x0200010200010201, 0x0200010200020101,
+ 0x0200010201000001, 0x0200010201000100, 0x0200010201000201, 0x0200010201000202,
+ 0x0200010201010000, 0x0200010201010101, 0x0200010201010201, 0x0200010201010202,
+ 0x0200010201020001, 0x0200010201020102, 0x0200010201020202, 0x0200010202000101,
+ 0x0200010202010001, 0x0200010202010202, 0x0200010202020100, 0x0200020000000000,
+ 0x0200020000000002, 0x0200020000000200, 0x0200020000000202, 0x0200020000010101,
+ 0x0200020000020000, 0x0200020000020002, 0x0200020000020200, 0x0200020000020202,
+ 0x0200020001000001, 0x0200020001000101, 0x0200020001010001, 0x0200020001010100,
+ 0x0200020001010201, 0x0200020001020101, 0x0200020001020201, 0x0200020002000000,
+ 0x0200020002000002, 0x0200020002000200, 0x0200020002000202, 0x0200020002010101,
+ 0x0200020002020000, 0x0200020002020002, 0x0200020002020200, 0x0200020002020202,
+ 0x0200020100000101, 0x0200020100000102, 0x0200020100010001, 0x0200020100010100,
+ 0x0200020100010102, 0x0200020100020101, 0x0200020101000001, 0x0200020101000100,
+ 0x0200020101000102, 0x0200020101000201, 0x0200020101010000, 0x0200020101010002,
+ 0x0200020101010101, 0x0200020101010202, 0x0200020101020001, 0x0200020101020100,
+ 0x0200020102000101, 0x0200020102010102, 0x0200020102010201, 0x0200020102020101,
+ 0x0200020200000000, 0x0200020200000002, 0x0200020200000200, 0x0200020200000202,
+ 0x0200020200010101, 0x0200020200020000, 0x0200020200020002, 0x0200020200020200,
+ 0x0200020200020202, 0x0200020201000101, 0x0200020201010001, 0x0200020201010100,
+ 0x0200020201010102, 0x0200020202000000, 0x0200020202000002, 0x0200020202000200,
+ 0x0200020202000202, 0x0200020202010101, 0x0200020202020000, 0x0200020202020002,
+ 0x0200020202020200, 0x0200020202020202, 0x0201000000000101, 0x0201000000010001,
+ 0x0201000000010102, 0x0201000000010200, 0x0201000000010201, 0x0201000000020101,
+ 0x0201000001000001, 0x0201000001000102, 0x0201000001000201, 0x0201000001010101,
+ 0x0201000001010200, 0x0201000001010202, 0x0201000001020201, 0x0201000001020202,
+ 0x0201000002000101, 0x0201000002010001, 0x0201000002010100, 0x0201000002010102,
+ 0x0201000002010201, 0x0201000002020101, 0x0201000100000001, 0x0201000100000100,
+ 0x0201000100000102, 0x0201000100000201, 0x0201000100010000, 0x0201000100010101,
+ 0x0201000100010200, 0x0201000100010202, 0x0201000100020001, 0x0201000100020100,
+ 0x0201000100020102, 0x0201000100020201, 0x0201000101000000, 0x0201000101000101,
+ 0x0201000101010000, 0x0201000101010001, 0x0201000101010100, 0x0201000101010101,
+ 0x0201000101010102, 0x0201000101010201, 0x0201000101020002, 0x0201000101020101,
+ 0x0201000102000100, 0x0201000102000102, 0x0201000102010002, 0x0201000102010101,
+ 0x0201000102010200, 0x0201000102020001, 0x0201000102020100, 0x0201000102020102,
+ 0x0201000102020201, 0x0201000200000101, 0x0201000200010001, 0x0201000200010100,
+ 0x0201000200010201, 0x0201000200020101, 0x0201000201000100, 0x0201000201000102,
+ 0x0201000201000201, 0x0201000201010000, 0x0201000201010002, 0x0201000201010101,
+ 0x0201000201010200, 0x0201000201020102, 0x0201000201020201, 0x0201000202000101,
+ 0x0201000202010100, 0x0201000202010102, 0x0201000202020201, 0x0201010000000001,
+ 0x0201010000000100, 0x0201010000000102, 0x0201010000010000, 0x0201010000010101,
+ 0x0201010000010200, 0x0201010000020102, 0x0201010001000000, 0x0201010001000202,
+ 0x0201010001010001, 0x0201010001010100, 0x0201010001010101, 0x0201010001010102,
+ 0x0201010001010200, 0x0201010001010201, 0x0201010001020000, 0x0201010001020001,
+ 0x0201010001020002, 0x0201010001020101, 0x0201010002000100, 0x0201010002000102,
+ 0x0201010002010002, 0x0201010002010100, 0x0201010002010101, 0x0201010002010200,
+ 0x0201010002020001, 0x0201010002020201, 0x0201010100000000, 0x0201010100000101,
+ 0x0201010100000200, 0x0201010100000202, 0x0201010100010000, 0x0201010100010001,
+ 0x0201010100010100, 0x0201010100010101, 0x0201010100010102, 0x0201010100010201,
+ 0x0201010100020001, 0x0201010100020101, 0x0201010100020201, 0x0201010100020202,
+ 0x0201010101000001, 0x0201010101000100, 0x0201010101000101, 0x0201010101000102,
+ 0x0201010101000201, 0x0201010101010000, 0x0201010101010001, 0x0201010101010002,
+ 0x0201010101010100, 0x0201010101010101, 0x0201010101010102, 0x0201010101010200,
+ 0x0201010101010201, 0x0201010101010202, 0x0201010101020001, 0x0201010101020100,
+ 0x0201010101020101, 0x0201010101020102, 0x0201010101020201, 0x0201010102000001,
+ 0x0201010102000101, 0x0201010102000200, 0x0201010102010001, 0x0201010102010002,
+ 0x0201010102010100, 0x0201010102010101, 0x0201010102010102, 0x0201010102010201,
+ 0x0201010102010202, 0x0201010102020000, 0x0201010102020002, 0x0201010102020101,
+ 0x0201010102020200, 0x0201010102020202, 0x0201010200000001, 0x0201010200000100,
+ 0x0201010200010000, 0x0201010200010101, 0x0201010200010201, 0x0201010200020000,
+ 0x0201010200020102, 0x0201010200020201, 0x0201010201000101, 0x0201010201000200,
+ 0x0201010201000201, 0x0201010201010001, 0x0201010201010002, 0x0201010201010101,
+ 0x0201010201010102, 0x0201010201010201, 0x0201010201020101, 0x0201010201020200,
+ 0x0201010202000002, 0x0201010202000100, 0x0201010202000201, 0x0201010202000202,
+ 0x0201010202010002, 0x0201010202010100, 0x0201010202010101, 0x0201010202020100,
+ 0x0201010202020102, 0x0201010202020201, 0x0201020000000101, 0x0201020000010102,
+ 0x0201020000010201, 0x0201020000020101, 0x0201020001000001, 0x0201020001000102,
+ 0x0201020001010000, 0x0201020001010002, 0x0201020001010101, 0x0201020001010102,
+ 0x0201020001010202, 0x0201020001020100, 0x0201020001020101, 0x0201020002000101,
+ 0x0201020002010001, 0x0201020002010102, 0x0201020002010201, 0x0201020002020101,
+ 0x0201020100000100, 0x0201020100000102, 0x0201020100000201, 0x0201020100010000,
+ 0x0201020100010002, 0x0201020100010101, 0x0201020100010200, 0x0201020100010202,
+ 0x0201020100020000, 0x0201020100020001, 0x0201020100020100, 0x0201020100020102,
+ 0x0201020101000000, 0x0201020101000002, 0x0201020101000101, 0x0201020101000200,
+ 0x0201020101000202, 0x0201020101010001, 0x0201020101010100, 0x0201020101010101,
+ 0x0201020101010102, 0x0201020101010201, 0x0201020101020002, 0x0201020101020101,
+ 0x0201020101020102, 0x0201020101020202, 0x0201020102000001, 0x0201020102000100,
+ 0x0201020102010000, 0x0201020102010002, 0x0201020102010101, 0x0201020102010202,
+ 0x0201020102020001, 0x0201020102020102, 0x0201020200000101, 0x0201020200010101,
+ 0x0201020200020101, 0x0201020201000100, 0x0201020201000102, 0x0201020201000201,
+ 0x0201020201010000, 0x0201020201010101, 0x0201020201010200, 0x0201020201020001,
+ 0x0201020202000101, 0x0201020202010001, 0x0201020202010100, 0x0201020202010101,
+ 0x0201020202010102, 0x0202000000000000, 0x0202000000000002, 0x0202000000000200,
+ 0x0202000000000202, 0x0202000000010101, 0x0202000000020000, 0x0202000000020002,
+ 0x0202000000020200, 0x0202000000020202, 0x0202000001000101, 0x0202000001010001,
+ 0x0202000001010100, 0x0202000001010102, 0x0202000001010201, 0x0202000002000000,
+ 0x0202000002000002, 0x0202000002000200, 0x0202000002000202, 0x0202000002010101,
+ 0x0202000002020000, 0x0202000002020002, 0x0202000002020200, 0x0202000002020202,
+ 0x0202000100000101, 0x0202000100000201, 0x0202000100010001, 0x0202000100010100,
+ 0x0202000100010102, 0x0202000100010201, 0x0202000100010202, 0x0202000101000102,
+ 0x0202000101000201, 0x0202000101010001, 0x0202000101010101, 0x0202000101010200,
+ 0x0202000101010202, 0x0202000101020001, 0x0202000101020100, 0x0202000102000101,
+ 0x0202000102010000, 0x0202000102010002, 0x0202000102010102, 0x0202000102010201,
+ 0x0202000200000002, 0x0202000200000200, 0x0202000200000202, 0x0202000200010000,
+ 0x0202000200010201, 0x0202000200020002, 0x0202000200020200, 0x0202000200020202,
+ 0x0202000201000101, 0x0202000201010001, 0x0202000201010102, 0x0202000201010201,
+ 0x0202000201020101, 0x0202000202000000, 0x0202000202000002, 0x0202000202000200,
+ 0x0202000202000202, 0x0202000202010101, 0x0202000202020000, 0x0202000202020002,
+ 0x0202000202020200, 0x0202000202020202, 0x0202010000010201, 0x0202010000020101,
+ 0x0202010001000001, 0x0202010001000100, 0x0202010001010000, 0x0202010001010100,
+ 0x0202010001010101, 0x0202010001010200, 0x0202010001010202, 0x0202010001020001,
+ 0x0202010001020101, 0x0202010001020102, 0x0202010001020200, 0x0202010001020201,
+ 0x0202010002000101, 0x0202010100000102, 0x0202010100000201, 0x0202010100010000,
+ 0x0202010100010002, 0x0202010100010101, 0x0202010100010200, 0x0202010100020102,
+ 0x0202010100020201, 0x0202010101000002, 0x0202010101000101, 0x0202010101010001,
+ 0x0202010101010100, 0x0202010101010101, 0x0202010101010102, 0x0202010101010201,
+ 0x0202010101020101, 0x0202010101020202, 0x0202010102000001, 0x0202010102000100,
+ 0x0202010102000101, 0x0202010102000102, 0x0202010102000201, 0x0202010102010002,
+ 0x0202010102010101, 0x0202010102010200, 0x0202010200000101, 0x0202010200010001,
+ 0x0202010200010102, 0x0202010200010202, 0x0202010200020001, 0x0202010200020101,
+ 0x0202010201000100, 0x0202010201000102, 0x0202010201000202, 0x0202010201010002,
+ 0x0202010201010101, 0x0202010201010102, 0x0202010201010200, 0x0202010201020000,
+ 0x0202010201020002, 0x0202010202000102, 0x0202010202010000, 0x0202010202010101,
+ 0x0202010202010102, 0x0202010202010201, 0x0202010202020001, 0x0202010202020100,
+ 0x0202010202020102, 0x0202020000000000, 0x0202020000000002, 0x0202020000000200,
+ 0x0202020000000202, 0x0202020000020000, 0x0202020000020002, 0x0202020000020200,
+ 0x0202020000020202, 0x0202020001010001, 0x0202020001010100, 0x0202020001010102,
+ 0x0202020001010201, 0x0202020002000000, 0x0202020002000002, 0x0202020002000200,
+ 0x0202020002000202, 0x0202020002010101, 0x0202020002020000, 0x0202020002020002,
+ 0x0202020002020200, 0x0202020002020202, 0x0202020100000101, 0x0202020100010100,
+ 0x0202020100010201, 0x0202020100020001, 0x0202020100020101, 0x0202020101000001,
+ 0x0202020101010000, 0x0202020101010101, 0x0202020101010202, 0x0202020101020001,
+ 0x0202020101020102, 0x0202020101020201, 0x0202020102010000, 0x0202020102010102,
+ 0x0202020200000000, 0x0202020200000002, 0x0202020200000200, 0x0202020200000202,
+ 0x0202020200020000, 0x0202020200020002, 0x0202020200020200, 0x0202020200020202,
+ 0x0202020201010001, 0x0202020201010100, 0x0202020201010102, 0x0202020202000000,
+ 0x0202020202000002, 0x0202020202000200, 0x0202020202000202, 0x0202020202010101,
+ 0x0202020202020000, 0x0202020202020002, 0x0202020202020200, 0x0202020202020202,
+};
+#endif
+
#ifndef HAVE_FANCY_SIMD
const uint64_t keven_signs[128] = {
0x0101010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0x010101010101ffff,
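The iq1s_grid_us table added above packs eight ternary weights per 64-bit entry, one byte each, biased from {-1,0,+1} to {0,1,2} so entries can feed unsigned-by-signed dot products (_mm256_maddubs_epi16 / _mm256_dpbusd_epi32 below); the -1 bias is folded back in through the kernel's delta correction term. A minimal decoding sketch, assuming the same value convention as the existing signed iq1s_grid table:

    #include <cstdint>
    // Hedged sketch: recover the eight signed weights of one grid entry.
    // Bytes of iq1s_grid_us are in {0,1,2}; the actual weight is byte - 1.
    static inline void decode_iq1s_grid_us(uint64_t entry, int8_t w[8]) {
        for (int j = 0; j < 8; ++j)
            w[j] = (int8_t)((entry >> 8*j) & 0xff) - 1;  // {0,1,2} -> {-1,0,+1}
    }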
@@ -2745,6 +3264,92 @@ static void mul_mat_q4_0_r4_q8_1_avx2(int n, const void * vx, size_t bx, const D
}
}
+template <int nrc_y>
+static void mul_mat_iq1_s_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
+ GGML_ASSERT(nrc_x%4 == 0);
+ Q8<nrc_y, block_q8_1_x4> q8(info);
+ int nb = n / 32;
+ GGML_ASSERT(nb%4 == 0);
+ __m256i qx[4];
+ __m256 acc[nrc_y] = {};
+ auto m1 = _mm256_set1_epi16(1);
+ auto ms = _mm_set1_epi16(-32768);
+ float d8[8*nrc_y];
+ union { __m256i vec; uint16_t val[16]; } helper;
+ struct aux_iq1_s_r4 {
+ uint8_t qs[16];
+ uint64_t qh;
+ };
+ for (int ix = 0; ix < nrc_x; ix += 4) {
+ auto dptr = (const ggml_half *)((const char *)vx + ix*bx);
+ auto d1 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)dptr));
+ auto x = (const aux_iq1_s_r4 *)(dptr + 4);
+ for (int ib = 0; ib < nb/4; ++ib) {
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ _mm256_storeu_ps(d8 + 8*iy, _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q8.y[iy][ib].d)));
+ }
+ for (int k = 0; k < 4; ++k) {
+ auto idxh = _mm256_set1_epi64x(x[4*ib+k].qh);
+ auto sas = _mm256_castsi256_si128(idxh);
+ auto scales4 = _mm_and_si128(_mm_srli_epi16(sas, 12), _mm_set1_epi16(7));
+ scales4 = _mm_or_si128(_mm_slli_epi16(scales4, 1), _mm_set1_epi16(1));
+ auto signs = _mm_or_si128(_mm_cmpeq_epi16(_mm_and_si128(sas, ms), ms), _mm256_castsi256_si128(m1));
+ signs = _mm_add_epi16(_mm_set1_epi16(-8), signs);
+ auto delta4 = _mm_mul_ps(_mm_set1_ps(0.0625f), _mm_cvtepi32_ps(_mm_cvtepi16_epi32(
+ _mm_mullo_epi16(scales4, signs))));
+ auto delta = _mm256_set_m128(delta4, delta4);
+ scales4 = _mm_unpacklo_epi16(scales4, scales4); // 0,0, 1,1, 2,2, 3,3
+ auto scales = MM256_SET_M128I(scales4, scales4);
+ auto idxl = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)x[4*ib+k].qs));
+ idxh = _mm256_sllv_epi64(idxh, _mm256_set_epi64x(0, 2, 5, 8));
+ idxh = _mm256_srlv_epi64(idxh, _mm256_set_epi64x(1, 0, 0, 0));
+ helper.vec = _mm256_or_si256(idxl, _mm256_and_si256(_mm256_set1_epi16(0x0700), idxh));
+ qx[0] = _mm256_set_epi64x(iq1s_grid_us[helper.val[ 9]], iq1s_grid_us[helper.val[ 8]],
+ iq1s_grid_us[helper.val[ 1]], iq1s_grid_us[helper.val[ 0]]);
+ qx[1] = _mm256_set_epi64x(iq1s_grid_us[helper.val[13]], iq1s_grid_us[helper.val[12]],
+ iq1s_grid_us[helper.val[ 5]], iq1s_grid_us[helper.val[ 4]]);
+ qx[2] = _mm256_set_epi64x(iq1s_grid_us[helper.val[11]], iq1s_grid_us[helper.val[10]],
+ iq1s_grid_us[helper.val[ 3]], iq1s_grid_us[helper.val[ 2]]);
+ qx[3] = _mm256_set_epi64x(iq1s_grid_us[helper.val[15]], iq1s_grid_us[helper.val[14]],
+ iq1s_grid_us[helper.val[ 7]], iq1s_grid_us[helper.val[ 6]]);
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ib].qs + k);
+#ifdef HAVE_FANCY_SIMD
+ // 0,0, 1,1, 0,0, 1,1 as int32_t
+ auto sumi1 = _mm256_dpbusd_epi32(_mm256_dpbusd_epi32(_mm256_setzero_si256(),
+ qx[0], _mm256_shuffle_epi32(y, 0x44)), qx[1], _mm256_shuffle_epi32(y, 0xee));
+ // 2,2, 3,3, 2,2, 3,3 as int32_t
+ auto sumi2 = _mm256_dpbusd_epi32(_mm256_dpbusd_epi32(_mm256_setzero_si256(),
+ qx[2], _mm256_shuffle_epi32(y, 0x44)), qx[3], _mm256_shuffle_epi32(y, 0xee));
+ auto sumi = _mm256_packs_epi32(sumi1, sumi2);
+#else
+ // 4 x row 0, 4 x row 1, 4 x row 0, 4 x row 1
+ auto sumi1 = _mm256_add_epi16(_mm256_maddubs_epi16(qx[0], _mm256_shuffle_epi32(y, 0x44)),
+ _mm256_maddubs_epi16(qx[1], _mm256_shuffle_epi32(y, 0xee)));
+ // 4 x row 2, 4 x row 3, 4 x row 2, 4 x row 3
+ auto sumi2 = _mm256_add_epi16(_mm256_maddubs_epi16(qx[2], _mm256_shuffle_epi32(y, 0x44)),
+ _mm256_maddubs_epi16(qx[3], _mm256_shuffle_epi32(y, 0xee)));
+ // 0,0, 1,1, 0,0, 1,1 as int32_t
+ sumi1 = _mm256_madd_epi16(m1, sumi1);
+ // 2,2, 3,3, 2,2, 3,3 as int32_t
+ sumi2 = _mm256_madd_epi16(m1, sumi2);
+ // 0,0, 1,1, 2,2, 3,3, 0,0, 1,1, 2,2, 3,3 as int16_t
+ auto sumi = _mm256_packs_epi32(sumi1, sumi2);
+#endif
+ sumi = _mm256_madd_epi16(scales, sumi);
+ acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d8[8*iy+k+0]), _mm256_cvtepi32_ps(sumi), acc[iy]);
+ acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d8[8*iy+k+4]), delta, acc[iy]);
+ }
+ }
+ }
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto sumf = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1));
+ info.store(ix, iy, _mm_mul_ps(d1, sumf));
+ acc[iy] = _mm256_setzero_ps();
+ }
+ }
+}
+
#ifdef HAVE_FANCY_SIMD
template <int nrc_y>
static void mul_mat_q4_0_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
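For reference, what the iq1_s_r4 kernel above computes per 32-weight block of one interleaved row can be written out in scalar form. This is a hedged sketch reconstructed from dequantize_row_iq1_s_r4 later in this patch (iq1s_grid is the existing 2048-entry IQ1_S codebook and 0.125 stands in for IQ1S_DELTA), not the shipped code path:

    #include <cstdint>
    extern const uint64_t iq1s_grid[2048];  // existing IQ1_S codebook in ggml
    // d: per-row fp16 super-scale dptr[r]; y: 32 int8 activations scaled by dy.
    static float iq1_s_r4_block_dot(const uint8_t qs[16], const uint16_t qh[4],
                                    int r, float d, const int8_t y[32], float dy) {
        const float dl    = d * (2*((qh[r] >> 12) & 7) + 1);      // 3-bit block scale
        const float shift = (qh[r] & 0x8000) ? -0.125f : 0.125f;  // +/- IQ1S_DELTA
        int sumi = 0, sumy = 0;
        for (int i = 0; i < 4; ++i) {                             // 4 groups of 8
            const int idx = qs[4*i + r] | (((qh[r] >> 3*i) & 7) << 8);
            const int8_t * g = (const int8_t *)(iq1s_grid + idx);
            for (int j = 0; j < 8; ++j) { sumi += g[j]*y[8*i + j]; sumy += y[8*i + j]; }
        }
        return dy * dl * (sumi + shift*sumy);
    }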
@@ -7042,14 +7647,14 @@ struct Q8_0_x4_Unpacker_512 {
auto scales = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)x[i].d));
for (int j = 0; j < 4; ++j) {
qx[j] = _mm256_loadu_si256((const __m256i *)x[i].qs + j);
- qx[j] = _mm256_xor_si256(qx[j], _mm256_set1_epi8(0x80));
+ qx[j] = _mm256_xor_si256(qx[j], _mm256_set1_epi8(-128));
}
return _mm256_set_m128(_mm_mul_ps(scales, min), scales);
}
inline auto set_block(int i) {
auto q8 = (const block_q8_0 *)(x + i);
qx[0] = _mm256_loadu_si256((const __m256i *)q8->qs);
- qx[0] = _mm256_xor_si256(qx[0], _mm256_set1_epi8(0x80));
+ qx[0] = _mm256_xor_si256(qx[0], _mm256_set1_epi8(-128));
float d = GGML_FP16_TO_FP32(q8->d);
return std::make_pair(d, -128.f*d);
}
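The 0x80 -> -128 changes above are warning fixes: both spell the same byte, but as an int literal 0x80 is 128, which does not fit in signed char, so _mm256_set1_epi8(0x80) can draw a constant-conversion/overflow diagnostic while -128 cannot. A one-line check of the equivalence:

    #include <cstdint>
    // -128 and 0x80 denote the same bit pattern in one byte.
    static_assert(static_cast<uint8_t>(static_cast<int8_t>(-128)) == 0x80u,
                  "-128 and 0x80 are the same byte");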
@@ -8202,6 +8807,21 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
mm.funcs[7] = mul_mat_q8_0_r4_q8_1<8>;
expected_typeB = GGML_TYPE_Q8_1_X4;
break;
+ case GGML_TYPE_IQ1_S_R4:
+ assert (ne00 % QK4_NL == 0);
+ mm.funcs[0] = mul_mat_iq1_s_r4_q8_1<1>;
+ mm.funcs[1] = mul_mat_iq1_s_r4_q8_1<2>;
+ mm.funcs[2] = mul_mat_iq1_s_r4_q8_1<3>;
+ mm.funcs[3] = mul_mat_iq1_s_r4_q8_1<4>;
+ mm.funcs[4] = mul_mat_iq1_s_r4_q8_1<5>;
+ mm.funcs[5] = mul_mat_iq1_s_r4_q8_1<6>;
+ mm.funcs[6] = mul_mat_iq1_s_r4_q8_1<7>;
+ mm.funcs[7] = mul_mat_iq1_s_r4_q8_1<8>;
+#ifdef HAVE_FANCY_SIMD
+ mm.func16 = mul_mat_iq1_s_r4_q8_1<16>;
+#endif
+ expected_typeB = GGML_TYPE_Q8_1_X4;
+ break;
default:
return false;
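The new case wires IQ1_S_R4 into the same dispatch table as the other row-interleaved types. A hedged caller sketch; the funcs[Ny-1] indexing is an assumption inferred from funcs[0..7] holding the <1>..<8> instantiations above, with func16 as an optional wide path on AVX512:

    // Hypothetical usage; MulMat and DataInfo are the surrounding framework's types.
    // MulMat mm;
    // if (MulMat::prepare(GGML_TYPE_IQ1_S_R4, GGML_TYPE_Q8_1_X4, ne00, mm, Ny))
    //     mm.funcs[Ny - 1](ne00, A, row_stride, info, nrc_x);  // Ny in 1..8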
@@ -11079,6 +11699,78 @@ static void mul_mat_iq2_xs_r4_q8_k(int n, const void * vx, size_t bx, const Data
}
template <int nrc_y>
+static void mul_mat_iq1_s_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
+ GGML_ASSERT(nrc_x%4 == 0);
+ Q8<nrc_y, block_q8_1_x4> q8(info);
+ int nb = n / 32;
+ GGML_ASSERT(nb%4 == 0);
+ int8x16_t qx[8];
+ float32x4_t acc[nrc_y] = {};
+ auto ms = vdup_n_u16(0x8000);
+ float d8[8*nrc_y];
+ for (int ix = 0; ix < nrc_x; ix += 4) {
+ auto dptr = (const ggml_half *)((const char *)vx + ix*bx);
+ auto d1 = vcvt_f32_f16(vld1_f16((const float16_t *)dptr));
+ auto x = (const block_iq1_s_r4 *)(dptr + 4);
+ for (int ib = 0; ib < nb/4; ++ib) {
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto scales = vld1q_f16((const float16_t *)q8.y[iy][ib].d);
+ vst1q_f32(d8+8*iy+0, vcvt_f32_f16(vget_low_f16(scales)));
+ vst1q_f32(d8+8*iy+4, vcvt_f32_f16(vget_high_f16(scales)));
+ }
+ for (int k = 0; k < 4; ++k) {
+ auto sas = vld1_u16(x[4*ib+k].qh);
+ auto scales4 = vand_u16(vshr_n_u16(sas, 12), vdup_n_u16(7));
+ scales4 = vorr_u16(vshl_n_u16(scales4, 1), vdup_n_u16(1));
+ auto signs = vorr_u16(vceq_u16(vand_u16(sas, ms), ms), vdup_n_u16(1));
+ auto delta4 = vmulq_f32(vdupq_n_f32(IQ1S_DELTA), vcvtq_f32_s32(vmull_s16(vreinterpret_s16_u16(signs), vreinterpret_s16_u16(scales4))));
+ qx[0] = vreinterpretq_s8_u64(uint64x2_t{iq1s_grid[x[4*ib+k].qs[ 0] | ((x[4*ib+k].qh[0] << 8) & 0x0700)],
+ iq1s_grid[x[4*ib+k].qs[ 1] | ((x[4*ib+k].qh[1] << 8) & 0x0700)]});
+ qx[1] = vreinterpretq_s8_u64(uint64x2_t{iq1s_grid[x[4*ib+k].qs[ 8] | ((x[4*ib+k].qh[0] << 2) & 0x0700)],
+ iq1s_grid[x[4*ib+k].qs[ 9] | ((x[4*ib+k].qh[1] << 2) & 0x0700)]});
+ qx[2] = vreinterpretq_s8_u64(uint64x2_t{iq1s_grid[x[4*ib+k].qs[ 4] | ((x[4*ib+k].qh[0] << 5) & 0x0700)],
+ iq1s_grid[x[4*ib+k].qs[ 5] | ((x[4*ib+k].qh[1] << 5) & 0x0700)]});
+ qx[3] = vreinterpretq_s8_u64(uint64x2_t{iq1s_grid[x[4*ib+k].qs[12] | ((x[4*ib+k].qh[0] >> 1) & 0x0700)],
+ iq1s_grid[x[4*ib+k].qs[13] | ((x[4*ib+k].qh[1] >> 1) & 0x0700)]});
+ qx[4] = vreinterpretq_s8_u64(uint64x2_t{iq1s_grid[x[4*ib+k].qs[ 2] | ((x[4*ib+k].qh[2] << 8) & 0x0700)],
+ iq1s_grid[x[4*ib+k].qs[ 3] | ((x[4*ib+k].qh[3] << 8) & 0x0700)]});
+ qx[5] = vreinterpretq_s8_u64(uint64x2_t{iq1s_grid[x[4*ib+k].qs[10] | ((x[4*ib+k].qh[2] << 2) & 0x0700)],
+ iq1s_grid[x[4*ib+k].qs[11] | ((x[4*ib+k].qh[3] << 2) & 0x0700)]});
+ qx[6] = vreinterpretq_s8_u64(uint64x2_t{iq1s_grid[x[4*ib+k].qs[ 6] | ((x[4*ib+k].qh[2] << 5) & 0x0700)],
+ iq1s_grid[x[4*ib+k].qs[ 7] | ((x[4*ib+k].qh[3] << 5) & 0x0700)]});
+ qx[7] = vreinterpretq_s8_u64(uint64x2_t{iq1s_grid[x[4*ib+k].qs[14] | ((x[4*ib+k].qh[2] >> 1) & 0x0700)],
+ iq1s_grid[x[4*ib+k].qs[15] | ((x[4*ib+k].qh[3] >> 1) & 0x0700)]});
+ auto scales = vreinterpretq_s32_u32(vmovl_u16(scales4));
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ auto sumi1 = vdupq_n_s32(0);
+ auto sumi2 = vdupq_n_s32(0);
+ auto y = vld1_s8_x4(q8.y[iy][ib].qs + 32*k);
+ auto y1 = vcombine_s8(y.val[0], y.val[0]);
+ auto y2 = vcombine_s8(y.val[1], y.val[1]);
+ sumi1 = ggml_vdotq_s32(sumi1, qx[0], y1);
+ sumi2 = ggml_vdotq_s32(sumi2, qx[4], y1);
+ sumi1 = ggml_vdotq_s32(sumi1, qx[2], y2);
+ sumi2 = ggml_vdotq_s32(sumi2, qx[6], y2);
+ y1 = vcombine_s8(y.val[2], y.val[2]);
+ y2 = vcombine_s8(y.val[3], y.val[3]);
+ sumi1 = ggml_vdotq_s32(sumi1, qx[1], y1);
+ sumi2 = ggml_vdotq_s32(sumi2, qx[5], y1);
+ sumi1 = ggml_vdotq_s32(sumi1, qx[3], y2);
+ sumi2 = ggml_vdotq_s32(sumi2, qx[7], y2);
+ auto sumi = vmulq_s32(scales, vpaddq_s32(sumi1, sumi2));
+ acc[iy] = vfmaq_f32(acc[iy], vdupq_n_f32(d8[8*iy+k+0]), vcvtq_f32_s32(sumi));
+ acc[iy] = vfmaq_f32(acc[iy], vdupq_n_f32(d8[8*iy+k+4]), delta4);
+ }
+ }
+ }
+ for (int iy = 0; iy < nrc_y; ++iy) {
+ info.store(ix, iy, vmulq_f32(d1, acc[iy]));
+ acc[iy] = vdupq_n_f32(0.f);
+ }
+ }
+}
+
+template <int nrc_y>
static void mul_mat_iq2_s_r4_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
GGML_ASSERT(nrc_x%4 == 0);
Q8<nrc_y, block_q8_K> q8(info);
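The shift/mask constants in the NEON kernel above (<< 8, << 5, << 2, >> 1, all masked with 0x0700) and the sllv/srlv pair in the AVX2 version reconstruct the same 11-bit grid index: per the packing in quantize_iq1_s_r4 below, group i of row k keeps its low 8 bits in qs[4*i + k] and its 3 high bits at bit 3*i of qh[k]. A scalar sketch of the reconstruction:

    #include <cstdint>
    // Equivalent of e.g. "(qh[0] << 8) & 0x0700" (i = 0) or ">> 1" (i = 3) above.
    static inline uint16_t iq1s_r4_index(const uint8_t qs[16], const uint16_t qh[4],
                                         int i, int k) {
        return qs[4*i + k] | (uint16_t)(((qh[k] >> 3*i) & 7) << 8);
    }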
@@ -12697,6 +13389,11 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& m, int /*Ny*/) {
m.func16 = mul_mat_iq2_s_r4_q8_k<16>;
expected_Btype = GGML_TYPE_Q8_K;
break;
+ case GGML_TYPE_IQ1_S_R4:
+ SET_MUL_MAT_FUNCTIONS(m, mul_mat_iq1_s_r4_q8_1);
+ m.func16 = mul_mat_iq1_s_r4_q8_1<16>;
+ expected_Btype = GGML_TYPE_Q8_1_X4;
+ break;
case GGML_TYPE_IQ3_XXS_R4:
SET_MUL_MAT_FUNCTIONS(m, mul_mat_iq3_xxs_r4_q8_k);
m.func16 = mul_mat_iq3_xxs_r4_q8_k<16>;
@@ -12995,7 +13692,7 @@ struct F16 {
using Data = float16x8_t;
constexpr static int block_size = 8;
//constexpr static int num_registers = 32;
- constexpr static int q_step = 8;
+ //constexpr static int q_step = 8;
static inline Data zero() { return vdupq_n_f16(0); }
static inline Data load(const char * ptr, int i) { return vld1q_f16((const float16_t *)ptr + block_size*i); }
static inline Data load(const float16_t * ptr, int i) { return vld1q_f16(ptr + block_size*i); }
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index c1e7771f..a8553b43 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -6087,6 +6087,112 @@ void vec_dot_iq3_s_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t
GGML_UNUSED(by);
}
+void quantize_row_iq1_s_r4_ref(const float * x, block_iq1_s_r4 * y, int64_t k) {
+ quantize_iq1_s_r4(x, y, 4, k/4, nullptr);
+}
+
+void quantize_row_iq1_s_r4(const float * x, void * y, int64_t k) {
+ quantize_iq1_s_r4(x, y, 4, k/4, nullptr);
+}
+
+size_t quantize_iq1_s_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+ constexpr int kBlockSize = 32;
+ GGML_ASSERT(nrows%4 == 0);
+ GGML_ASSERT(n_per_row%kBlockSize == 0);
+ int nblock = n_per_row/kBlockSize;
+ float weight[kBlockSize];
+ int8_t L[kBlockSize];
+ float pairs[2*kBlockSize];
+ float sumx[kBlockSize+1], sumw[kBlockSize+1];
+ float max[4];
+ uint16_t index[4];
+ int shift;
+ float invd[4];
+ std::vector<float> scales(4*nblock);
+ auto row_size = ggml_row_size(GGML_TYPE_IQ1_S_R4, n_per_row);
+ char * cy = (char *)dst;
+ for (int row = 0; row < nrows; row += 4) {
+ ggml_half * dptr = (ggml_half *)cy;
+ auto y = (block_iq1_s_r4 *)(dptr + 4);
+ for (int k = 0; k < 4; ++k) max[k] = 0;
+ for (int ibl = 0; ibl < nblock; ++ibl) {
+ if (imatrix) {
+ for (int j = 0; j < kBlockSize; ++j) weight[j] = imatrix[kBlockSize*ibl + j];
+ }
+ for (int k = 0; k < 4; ++k) {
+ auto xb = src + k*n_per_row + kBlockSize*ibl;
+ float sumx2 = 0;
+ for (int j = 0; j < kBlockSize; ++j) sumx2 += xb[j]*xb[j];
+ float sigma2 = 1.5f*sumx2/kBlockSize;
+ if (imatrix) {
+ for (int j = 0; j < kBlockSize; ++j) weight[j] = imatrix[kBlockSize*ibl + j]*sqrt(sigma2 + xb[j]*xb[j]);
+ } else {
+ for (int j = 0; j < kBlockSize; ++j) weight[j] = sqrt(sigma2 + xb[j]*xb[j]);
+ }
+ iq1s_process_1block(kBlockSize, xb, weight, L, scales.data() + 4*ibl + k, index, &shift, pairs, sumx, sumw);
+ max[k] = std::max(max[k], scales[4*ibl+k]);
+ uint16_t h = 0;
+ for (int i = 0; i < 4; ++i) {
+ y[ibl].qs[4*i + k] = index[i] & 255;
+ h |= (index[i] >> 8) << 3*i;
+ }
+ if (shift < 0) h |= 0x8000;
+ y[ibl].qh[k] = h;
+ }
+ }
+ for (int k = 0; k < 4; ++k) {
+ dptr[k] = GGML_FP32_TO_FP16(1.0625f*max[k]/15);
+ invd[k] = max[k] ? 15/max[k] : 0.f;
+ }
+ for (int ibl = 0; ibl < nblock; ++ibl) {
+ for (int k = 0; k < 4; ++k) {
+ int ls = nearest_int(0.5f*(scales[4*ibl+k]*invd[k] - 1));
+ ls = std::max(0, std::min(7, ls));
+ y[ibl].qh[k] |= (ls << 12);
+ }
+ }
+ cy += 4*row_size;
+ src += 4*n_per_row;
+ }
+ return nrows*row_size;
+}
+
+void dequantize_row_iq1_s_r4(const block_iq1_s_r4 * x, float * y, int64_t n) {
+ auto dptr = (const ggml_half *)x;
+ x = (const block_iq1_s_r4 *)(dptr + 4);
+ float d[4];
+ for (int k = 0; k < 4; ++k) d[k] = GGML_FP16_TO_FP32(dptr[k]);
+ int n_per_row = n/4;
+ GGML_ASSERT(n_per_row%32 == 0);
+ int nblock = n_per_row/32;
+ float * yk[4];
+ for (int k = 0; k < 4; ++k) yk[k] = y + k*n_per_row;
+ for (int ib = 0; ib < nblock; ++ib) {
+ for (int k = 0; k < 4; ++k) {
+ float shift = x[ib].qh[k] & 0x8000 ? -IQ1S_DELTA : IQ1S_DELTA;
+ float dl = d[k]*(2*((x[ib].qh[k] >> 12) & 7) + 1);
+ for (int i = 0; i < 4; ++i) {
+ auto idx = x[ib].qs[4*i+k] | (((x[ib].qh[k] >> 3*i) & 7) << 8);
+ auto grid = (const int8_t *)(iq1s_grid + idx);
+ for (int j = 0; j < 8; ++j) yk[k][32*ib + 8*i + j] = dl*(grid[j] + shift);
+ }
+ }
+ }
+}
+
+void vec_dot_iq1_s_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
+#if GGML_USE_IQK_MULMAT
+ if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ1_S_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) {
+ return;
+ }
+#endif
+ GGML_ASSERT(n%QK4_NL == 0);
+ GGML_ASSERT(nrc == 1);
+ GGML_UNUSED(bs);
+ GGML_UNUSED(bx);
+ GGML_UNUSED(by);
+}
+
//================================================
namespace {
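The block layout established by quantize_iq1_s_r4 above (16 qs bytes plus 4 uint16 qh per block of 4 rows x 32 weights, with one fp16 super-scale per row) is what yields the 1.5 bpw figure in the type description. A quick hedged check, with n_per_row chosen only for illustration:

    #include <cstdio>
    int main() {
        const int block_bits    = (16 + 4*2) * 8;  // qs + qh = 24 bytes = 192 bits
        const int block_weights = 4 * 32;          // 4 interleaved rows x 32 weights
        const int n_per_row     = 4096;            // hypothetical row length
        const double bpw = (double)block_bits/block_weights   // = 1.5 exactly
                         + 4.0*16/(4.0*n_per_row);            // fp16 d, amortized
        printf("%.4f bpw\n", bpw);                 // ~1.5039 at n_per_row = 4096
        return 0;
    }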
diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h
index 1a991787..9a3c5dc6 100644
--- a/ggml/src/iqk/iqk_quantize.h
+++ b/ggml/src/iqk/iqk_quantize.h
@@ -199,6 +199,12 @@ size_t quantize_iq3_s_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT d
void dequantize_row_iq3_s_r4(const block_iq3_s_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void vec_dot_iq3_s_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void quantize_row_iq1_s_r4_ref(const float * GGML_RESTRICT x, block_iq1_s_r4 * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq1_s_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+size_t quantize_iq1_s_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+void dequantize_row_iq1_s_r4(const block_iq1_s_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void vec_dot_iq1_s_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
void quantize_row_q8_k_r8_ref(const float * GGML_RESTRICT x, block_q8_k_r8 * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_k_r8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
size_t quantize_q8_k_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
diff --git a/include/llama.h b/include/llama.h
index c21671c6..0f6d15ac 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -192,6 +192,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 = 219, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ2_XS_R4 = 220, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 = 223, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_IQ1_S_R4 = 224, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ4_NL_R4 = 225, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ3_S_R4 = 226, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ2_M_R4 = 229, // except 1d tensors
diff --git a/src/llama.cpp b/src/llama.cpp
index 570c056c..943b945a 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3954,6 +3954,7 @@ struct llama_model_loader {
case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
case GGML_TYPE_IQ3_XXS_R4: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4; break;
case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
+ case GGML_TYPE_IQ1_S_R4: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S_R4; break;
case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break;
case GGML_TYPE_IQ1_BN: ftype = LLAMA_FTYPE_MOSTLY_IQ1_BN; break;
case GGML_TYPE_IQ2_BN: ftype = LLAMA_FTYPE_MOSTLY_IQ2_BN; break;
@@ -4688,6 +4689,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4: return "IQ3_XXS_R4 - 3.0625 bpw";
case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ1_S_R4: return "IQ1_S_R4 - 1.5 bpw";
case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
case LLAMA_FTYPE_MOSTLY_IQ4_NL_R4:return "IQ4_NL_R4 - 4.5 bpw";
@@ -15966,7 +15968,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K ||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_K_R4 ||
- ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M_R4) {
+ ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 ||
+ ftype == LLAMA_FTYPE_MOSTLY_IQ2_M_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4) {
new_type = !qs.has_output ? GGML_TYPE_IQ4_K : GGML_TYPE_Q5_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4) {
@@ -15987,7 +15990,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
} else {
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
- ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4) {
+ ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4 ||
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4) {
new_type = GGML_TYPE_Q2_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M_R4) {
@@ -16064,6 +16068,41 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
new_type = GGML_TYPE_BF16;
}
}
+ } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4) {
+ if (name.find("attn_v.weight") != std::string::npos) {
+ if (qs.model.hparams.n_expert >= 4 || qs.model.hparams.n_gqa() >= 4) new_type = GGML_TYPE_IQ4_K_R4;
+ else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_K_R4;
+ else new_type = GGML_TYPE_Q2_K_R4;
+ ++qs.i_attention_wv;
+ }
+ else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k") != std::string::npos) {
+ new_type = GGML_TYPE_Q4_K_R4;
+ }
+ else if (qs.model.hparams.n_expert >= 8 && (name.find("blk.0.ffn_down") != std::string::npos ||
+ name.find("blk.0.ffn_gate") != std::string::npos ||
+ name.find("blk.0.ffn_up") != std::string::npos)) {
+ new_type = GGML_TYPE_IQ3_K_R4;
+ }
+ else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q") != std::string::npos) {
+ new_type = GGML_TYPE_Q4_K_R4;
+ }
+ else if (name.find("attn_qkv.weight") != std::string::npos) {
+ new_type = GGML_TYPE_IQ2_K_R4;
+ }
+ else if (name.find("_shexp.weight") != std::string::npos) {
+ new_type = GGML_TYPE_IQ4_K_R4;
+ }
+ else if (name.find("ffn_down") != std::string::npos) {
+ auto [i_layer, n_layer] = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
+ if (qs.params->ffn_down_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_down_type;
+ else if (i_layer < n_layer/8) {
+ new_type = GGML_TYPE_Q2_K_R4;
+ }
+ ++qs.i_ffn_down;
+ }
+ else if (name.find("attn_output.weight") != std::string::npos) {
+ new_type = qs.model.hparams.n_expert >= 4 ? GGML_TYPE_Q5_K_R4 : GGML_TYPE_IQ2_K_R4;
+ }
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4 ||
@@ -16095,6 +16134,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
new_type = GGML_TYPE_Q5_K;
} else {
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4) new_type = GGML_TYPE_IQ2_K_R4;
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || is_iq2_m) new_type = GGML_TYPE_IQ3_S;
}
}
@@ -16539,6 +16579,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
case LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4: default_type = GGML_TYPE_IQ3_XXS_R4; break;
case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
+ case LLAMA_FTYPE_MOSTLY_IQ1_S_R4: default_type = GGML_TYPE_IQ1_S_R4; break;
case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
case LLAMA_FTYPE_MOSTLY_IQ1_BN: default_type = GGML_TYPE_IQ1_BN; break;
case LLAMA_FTYPE_MOSTLY_IQ2_BN: default_type = GGML_TYPE_IQ2_BN; break;
@@ -16892,6 +16933,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
new_type == GGML_TYPE_IQ2_S ||
new_type == GGML_TYPE_IQ2_S_R4||
new_type == GGML_TYPE_IQ1_S ||
+ new_type == GGML_TYPE_IQ1_S_R4||
(new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
(new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0))) {
LLAMA_LOG_ERROR("\n\n============================================================\n");
@@ -17011,6 +17053,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ3_S;
else chunk_size_multiplier = 4;
}
+ else if (new_type == GGML_TYPE_IQ1_S_R4) {
+ if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ1_S;
+ else chunk_size_multiplier = 4;
+ }
else if (new_type == GGML_TYPE_BF16_R16) {
if (tensor->ne[1] % 16 != 0) new_type = GGML_TYPE_BF16;
else chunk_size_multiplier = 16;
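The fallback above mirrors the other _R4 types: the interleaved layout packs 4 rows per block, so tensors whose row count (ne[1]) is not a multiple of 4 quietly drop back to plain IQ1_S. In sketch form, with an illustrative helper name:

    #include "ggml.h"
    // Hedged restatement of the fallback logic above.
    static ggml_type pick_iq1_s_type(int64_t nrows) {
        return nrows % 4 == 0 ? GGML_TYPE_IQ1_S_R4 : GGML_TYPE_IQ1_S;
    }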