diff options
-rw-r--r-- | examples/quantize/quantize.cpp | 2 | ||||
-rw-r--r-- | ggml/include/ggml.h | 2 | ||||
-rw-r--r-- | ggml/src/ggml-common.h | 6 | ||||
-rw-r--r-- | ggml/src/ggml-quants.c | 289 | ||||
-rw-r--r-- | ggml/src/ggml-quants.h | 5 | ||||
-rw-r--r-- | ggml/src/ggml.c | 27 | ||||
-rw-r--r-- | ggml/src/iqk/iqk_mul_mat.cpp | 703 | ||||
-rw-r--r-- | ggml/src/iqk/iqk_quantize.cpp | 106 | ||||
-rw-r--r-- | ggml/src/iqk/iqk_quantize.h | 6 | ||||
-rw-r--r-- | include/llama.h | 1 | ||||
-rw-r--r-- | src/llama.cpp | 50 |
11 files changed, 1104 insertions, 93 deletions
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 5ffdbc84..1c847e6b 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -29,6 +29,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = { { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", }, { "IQ2_M_R4", LLAMA_FTYPE_MOSTLY_IQ2_M_R4, " 2.7 bpw quantization", }, { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", }, + { "IQ1_S_R4", LLAMA_FTYPE_MOSTLY_IQ1_S_R4, " 1.5 bpw quantization", }, { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, { "IQ1_BN", LLAMA_FTYPE_MOSTLY_IQ1_BN, " 1.62 bpw quantization (Bitnet)", }, { "IQ2_BN", LLAMA_FTYPE_MOSTLY_IQ2_BN, " 2.00 bpw quantization (Bitnet)", }, @@ -510,6 +511,7 @@ int main(int argc, char ** argv) { params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4 || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || + params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4 || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M)) { fprintf(stderr, "\n==========================================================================================================\n"); fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n"); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 5eea7dcd..9668dc32 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -427,6 +427,7 @@ extern "C" { GGML_TYPE_IQ2_XXS_R4= 216, GGML_TYPE_IQ2_XS_R4 = 217, GGML_TYPE_IQ3_XXS_R4= 218, + GGML_TYPE_IQ1_S_R4 = 219, GGML_TYPE_IQ4_NL_R4 = 220, GGML_TYPE_IQ3_S_R4 = 221, GGML_TYPE_IQ2_S_R4 = 222, @@ -510,6 +511,7 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ2_XXS_R4= 215, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_XS_R4 = 216, // except 1d tensors GGML_FTYPE_MOSTLY_IQ3_XXS_R4= 217, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ1_S_R4 = 218, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_NL_R4 = 219, // except 1d tensors GGML_FTYPE_MOSTLY_IQ3_S_R4 = 220, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_S_R4 = 221, // except 1d tensors diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 023b0b63..14813161 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -485,6 +485,12 @@ typedef struct { } block_iq1_s; static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding"); +typedef struct { + uint8_t qs[16]; + uint16_t qh[4]; +} block_iq1_s_r4; +static_assert(sizeof(block_iq1_s_r4) == 24, "wrong iq1_s_r4 block size/padding"); + // 1.75 bpw typedef struct { uint8_t qs[QK_K/8]; // grid index, low 8 bits diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 391d9e2e..3c4711f3 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -13991,6 +13991,105 @@ static int iq1_sort_helper(const void * left, const void * right) { return *l < *r ? -1 : *l > *r ? 1 : 0; } +void iq1s_process_1block(int block_size, const float * xb, const float * weight, int8_t * L, float * the_scale, uint16_t * the_index, int * the_shift, + float * pairs, float * sumx, float * sumw) { + float max = fabsf(xb[0]); + for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i])); + if (max < GROUP_MAX_EPS_IQ1_S) { + *the_scale = 0; + *the_shift = 1; + for (int k = 0; k < block_size/8; ++k) the_index[k] = 0; + return; + } + const int gindex = iq2_data_index(GGML_TYPE_IQ1_S); + const uint64_t * kgrid_q2xs = iq2_data[gindex].grid; + const int * kmap_q2xs = iq2_data[gindex].map; + const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours; + + GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?"); + + const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA}; + const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA}; + + // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem. + // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two + // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights + // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and + // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale + // for each possible and score for each split. + int * idx = (int *)(pairs + 1); + for (int j = 0; j < block_size; ++j) { + pairs[2*j] = xb[j]; + idx[2*j] = j; + } + qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper); + { + sumx[0] = sumw[0] = 0; + for (int j = 0; j < block_size; ++j) { + int i = idx[2*j]; + sumx[j+1] = sumx[j] + weight[i]*xb[i]; + sumw[j+1] = sumw[j] + weight[i]; + } + } + float best_score = -FLT_MIN, scale = max; + int besti1 = -1, besti2 = -1, best_shift = 0; + for (int i1 = 0; i1 <= block_size; ++i1) { + for (int i2 = i1; i2 <= block_size; ++i2) { + float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2]; + float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2]; + if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { + scale = sumqx/sumq2; best_score = scale*sumqx; + besti1 = i1; besti2 = i2; best_shift = 1; + } + sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2]; + sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2]; + if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { + scale = sumqx/sumq2; best_score = scale*sumqx; + besti1 = i1; besti2 = i2; best_shift = -1; + } + } + } + GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0); + for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0; + for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1; + for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2; + if (scale < 0) { + for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j]; + scale = -scale; best_shift = -best_shift; + } + bool all_on_grid = true; + const float * xx = best_shift == 1 ? x_p : x_m; + for (int k = 0; k < block_size/8; ++k) { + uint16_t u = 0; + for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j); + int grid_index = kmap_q2xs[u]; + if (grid_index < 0) { + all_on_grid = false; + const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; + grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S); + GGML_ASSERT(grid_index >= 0); + } + the_index[k] = grid_index; + } + if (!all_on_grid) { + float sumqx = 0, sumq2 = 0; + for (int k = 0; k < block_size/8; ++k) { + const int8_t * pg = (const int8_t *)(kgrid_q2xs + the_index[k]); + for (int j = 0; j < 8; ++j) { + float w = weight[8*k + j]; + float q = xx[(pg[j] - 1)/2]; + sumqx += w*q*xb[8*k+j]; + sumq2 += w*q*q; + } + } + if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2; + } + *the_scale = scale; + *the_shift = best_shift; +} + #define IQ1S_BLOCK_SIZE 32 #define IQ1M_BLOCK_SIZE 16 static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights, @@ -14021,11 +14120,10 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy const int block_size = IQ1S_BLOCK_SIZE; - const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA}; - const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA}; - + //const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA}; + //const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA}; - int * idx = (int *)(pairs + 1); + //int * idx = (int *)(pairs + 1); for (int ibl = 0; ibl < nbl; ++ibl) { @@ -14044,95 +14142,100 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy const float * xb = xbl + block_size*ib; const float * qw = quant_weights + QK_K*ibl + block_size*ib; for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); - float max = fabsf(xb[0]); - for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i])); - if (max < GROUP_MAX_EPS_IQ1_S) { - scales[ib] = 0; - memset(L, 1, block_size); - continue; - } - // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem. - // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two - // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights - // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and - // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale - // for each possible and score for each split. - for (int j = 0; j < block_size; ++j) { - pairs[2*j] = xb[j]; - idx[2*j] = j; - } - qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper); - { - sumx[0] = sumw[0] = 0; - for (int j = 0; j < block_size; ++j) { - int i = idx[2*j]; - sumx[j+1] = sumx[j] + weight[i]*xb[i]; - sumw[j+1] = sumw[j] + weight[i]; - } - } - float best_score = -FLT_MIN, scale = max; - int besti1 = -1, besti2 = -1, best_shift = 0; - for (int i1 = 0; i1 <= block_size; ++i1) { - for (int i2 = i1; i2 <= block_size; ++i2) { - float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2]; - float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2]; - if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { - scale = sumqx/sumq2; best_score = scale*sumqx; - besti1 = i1; besti2 = i2; best_shift = 1; - } - sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2]; - sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2]; - if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { - scale = sumqx/sumq2; best_score = scale*sumqx; - besti1 = i1; besti2 = i2; best_shift = -1; - } - } - } - GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0); - for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0; - for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1; - for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2; - if (scale < 0) { - for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j]; - scale = -scale; best_shift = -best_shift; - } - bool all_on_grid = true; - const float * xx = best_shift == 1 ? x_p : x_m; - for (int k = 0; k < block_size/8; ++k) { - uint16_t u = 0; - for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j); - int grid_index = kmap_q2xs[u]; - if (grid_index < 0) { - all_on_grid = false; - const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; - grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S); - GGML_ASSERT(grid_index >= 0); - } - index[k] = grid_index; - } - if (!all_on_grid) { - float sumqx = 0, sumq2 = 0; - for (int k = 0; k < block_size/8; ++k) { - const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]); - for (int j = 0; j < 8; ++j) { - float w = weight[8*k + j]; - float q = xx[(pg[j] - 1)/2]; - sumqx += w*q*xb[8*k+j]; - sumq2 += w*q*q; - } - } - if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2; - } + int best_shift; + iq1s_process_1block(block_size, xb, weight, L, &scales[ib], index, &best_shift, pairs, sumx, sumw); + +// float max = fabsf(xb[0]); +// for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i])); +// if (max < GROUP_MAX_EPS_IQ1_S) { +// scales[ib] = 0; +// memset(L, 1, block_size); +// continue; +// } +// // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem. +// // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two +// // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights +// // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and +// // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale +// // for each possible and score for each split. +// for (int j = 0; j < block_size; ++j) { +// pairs[2*j] = xb[j]; +// idx[2*j] = j; +// } +// qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper); +// { +// sumx[0] = sumw[0] = 0; +// for (int j = 0; j < block_size; ++j) { +// int i = idx[2*j]; +// sumx[j+1] = sumx[j] + weight[i]*xb[i]; +// sumw[j+1] = sumw[j] + weight[i]; +// } +// } +// float best_score = -FLT_MIN, scale = max; +// int besti1 = -1, besti2 = -1, best_shift = 0; +// for (int i1 = 0; i1 <= block_size; ++i1) { +// for (int i2 = i1; i2 <= block_size; ++i2) { +// float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2]; +// float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2]; +// if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { +// scale = sumqx/sumq2; best_score = scale*sumqx; +// besti1 = i1; besti2 = i2; best_shift = 1; +// } +// sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2]; +// sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2]; +// if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { +// scale = sumqx/sumq2; best_score = scale*sumqx; +// besti1 = i1; besti2 = i2; best_shift = -1; +// } +// } +// } +// GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0); +// for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0; +// for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1; +// for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2; +// if (scale < 0) { +// for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j]; +// scale = -scale; best_shift = -best_shift; +// } +// bool all_on_grid = true; +// const float * xx = best_shift == 1 ? x_p : x_m; +// for (int k = 0; k < block_size/8; ++k) { +// uint16_t u = 0; +// for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j); +// int grid_index = kmap_q2xs[u]; +// if (grid_index < 0) { +// all_on_grid = false; +// const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; +// grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S); +// GGML_ASSERT(grid_index >= 0); +// } +// index[k] = grid_index; +// } +// if (!all_on_grid) { +// float sumqx = 0, sumq2 = 0; +// for (int k = 0; k < block_size/8; ++k) { +// const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]); +// for (int j = 0; j < 8; ++j) { +// float w = weight[8*k + j]; +// float q = xx[(pg[j] - 1)/2]; +// sumqx += w*q*xb[8*k+j]; +// sumq2 += w*q*q; +// } +// } +// if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2; +// } uint16_t h = 0; for (int k = 0; k < block_size/8; ++k) { y[ibl].qs[(block_size/8)*ib + k] = index[k] & 255; h |= (index[k] >> 8) << 3*k; } y[ibl].qh[ib] = h; - GGML_ASSERT(scale >= 0); - scales[ib] = scale; + GGML_ASSERT(scales[ib] >= 0); + max_scale = MAX(max_scale, scales[ib]); + //GGML_ASSERT(scale >= 0); + //scales[ib] = scale; shifts[ib] = best_shift; - max_scale = MAX(max_scale, scale); + //max_scale = MAX(max_scale, scale); } if (!max_scale) { @@ -14171,6 +14274,19 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t n return nrow * nblock * sizeof(block_iq1_s); } +void quantize_row_iq1_s_ref (const float * GGML_RESTRICT x, block_iq1_s * GGML_RESTRICT y, int64_t k) { + int nblock = k/QK_K; + float qw[QK_K]; + for (int j = 0; j < QK_K; ++j) qw[j] = 1; + for (int ibl = 0; ibl < nblock; ++ibl) { + quantize_iq1_s(x + ibl*QK_K, &y[ibl], 1, QK_K, qw); + } +} + +void quantize_row_iq1_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { + quantize_row_iq1_s_ref(x, (block_iq1_s *)y, k); +} + static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights, float * scales, float * weight, @@ -15129,6 +15245,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte case GGML_TYPE_IQ3_XXS_R4: break; case GGML_TYPE_IQ3_S_R4: break; case GGML_TYPE_IQ2_S_R4: break; + case GGML_TYPE_IQ1_S_R4: break; case GGML_TYPE_Q4_0_R4: break; case GGML_TYPE_Q5_0_R4: break; case GGML_TYPE_Q6_0_R4: break; diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index b6d69011..4753f342 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -42,6 +42,7 @@ void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGM void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k); void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k); void quantize_row_iq1_bn_ref (const float * GGML_RESTRICT x, block_iq1_bn * GGML_RESTRICT y, int64_t k); +void quantize_row_iq1_s_ref (const float * GGML_RESTRICT x, block_iq1_s * GGML_RESTRICT y, int64_t k); void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); @@ -66,6 +67,7 @@ void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_iq1_bn (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_iq1_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); // Dequantization void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); @@ -148,6 +150,9 @@ void iq2xs_free_impl(enum ggml_type type); void iq3xs_init_impl(int grid_size); void iq3xs_free_impl(int grid_size); +void iq1s_process_1block(int block_size, const float * xb, const float * weight, int8_t * L, + float * the_scale, uint16_t * the_index, int * the_shift, float * pairs, float * sumx, float * sumw); + #if defined(__ARM_FEATURE_SVE) extern int ggml_sve_cnt_b; #endif diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index b3c8a951..64b7d3ce 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1176,13 +1176,26 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .type_size = sizeof(block_iq1_s), .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq1_s, - .from_float = NULL, - .from_float_ref = NULL, + .from_float = quantize_row_iq1_s, + .from_float_ref = (ggml_from_float_t)quantize_row_iq1_s_ref, .vec_dot = ggml_vec_dot_iq1_s_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, .row_meta_size = 0, }, + [GGML_TYPE_IQ1_S_R4] = { + .type_name = "iq1_s_r4", + .blck_size = 32, + .type_size = sizeof(block_iq1_s_r4)/4, + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq1_s_r4, + .from_float = quantize_row_iq1_s_r4, + .from_float_ref = (ggml_from_float_t)quantize_row_iq1_s_r4_ref, + .vec_dot = vec_dot_iq1_s_r4_q8_k, + .vec_dot_type = GGML_TYPE_Q8_1_X4, + .nrows = 1, + .row_meta_size = 2, + }, [GGML_TYPE_IQ1_M] = { .type_name = "iq1_m", .blck_size = QK_K, @@ -4387,6 +4400,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ3_S_R4: wtype = GGML_TYPE_IQ3_S_R4; break; case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break; case GGML_FTYPE_MOSTLY_IQ2_S_R4: wtype = GGML_TYPE_IQ2_S_R4; break; + case GGML_FTYPE_MOSTLY_IQ1_S_R4: wtype = GGML_TYPE_IQ1_S_R4; break; case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break; case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break; case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break; @@ -10934,6 +10948,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ3_S_R4: case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S_R4: + case GGML_TYPE_IQ1_S_R4: case GGML_TYPE_Q4_0_4_4: case GGML_TYPE_Q4_0_4_8: case GGML_TYPE_Q4_0_8_8: @@ -11402,6 +11417,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ3_S_R4: case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S_R4: + case GGML_TYPE_IQ1_S_R4: case GGML_TYPE_Q4_0_4_4: case GGML_TYPE_Q4_0_4_8: case GGML_TYPE_Q4_0_8_8: @@ -11567,6 +11583,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ3_S_R4: case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S_R4: + case GGML_TYPE_IQ1_S_R4: case GGML_TYPE_Q4_0_4_4: case GGML_TYPE_Q4_0_4_8: case GGML_TYPE_Q4_0_8_8: @@ -14805,6 +14822,7 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ3_S_R4: case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S_R4: + case GGML_TYPE_IQ1_S_R4: case GGML_TYPE_Q4_0_4_4: case GGML_TYPE_Q4_0_4_8: case GGML_TYPE_Q4_0_8_8: @@ -15210,6 +15228,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ3_S_R4: case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S_R4: + case GGML_TYPE_IQ1_S_R4: case GGML_TYPE_Q4_0_4_4: case GGML_TYPE_Q4_0_4_8: case GGML_TYPE_Q4_0_8_8: @@ -15509,6 +15528,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ3_S_R4: case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S_R4: + case GGML_TYPE_IQ1_S_R4: case GGML_TYPE_Q4_0_4_4: case GGML_TYPE_Q4_0_4_8: case GGML_TYPE_Q4_0_8_8: @@ -16137,6 +16157,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ3_S_R4: case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S_R4: + case GGML_TYPE_IQ1_S_R4: case GGML_TYPE_Q8_K: case GGML_TYPE_Q8_K64: case GGML_TYPE_Q8_K16: @@ -22893,6 +22914,7 @@ void ggml_quantize_init(enum ggml_type type) { case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break; + case GGML_TYPE_IQ1_S_R4:iq2xs_init_impl(GGML_TYPE_IQ1_S); break; case GGML_TYPE_IQ3_XXS_R4: case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break; case GGML_TYPE_IQ3_S_R4: @@ -22975,6 +22997,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ3_S_R4:result = quantize_iq3_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_S_R4:result = quantize_iq2_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ1_S_R4:result = quantize_iq1_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ1_BN: result = quantize_iq1_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp index f633229d..559cff05 100644 --- a/ggml/src/iqk/iqk_mul_mat.cpp +++ b/ggml/src/iqk/iqk_mul_mat.cpp @@ -259,6 +259,7 @@ struct MulMat { case GGML_TYPE_IQ2_XS_R4: case GGML_TYPE_IQ2_S_R4: case GGML_TYPE_IQ3_XXS_R4: + case GGML_TYPE_IQ1_S_R4: case GGML_TYPE_IQ3_S_R4: return 4; case GGML_TYPE_IQ4_NL_R4: case GGML_TYPE_Q5_0_R4: @@ -293,6 +294,7 @@ struct MulMat { case GGML_TYPE_IQ2_S_R4: case GGML_TYPE_IQ3_XXS_R4: case GGML_TYPE_IQ3_S_R4: + case GGML_TYPE_IQ1_S_R4: case GGML_TYPE_IQ2_BN_R4: return 4; case GGML_TYPE_IQ4_XS_R4: case GGML_TYPE_Q4_0_R4: @@ -375,6 +377,523 @@ inline void make_q4_scales(const uint8_t * scales8, uint32_t * aux32) { aux32[0] = a0 & 0x3f3f3f3f; } +#ifdef __AVX2__ +static const uint64_t iq1s_grid_us[2048] = { + 0x0000000000000000, 0x0000000000000002, 0x0000000000000101, 0x0000000000000200, + 0x0000000000000202, 0x0000000000010001, 0x0000000000010101, 0x0000000000020000, + 0x0000000000020002, 0x0000000000020200, 0x0000000000020202, 0x0000000001000101, + 0x0000000001010001, 0x0000000001010100, 0x0000000001010102, 0x0000000001020101, + 0x0000000002000000, 0x0000000002000002, 0x0000000002000200, 0x0000000002000202, + 0x0000000002010101, 0x0000000002020000, 0x0000000002020002, 0x0000000002020200, + 0x0000000002020202, 0x0000000100000100, 0x0000000100000101, 0x0000000100010001, + 0x0000000100010100, 0x0000000100010102, 0x0000000100010201, 0x0000000100010202, + 0x0000000100020101, 0x0000000101000001, 0x0000000101000102, 0x0000000101000201, + 0x0000000101010002, 0x0000000101010101, 0x0000000101010202, 0x0000000101020001, + 0x0000000101020100, 0x0000000101020102, 0x0000000101020200, 0x0000000102000101, + 0x0000000102010001, 0x0000000102010100, 0x0000000102010102, 0x0000000102020101, + 0x0000000200000000, 0x0000000200000002, 0x0000000200000200, 0x0000000200000202, + 0x0000000200010101, 0x0000000200020000, 0x0000000200020002, 0x0000000200020200, + 0x0000000200020202, 0x0000000201000101, 0x0000000201010001, 0x0000000201010201, + 0x0000000201020100, 0x0000000201020201, 0x0000000202000000, 0x0000000202000002, + 0x0000000202000200, 0x0000000202000202, 0x0000000202010001, 0x0000000202010101, + 0x0000000202010201, 0x0000000202020000, 0x0000000202020002, 0x0000000202020200, + 0x0000000202020202, 0x0000010000010001, 0x0000010000010100, 0x0000010000010102, + 0x0000010000020101, 0x0000010001000001, 0x0000010001000201, 0x0000010001010101, + 0x0000010001010202, 0x0000010001020100, 0x0000010001020101, 0x0000010002010001, + 0x0000010002010201, 0x0000010002020101, 0x0000010100000001, 0x0000010100000100, + 0x0000010100000101, 0x0000010100000102, 0x0000010100010101, 0x0000010100010200, + 0x0000010100010202, 0x0000010100020201, 0x0000010101000000, 0x0000010101000101, + 0x0000010101000202, 0x0000010101010000, 0x0000010101010001, 0x0000010101010100, + 0x0000010101010101, 0x0000010101010102, 0x0000010101010201, 0x0000010101020000, + 0x0000010101020002, 0x0000010101020101, 0x0000010101020200, 0x0000010101020202, + 0x0000010102000001, 0x0000010102010001, 0x0000010102010101, 0x0000010102010200, + 0x0000010102010202, 0x0000010102020001, 0x0000010102020100, 0x0000010102020101, + 0x0000010102020102, 0x0000010102020201, 0x0000010200010100, 0x0000010200010201, + 0x0000010201000001, 0x0000010201000100, 0x0000010201010000, 0x0000010201010002, + 0x0000010201010101, 0x0000010201010200, 0x0000010201020000, 0x0000010201020001, + 0x0000010201020102, 0x0000010201020201, 0x0000010202000101, 0x0000010202010001, + 0x0000010202010100, 0x0000010202010201, 0x0000020000000000, 0x0000020000000002, + 0x0000020000000200, 0x0000020000000202, 0x0000020000010101, 0x0000020000020000, + 0x0000020000020002, 0x0000020000020200, 0x0000020000020202, 0x0000020001000101, + 0x0000020001010001, 0x0000020001010102, 0x0000020001020101, 0x0000020002000000, + 0x0000020002000002, 0x0000020002000200, 0x0000020002000202, 0x0000020002010101, + 0x0000020002020000, 0x0000020002020002, 0x0000020002020200, 0x0000020002020202, + 0x0000020100000101, 0x0000020100010001, 0x0000020100010100, 0x0000020100010201, + 0x0000020100020100, 0x0000020100020101, 0x0000020101000001, 0x0000020101010000, + 0x0000020101010001, 0x0000020101010101, 0x0000020101020001, 0x0000020101020100, + 0x0000020101020201, 0x0000020102010001, 0x0000020102010100, 0x0000020102010102, + 0x0000020102010201, 0x0000020102020101, 0x0000020200000000, 0x0000020200000002, + 0x0000020200000200, 0x0000020200000202, 0x0000020200010101, 0x0000020200020000, + 0x0000020200020002, 0x0000020200020200, 0x0000020200020202, 0x0000020201000101, + 0x0000020201010001, 0x0000020201010201, 0x0000020201020001, 0x0000020201020101, + 0x0000020202000000, 0x0000020202000002, 0x0000020202000101, 0x0000020202000200, + 0x0000020202000202, 0x0000020202010101, 0x0000020202020000, 0x0000020202020002, + 0x0000020202020200, 0x0000020202020202, 0x0001000000010000, 0x0001000000010001, + 0x0001000000010100, 0x0001000000010201, 0x0001000000020100, 0x0001000000020101, + 0x0001000001000001, 0x0001000001000100, 0x0001000001010000, 0x0001000001010101, + 0x0001000001010200, 0x0001000001020001, 0x0001000001020100, 0x0001000001020101, + 0x0001000001020201, 0x0001000002010001, 0x0001000002010100, 0x0001000002010102, + 0x0001000002020001, 0x0001000002020101, 0x0001000100000001, 0x0001000100000100, + 0x0001000100000102, 0x0001000100000201, 0x0001000100010000, 0x0001000100010002, + 0x0001000100010101, 0x0001000100010200, 0x0001000100020001, 0x0001000100020100, + 0x0001000100020201, 0x0001000101000101, 0x0001000101000202, 0x0001000101010000, + 0x0001000101010001, 0x0001000101010002, 0x0001000101010100, 0x0001000101010101, + 0x0001000101010102, 0x0001000101010201, 0x0001000101020000, 0x0001000101020101, + 0x0001000102000100, 0x0001000102010002, 0x0001000102010101, 0x0001000102020001, + 0x0001000102020100, 0x0001000200010001, 0x0001000200010100, 0x0001000200010102, + 0x0001000200020101, 0x0001000201000000, 0x0001000201000102, 0x0001000201000201, + 0x0001000201010002, 0x0001000201010101, 0x0001000201010200, 0x0001000201010202, + 0x0001000201020100, 0x0001000201020102, 0x0001000202000101, 0x0001000202010001, + 0x0001000202010100, 0x0001000202010102, 0x0001000202020101, 0x0001010000000001, + 0x0001010000000102, 0x0001010000000201, 0x0001010000010100, 0x0001010000010101, + 0x0001010000010200, 0x0001010000010201, 0x0001010000020001, 0x0001010000020102, + 0x0001010001000001, 0x0001010001000101, 0x0001010001000102, 0x0001010001000200, + 0x0001010001000202, 0x0001010001010001, 0x0001010001010100, 0x0001010001010101, + 0x0001010001010102, 0x0001010001010201, 0x0001010001020002, 0x0001010001020101, + 0x0001010001020200, 0x0001010002000100, 0x0001010002000201, 0x0001010002010000, + 0x0001010002010100, 0x0001010002010101, 0x0001010002010200, 0x0001010002010201, + 0x0001010002010202, 0x0001010002020001, 0x0001010002020100, 0x0001010002020101, + 0x0001010002020201, 0x0001010100000002, 0x0001010100000101, 0x0001010100000202, + 0x0001010100010001, 0x0001010100010100, 0x0001010100010101, 0x0001010100010102, + 0x0001010100010201, 0x0001010100020000, 0x0001010100020002, 0x0001010100020101, + 0x0001010100020200, 0x0001010100020202, 0x0001010101000001, 0x0001010101000100, + 0x0001010101000101, 0x0001010101000102, 0x0001010101010001, 0x0001010101010002, + 0x0001010101010100, 0x0001010101010101, 0x0001010101010102, 0x0001010101010201, + 0x0001010101010202, 0x0001010101020001, 0x0001010101020100, 0x0001010101020101, + 0x0001010101020102, 0x0001010101020201, 0x0001010102000000, 0x0001010102000002, + 0x0001010102000100, 0x0001010102000101, 0x0001010102000200, 0x0001010102000202, + 0x0001010102010000, 0x0001010102010001, 0x0001010102010100, 0x0001010102010101, + 0x0001010102010102, 0x0001010102010201, 0x0001010102010202, 0x0001010102020000, + 0x0001010102020002, 0x0001010102020101, 0x0001010200000001, 0x0001010200000100, + 0x0001010200000101, 0x0001010200000102, 0x0001010200010101, 0x0001010200010102, + 0x0001010200010200, 0x0001010200010202, 0x0001010200020001, 0x0001010200020102, + 0x0001010201000000, 0x0001010201000002, 0x0001010201000100, 0x0001010201000101, + 0x0001010201000200, 0x0001010201000202, 0x0001010201010001, 0x0001010201010101, + 0x0001010201010102, 0x0001010201010200, 0x0001010201010201, 0x0001010201020001, + 0x0001010201020100, 0x0001010201020101, 0x0001010201020200, 0x0001010201020201, + 0x0001010201020202, 0x0001010202000102, 0x0001010202000202, 0x0001010202010002, + 0x0001010202010101, 0x0001010202020100, 0x0001010202020201, 0x0001020000010001, + 0x0001020000010102, 0x0001020000020101, 0x0001020001000001, 0x0001020001000100, + 0x0001020001000102, 0x0001020001000201, 0x0001020001010000, 0x0001020001010101, + 0x0001020001010200, 0x0001020001010202, 0x0001020001020000, 0x0001020001020001, + 0x0001020001020100, 0x0001020001020102, 0x0001020001020201, 0x0001020002000101, + 0x0001020002010001, 0x0001020002010100, 0x0001020002020101, 0x0001020100010000, + 0x0001020100010002, 0x0001020100010101, 0x0001020100010202, 0x0001020100020001, + 0x0001020100020101, 0x0001020101000002, 0x0001020101000100, 0x0001020101000101, + 0x0001020101000200, 0x0001020101010001, 0x0001020101010100, 0x0001020101010101, + 0x0001020101010102, 0x0001020101010201, 0x0001020101010202, 0x0001020101020000, + 0x0001020101020101, 0x0001020101020202, 0x0001020102000201, 0x0001020102010001, + 0x0001020102010002, 0x0001020102010101, 0x0001020102010200, 0x0001020102020001, + 0x0001020102020102, 0x0001020102020201, 0x0001020200000201, 0x0001020200010102, + 0x0001020200020100, 0x0001020200020102, 0x0001020201000100, 0x0001020201000102, + 0x0001020201000201, 0x0001020201010000, 0x0001020201010002, 0x0001020201010101, + 0x0001020201010200, 0x0001020201020001, 0x0001020201020102, 0x0001020201020201, + 0x0001020202000101, 0x0001020202010001, 0x0001020202010102, 0x0001020202010202, + 0x0002000000000000, 0x0002000000000002, 0x0002000000000200, 0x0002000000000202, + 0x0002000000010101, 0x0002000000020000, 0x0002000000020002, 0x0002000000020101, + 0x0002000000020200, 0x0002000000020202, 0x0002000001000101, 0x0002000001010001, + 0x0002000001010201, 0x0002000001020001, 0x0002000001020101, 0x0002000002000000, + 0x0002000002000002, 0x0002000002000200, 0x0002000002000202, 0x0002000002010101, + 0x0002000002020000, 0x0002000002020002, 0x0002000002020101, 0x0002000002020200, + 0x0002000002020202, 0x0002000100000101, 0x0002000100010001, 0x0002000100010100, + 0x0002000100010201, 0x0002000100020101, 0x0002000101000002, 0x0002000101000100, + 0x0002000101000201, 0x0002000101010101, 0x0002000101010200, 0x0002000101010202, + 0x0002000101020001, 0x0002000101020100, 0x0002000101020101, 0x0002000101020102, + 0x0002000102000101, 0x0002000102010000, 0x0002000102010102, 0x0002000102010201, + 0x0002000102020101, 0x0002000200000001, 0x0002000200000200, 0x0002000200000202, + 0x0002000200010001, 0x0002000200010101, 0x0002000200020000, 0x0002000200020002, + 0x0002000200020200, 0x0002000200020202, 0x0002000201000101, 0x0002000201010001, + 0x0002000201010102, 0x0002000201010201, 0x0002000201020101, 0x0002000202000001, + 0x0002000202000200, 0x0002000202000202, 0x0002000202010001, 0x0002000202010101, + 0x0002000202020000, 0x0002000202020002, 0x0002000202020200, 0x0002000202020202, + 0x0002010000000101, 0x0002010000010100, 0x0002010000010102, 0x0002010000010201, + 0x0002010000020101, 0x0002010001000100, 0x0002010001000101, 0x0002010001000102, + 0x0002010001000201, 0x0002010001010002, 0x0002010001010101, 0x0002010001010200, + 0x0002010001010202, 0x0002010001020102, 0x0002010002000101, 0x0002010002010001, + 0x0002010002010100, 0x0002010002010201, 0x0002010002020001, 0x0002010002020101, + 0x0002010100000201, 0x0002010100010101, 0x0002010100020001, 0x0002010100020201, + 0x0002010101000000, 0x0002010101000101, 0x0002010101000200, 0x0002010101010001, + 0x0002010101010100, 0x0002010101010101, 0x0002010101010201, 0x0002010101020002, + 0x0002010101020101, 0x0002010101020200, 0x0002010102000201, 0x0002010102010000, + 0x0002010102010100, 0x0002010102010101, 0x0002010102010200, 0x0002010102010202, + 0x0002010102020001, 0x0002010102020100, 0x0002010102020102, 0x0002010102020201, + 0x0002010200000101, 0x0002010200010000, 0x0002010200010002, 0x0002010200010201, + 0x0002010200020101, 0x0002010201000001, 0x0002010201000201, 0x0002010201010101, + 0x0002010201020000, 0x0002010201020001, 0x0002010201020201, 0x0002010202000100, + 0x0002010202000102, 0x0002010202010000, 0x0002010202010202, 0x0002020000000000, + 0x0002020000000002, 0x0002020000000200, 0x0002020000000202, 0x0002020000010101, + 0x0002020000020000, 0x0002020000020002, 0x0002020000020200, 0x0002020000020202, + 0x0002020001000101, 0x0002020001010001, 0x0002020001010100, 0x0002020001020101, + 0x0002020002000000, 0x0002020002000002, 0x0002020002000200, 0x0002020002000202, + 0x0002020002020000, 0x0002020002020002, 0x0002020002020200, 0x0002020002020202, + 0x0002020100000201, 0x0002020100010001, 0x0002020100010100, 0x0002020100010201, + 0x0002020100020101, 0x0002020101000102, 0x0002020101000201, 0x0002020101010002, + 0x0002020101010101, 0x0002020101020001, 0x0002020101020100, 0x0002020101020102, + 0x0002020101020201, 0x0002020102000101, 0x0002020102010000, 0x0002020102010102, + 0x0002020102010201, 0x0002020102020100, 0x0002020102020101, 0x0002020200000000, + 0x0002020200000002, 0x0002020200000200, 0x0002020200000202, 0x0002020200020000, + 0x0002020200020002, 0x0002020200020200, 0x0002020200020202, 0x0002020201000101, + 0x0002020201010001, 0x0002020201010102, 0x0002020201010201, 0x0002020201020101, + 0x0002020202000000, 0x0002020202000002, 0x0002020202000200, 0x0002020202000202, + 0x0002020202010101, 0x0002020202020000, 0x0002020202020002, 0x0002020202020200, + 0x0002020202020202, 0x0100000000000101, 0x0100000000010001, 0x0100000000010102, + 0x0100000000020101, 0x0100000001000201, 0x0100000001010002, 0x0100000001010101, + 0x0100000001010200, 0x0100000001010202, 0x0100000001020001, 0x0100000001020100, + 0x0100000001020102, 0x0100000002010100, 0x0100000002010201, 0x0100000002020001, + 0x0100000002020102, 0x0100000100000000, 0x0100000100000001, 0x0100000100000100, + 0x0100000100000102, 0x0100000100000201, 0x0100000100010002, 0x0100000100010101, + 0x0100000100010102, 0x0100000100010200, 0x0100000100010202, 0x0100000100020001, + 0x0100000100020102, 0x0100000100020201, 0x0100000101000101, 0x0100000101000200, + 0x0100000101000202, 0x0100000101010001, 0x0100000101010100, 0x0100000101010101, + 0x0100000101010102, 0x0100000101010201, 0x0100000101010202, 0x0100000101020101, + 0x0100000101020200, 0x0100000101020202, 0x0100000102000001, 0x0100000102000100, + 0x0100000102000102, 0x0100000102010000, 0x0100000102010002, 0x0100000102010101, + 0x0100000102020000, 0x0100000102020001, 0x0100000102020002, 0x0100000200000101, + 0x0100000200010001, 0x0100000200010100, 0x0100000200010102, 0x0100000200020101, + 0x0100000201000001, 0x0100000201010002, 0x0100000201010101, 0x0100000201010202, + 0x0100000201020100, 0x0100000201020201, 0x0100000202000201, 0x0100000202010100, + 0x0100000202020101, 0x0100010000000001, 0x0100010000010101, 0x0100010000010201, + 0x0100010000020201, 0x0100010001000101, 0x0100010001000200, 0x0100010001000202, + 0x0100010001010001, 0x0100010001010100, 0x0100010001010101, 0x0100010001010102, + 0x0100010001020001, 0x0100010001020002, 0x0100010001020101, 0x0100010001020200, + 0x0100010001020202, 0x0100010002000001, 0x0100010002000102, 0x0100010002000201, + 0x0100010002010000, 0x0100010002010002, 0x0100010002010101, 0x0100010002020000, + 0x0100010002020001, 0x0100010002020201, 0x0100010100000001, 0x0100010100000002, + 0x0100010100000101, 0x0100010100000202, 0x0100010100010001, 0x0100010100010100, + 0x0100010100010101, 0x0100010100010102, 0x0100010100010201, 0x0100010100020000, + 0x0100010100020101, 0x0100010100020202, 0x0100010101000001, 0x0100010101000100, + 0x0100010101000101, 0x0100010101000102, 0x0100010101000201, 0x0100010101010000, + 0x0100010101010001, 0x0100010101010100, 0x0100010101010101, 0x0100010101010102, + 0x0100010101010200, 0x0100010101010201, 0x0100010101020001, 0x0100010101020100, + 0x0100010101020101, 0x0100010101020102, 0x0100010101020201, 0x0100010102000002, + 0x0100010102000100, 0x0100010102000101, 0x0100010102000200, 0x0100010102010001, + 0x0100010102010100, 0x0100010102010101, 0x0100010102010102, 0x0100010102010201, + 0x0100010102010202, 0x0100010102020101, 0x0100010102020200, 0x0100010102020202, + 0x0100010200000001, 0x0100010200000101, 0x0100010200000201, 0x0100010200010100, + 0x0100010200010101, 0x0100010200010200, 0x0100010200010202, 0x0100010200020001, + 0x0100010200020100, 0x0100010200020201, 0x0100010201000000, 0x0100010201000002, + 0x0100010201000101, 0x0100010201000200, 0x0100010201010000, 0x0100010201010001, + 0x0100010201010002, 0x0100010201010101, 0x0100010201010102, 0x0100010201010201, + 0x0100010201020002, 0x0100010201020101, 0x0100010201020200, 0x0100010202000001, + 0x0100010202000101, 0x0100010202000202, 0x0100010202010100, 0x0100010202010101, + 0x0100010202020001, 0x0100010202020100, 0x0100010202020102, 0x0100020000000101, + 0x0100020000010001, 0x0100020000010101, 0x0100020000010202, 0x0100020000020101, + 0x0100020001000002, 0x0100020001000201, 0x0100020001010000, 0x0100020001010101, + 0x0100020001010200, 0x0100020001020001, 0x0100020001020100, 0x0100020001020102, + 0x0100020001020201, 0x0100020002000101, 0x0100020002010001, 0x0100020002010100, + 0x0100020002010102, 0x0100020002010201, 0x0100020002020101, 0x0100020100000001, + 0x0100020100000101, 0x0100020100000102, 0x0100020100000202, 0x0100020100010000, + 0x0100020100010100, 0x0100020100010101, 0x0100020100010200, 0x0100020100020001, + 0x0100020100020100, 0x0100020100020102, 0x0100020101000000, 0x0100020101000101, + 0x0100020101000202, 0x0100020101010001, 0x0100020101010002, 0x0100020101010100, + 0x0100020101010101, 0x0100020101010102, 0x0100020101010201, 0x0100020101020000, + 0x0100020101020002, 0x0100020101020101, 0x0100020101020102, 0x0100020101020202, + 0x0100020102000102, 0x0100020102000201, 0x0100020102010002, 0x0100020102010101, + 0x0100020102010102, 0x0100020102010200, 0x0100020102020001, 0x0100020102020100, + 0x0100020102020102, 0x0100020102020201, 0x0100020200010102, 0x0100020201000100, + 0x0100020201000102, 0x0100020201000201, 0x0100020201010101, 0x0100020201010200, + 0x0100020201010202, 0x0100020201020100, 0x0100020201020201, 0x0100020202010100, + 0x0100020202020101, 0x0101000000000001, 0x0101000000000100, 0x0101000000000101, + 0x0101000000000102, 0x0101000000000201, 0x0101000000010002, 0x0101000000010101, + 0x0101000000010202, 0x0101000000020001, 0x0101000000020100, 0x0101000000020201, + 0x0101000001000000, 0x0101000001000101, 0x0101000001000200, 0x0101000001010001, + 0x0101000001010100, 0x0101000001010101, 0x0101000001010102, 0x0101000001010201, + 0x0101000001020101, 0x0101000001020200, 0x0101000002000102, 0x0101000002000201, + 0x0101000002010101, 0x0101000002010200, 0x0101000002020000, 0x0101000002020001, + 0x0101000002020102, 0x0101000002020201, 0x0101000100000101, 0x0101000100000200, + 0x0101000100000201, 0x0101000100000202, 0x0101000100010001, 0x0101000100010100, + 0x0101000100010101, 0x0101000100010102, 0x0101000100010200, 0x0101000100010201, + 0x0101000100020000, 0x0101000100020101, 0x0101000100020102, 0x0101000100020200, + 0x0101000100020202, 0x0101000101000001, 0x0101000101000100, 0x0101000101000101, + 0x0101000101000102, 0x0101000101000201, 0x0101000101010000, 0x0101000101010001, + 0x0101000101010002, 0x0101000101010100, 0x0101000101010101, 0x0101000101010102, + 0x0101000101010200, 0x0101000101010201, 0x0101000101010202, 0x0101000101020001, + 0x0101000101020100, 0x0101000101020101, 0x0101000101020102, 0x0101000101020201, + 0x0101000102000002, 0x0101000102000101, 0x0101000102010001, 0x0101000102010100, + 0x0101000102010101, 0x0101000102010102, 0x0101000102010201, 0x0101000102020000, + 0x0101000102020101, 0x0101000102020202, 0x0101000200000001, 0x0101000200000102, + 0x0101000200010002, 0x0101000200010101, 0x0101000200010202, 0x0101000200020001, + 0x0101000200020100, 0x0101000201000002, 0x0101000201000101, 0x0101000201000202, + 0x0101000201010001, 0x0101000201010100, 0x0101000201010101, 0x0101000201010102, + 0x0101000201010201, 0x0101000201020002, 0x0101000201020101, 0x0101000202000101, + 0x0101000202010000, 0x0101000202010002, 0x0101000202010101, 0x0101000202010201, + 0x0101000202010202, 0x0101000202020100, 0x0101010000000100, 0x0101010000000101, + 0x0101010000010001, 0x0101010000010100, 0x0101010000010101, 0x0101010000010102, + 0x0101010000010200, 0x0101010000010201, 0x0101010000020001, 0x0101010000020101, + 0x0101010000020200, 0x0101010000020202, 0x0101010001000001, 0x0101010001000100, + 0x0101010001000101, 0x0101010001000102, 0x0101010001000201, 0x0101010001000202, + 0x0101010001010000, 0x0101010001010001, 0x0101010001010100, 0x0101010001010101, + 0x0101010001010102, 0x0101010001010200, 0x0101010001010201, 0x0101010001010202, + 0x0101010001020001, 0x0101010001020002, 0x0101010001020100, 0x0101010001020101, + 0x0101010001020102, 0x0101010001020201, 0x0101010002000000, 0x0101010002000200, + 0x0101010002000202, 0x0101010002010001, 0x0101010002010100, 0x0101010002010101, + 0x0101010002010102, 0x0101010002010201, 0x0101010002020001, 0x0101010002020100, + 0x0101010002020101, 0x0101010002020202, 0x0101010100000001, 0x0101010100000002, + 0x0101010100000100, 0x0101010100000101, 0x0101010100000102, 0x0101010100000201, + 0x0101010100010000, 0x0101010100010001, 0x0101010100010002, 0x0101010100010100, + 0x0101010100010101, 0x0101010100010102, 0x0101010100010201, 0x0101010100010202, + 0x0101010100020001, 0x0101010100020100, 0x0101010100020101, 0x0101010100020102, + 0x0101010100020201, 0x0101010101000000, 0x0101010101000001, 0x0101010101000002, + 0x0101010101000100, 0x0101010101000101, 0x0101010101000102, 0x0101010101000200, + 0x0101010101000201, 0x0101010101010000, 0x0101010101010001, 0x0101010101010002, + 0x0101010101010100, 0x0101010101010101, 0x0101010101010102, 0x0101010101010200, + 0x0101010101010201, 0x0101010101010202, 0x0101010101020000, 0x0101010101020001, + 0x0101010101020100, 0x0101010101020101, 0x0101010101020102, 0x0101010101020200, + 0x0101010101020201, 0x0101010101020202, 0x0101010102000001, 0x0101010102000100, + 0x0101010102000101, 0x0101010102000201, 0x0101010102000202, 0x0101010102010000, + 0x0101010102010001, 0x0101010102010100, 0x0101010102010101, 0x0101010102010102, + 0x0101010102010200, 0x0101010102010201, 0x0101010102020001, 0x0101010102020100, + 0x0101010102020101, 0x0101010102020102, 0x0101010102020201, 0x0101010200000000, + 0x0101010200000001, 0x0101010200000002, 0x0101010200000100, 0x0101010200000102, + 0x0101010200000200, 0x0101010200000201, 0x0101010200010001, 0x0101010200010100, + 0x0101010200010101, 0x0101010200010200, 0x0101010200010201, 0x0101010200020000, + 0x0101010200020001, 0x0101010200020002, 0x0101010200020100, 0x0101010200020101, + 0x0101010200020102, 0x0101010200020200, 0x0101010200020201, 0x0101010201000001, + 0x0101010201000101, 0x0101010201000102, 0x0101010201000200, 0x0101010201000201, + 0x0101010201000202, 0x0101010201010000, 0x0101010201010001, 0x0101010201010002, + 0x0101010201010100, 0x0101010201010101, 0x0101010201010102, 0x0101010201010200, + 0x0101010201010201, 0x0101010201010202, 0x0101010201020001, 0x0101010201020100, + 0x0101010201020101, 0x0101010201020201, 0x0101010202000002, 0x0101010202000101, + 0x0101010202000102, 0x0101010202000200, 0x0101010202000201, 0x0101010202000202, + 0x0101010202010001, 0x0101010202010101, 0x0101010202010202, 0x0101010202020002, + 0x0101010202020101, 0x0101010202020102, 0x0101010202020200, 0x0101010202020201, + 0x0101020000000100, 0x0101020000000101, 0x0101020000000102, 0x0101020000000201, + 0x0101020000010000, 0x0101020000010101, 0x0101020000010200, 0x0101020000020001, + 0x0101020000020202, 0x0101020001000101, 0x0101020001000200, 0x0101020001000202, + 0x0101020001010001, 0x0101020001010100, 0x0101020001010101, 0x0101020001010102, + 0x0101020001010200, 0x0101020001010201, 0x0101020001020000, 0x0101020001020002, + 0x0101020001020100, 0x0101020001020101, 0x0101020002000002, 0x0101020002000201, + 0x0101020002010000, 0x0101020002010002, 0x0101020002010101, 0x0101020002010200, + 0x0101020002020001, 0x0101020002020201, 0x0101020100000001, 0x0101020100000002, + 0x0101020100000101, 0x0101020100000202, 0x0101020100010001, 0x0101020100010100, + 0x0101020100010101, 0x0101020100010102, 0x0101020100010201, 0x0101020100020101, + 0x0101020101000001, 0x0101020101000100, 0x0101020101000101, 0x0101020101000102, + 0x0101020101000201, 0x0101020101010000, 0x0101020101010001, 0x0101020101010002, + 0x0101020101010100, 0x0101020101010101, 0x0101020101010102, 0x0101020101010200, + 0x0101020101010201, 0x0101020101010202, 0x0101020101020001, 0x0101020101020100, + 0x0101020101020101, 0x0101020101020102, 0x0101020101020201, 0x0101020102000001, + 0x0101020102000101, 0x0101020102000201, 0x0101020102010001, 0x0101020102010100, + 0x0101020102010101, 0x0101020102010102, 0x0101020102010200, 0x0101020102010201, + 0x0101020102020101, 0x0101020200000100, 0x0101020200000200, 0x0101020200010101, + 0x0101020200010202, 0x0101020200020000, 0x0101020200020101, 0x0101020200020102, + 0x0101020200020201, 0x0101020201000101, 0x0101020201000200, 0x0101020201000201, + 0x0101020201010001, 0x0101020201010101, 0x0101020201010102, 0x0101020201010200, + 0x0101020201010201, 0x0101020201020002, 0x0101020201020101, 0x0101020201020200, + 0x0101020201020202, 0x0101020202000001, 0x0101020202000202, 0x0101020202010002, + 0x0101020202010101, 0x0101020202010102, 0x0101020202010200, 0x0101020202010202, + 0x0101020202020001, 0x0102000000000101, 0x0102000000010100, 0x0102000000010102, + 0x0102000000010201, 0x0102000000020101, 0x0102000001000100, 0x0102000001010000, + 0x0102000001010101, 0x0102000001010102, 0x0102000001010200, 0x0102000001010202, + 0x0102000001020001, 0x0102000001020100, 0x0102000001020102, 0x0102000001020201, + 0x0102000002000001, 0x0102000002010102, 0x0102000002020101, 0x0102000100000001, + 0x0102000100000100, 0x0102000100000102, 0x0102000100000201, 0x0102000100010002, + 0x0102000100010101, 0x0102000100020001, 0x0102000100020002, 0x0102000100020102, + 0x0102000100020201, 0x0102000101000101, 0x0102000101000201, 0x0102000101010001, + 0x0102000101010101, 0x0102000101010102, 0x0102000101010201, 0x0102000101020101, + 0x0102000101020102, 0x0102000101020202, 0x0102000102000100, 0x0102000102000202, + 0x0102000102010002, 0x0102000102010101, 0x0102000102020001, 0x0102000102020102, + 0x0102000102020201, 0x0102000200010001, 0x0102000200010102, 0x0102000200010201, + 0x0102000201000000, 0x0102000201000001, 0x0102000201000102, 0x0102000201010101, + 0x0102000201010102, 0x0102000201010200, 0x0102000201020000, 0x0102000202000101, + 0x0102000202010001, 0x0102000202010102, 0x0102000202020101, 0x0102010000010001, + 0x0102010000010002, 0x0102010000010101, 0x0102010000010102, 0x0102010000010202, + 0x0102010000020001, 0x0102010000020102, 0x0102010000020201, 0x0102010001000000, + 0x0102010001000002, 0x0102010001000101, 0x0102010001000200, 0x0102010001000202, + 0x0102010001010001, 0x0102010001010100, 0x0102010001010101, 0x0102010001010102, + 0x0102010001010201, 0x0102010001010202, 0x0102010001020000, 0x0102010001020002, + 0x0102010001020101, 0x0102010002000100, 0x0102010002000101, 0x0102010002000201, + 0x0102010002010000, 0x0102010002010002, 0x0102010002010100, 0x0102010002010101, + 0x0102010002010102, 0x0102010002010200, 0x0102010002010202, 0x0102010002020001, + 0x0102010002020100, 0x0102010002020201, 0x0102010100000101, 0x0102010100000200, + 0x0102010100000202, 0x0102010100010001, 0x0102010100010101, 0x0102010100010102, + 0x0102010100010201, 0x0102010101000100, 0x0102010101000101, 0x0102010101000102, + 0x0102010101000201, 0x0102010101010000, 0x0102010101010001, 0x0102010101010100, + 0x0102010101010101, 0x0102010101010102, 0x0102010101010201, 0x0102010101020001, + 0x0102010101020100, 0x0102010101020101, 0x0102010101020102, 0x0102010101020201, + 0x0102010102000102, 0x0102010102000201, 0x0102010102000202, 0x0102010102010001, + 0x0102010102010101, 0x0102010102010102, 0x0102010102010201, 0x0102010102010202, + 0x0102010102020002, 0x0102010102020101, 0x0102010102020102, 0x0102010102020200, + 0x0102010200000002, 0x0102010200000201, 0x0102010200010101, 0x0102010200020000, + 0x0102010200020102, 0x0102010200020200, 0x0102010200020201, 0x0102010201000000, + 0x0102010201000101, 0x0102010201000200, 0x0102010201000202, 0x0102010201010001, + 0x0102010201010100, 0x0102010201010101, 0x0102010201010102, 0x0102010201010200, + 0x0102010201010202, 0x0102010201020000, 0x0102010201020101, 0x0102010201020200, + 0x0102010202000000, 0x0102010202000002, 0x0102010202000101, 0x0102010202000202, + 0x0102010202010100, 0x0102010202010102, 0x0102010202010200, 0x0102010202010201, + 0x0102010202020000, 0x0102010202020100, 0x0102010202020102, 0x0102010202020202, + 0x0102020000010102, 0x0102020000010201, 0x0102020000020101, 0x0102020001000001, + 0x0102020001010002, 0x0102020001010101, 0x0102020001010202, 0x0102020001020001, + 0x0102020001020201, 0x0102020002000101, 0x0102020002010001, 0x0102020002010200, + 0x0102020002020102, 0x0102020100000001, 0x0102020100000100, 0x0102020100010000, + 0x0102020100010101, 0x0102020100020001, 0x0102020100020100, 0x0102020100020102, + 0x0102020100020201, 0x0102020101000000, 0x0102020101000001, 0x0102020101000101, + 0x0102020101000102, 0x0102020101000200, 0x0102020101010001, 0x0102020101010100, + 0x0102020101010101, 0x0102020101010102, 0x0102020101010201, 0x0102020101020000, + 0x0102020101020101, 0x0102020101020202, 0x0102020102000002, 0x0102020102000100, + 0x0102020102000202, 0x0102020102010101, 0x0102020102020001, 0x0102020102020100, + 0x0102020102020101, 0x0102020102020201, 0x0102020200010001, 0x0102020200010102, + 0x0102020200010200, 0x0102020201000001, 0x0102020201000100, 0x0102020201000201, + 0x0102020201010000, 0x0102020201010101, 0x0102020201010200, 0x0102020201010202, + 0x0102020201020100, 0x0102020201020101, 0x0102020201020201, 0x0102020202000102, + 0x0102020202010100, 0x0102020202010200, 0x0102020202010202, 0x0102020202020102, + 0x0200000000000000, 0x0200000000000002, 0x0200000000000200, 0x0200000000000202, + 0x0200000000020000, 0x0200000000020002, 0x0200000000020200, 0x0200000000020202, + 0x0200000001000101, 0x0200000001010000, 0x0200000001010001, 0x0200000001010100, + 0x0200000001010102, 0x0200000001010201, 0x0200000001020101, 0x0200000002000000, + 0x0200000002000002, 0x0200000002000200, 0x0200000002000202, 0x0200000002010101, + 0x0200000002020000, 0x0200000002020002, 0x0200000002020200, 0x0200000002020202, + 0x0200000100000101, 0x0200000100010001, 0x0200000100010100, 0x0200000100010102, + 0x0200000100010201, 0x0200000100020101, 0x0200000101000001, 0x0200000101000100, + 0x0200000101000201, 0x0200000101010000, 0x0200000101010002, 0x0200000101010101, + 0x0200000101010102, 0x0200000101010200, 0x0200000101010201, 0x0200000101020100, + 0x0200000101020102, 0x0200000101020201, 0x0200000102000101, 0x0200000102000201, + 0x0200000102010100, 0x0200000102010102, 0x0200000102010201, 0x0200000102020101, + 0x0200000200000000, 0x0200000200000002, 0x0200000200000200, 0x0200000200000202, + 0x0200000200010101, 0x0200000200020000, 0x0200000200020002, 0x0200000200020200, + 0x0200000200020202, 0x0200000201010001, 0x0200000201010100, 0x0200000201010201, + 0x0200000201020101, 0x0200000202000000, 0x0200000202000002, 0x0200000202000200, + 0x0200000202000202, 0x0200000202010101, 0x0200000202020000, 0x0200000202020002, + 0x0200000202020200, 0x0200000202020202, 0x0200010000010100, 0x0200010000010201, + 0x0200010001000001, 0x0200010001000100, 0x0200010001010001, 0x0200010001010101, + 0x0200010001010202, 0x0200010001020001, 0x0200010001020100, 0x0200010001020201, + 0x0200010002010100, 0x0200010002010201, 0x0200010100000001, 0x0200010100000201, + 0x0200010100010002, 0x0200010100010101, 0x0200010100010202, 0x0200010100020102, + 0x0200010100020201, 0x0200010101000000, 0x0200010101000001, 0x0200010101000101, + 0x0200010101000200, 0x0200010101010001, 0x0200010101010100, 0x0200010101010101, + 0x0200010101010102, 0x0200010101010201, 0x0200010101010202, 0x0200010101020101, + 0x0200010101020102, 0x0200010101020200, 0x0200010101020202, 0x0200010102000001, + 0x0200010102000100, 0x0200010102000102, 0x0200010102000201, 0x0200010102010000, + 0x0200010102010002, 0x0200010102010101, 0x0200010102010200, 0x0200010102020102, + 0x0200010200010001, 0x0200010200010102, 0x0200010200010201, 0x0200010200020101, + 0x0200010201000001, 0x0200010201000100, 0x0200010201000201, 0x0200010201000202, + 0x0200010201010000, 0x0200010201010101, 0x0200010201010201, 0x0200010201010202, + 0x0200010201020001, 0x0200010201020102, 0x0200010201020202, 0x0200010202000101, + 0x0200010202010001, 0x0200010202010202, 0x0200010202020100, 0x0200020000000000, + 0x0200020000000002, 0x0200020000000200, 0x0200020000000202, 0x0200020000010101, + 0x0200020000020000, 0x0200020000020002, 0x0200020000020200, 0x0200020000020202, + 0x0200020001000001, 0x0200020001000101, 0x0200020001010001, 0x0200020001010100, + 0x0200020001010201, 0x0200020001020101, 0x0200020001020201, 0x0200020002000000, + 0x0200020002000002, 0x0200020002000200, 0x0200020002000202, 0x0200020002010101, + 0x0200020002020000, 0x0200020002020002, 0x0200020002020200, 0x0200020002020202, + 0x0200020100000101, 0x0200020100000102, 0x0200020100010001, 0x0200020100010100, + 0x0200020100010102, 0x0200020100020101, 0x0200020101000001, 0x0200020101000100, + 0x0200020101000102, 0x0200020101000201, 0x0200020101010000, 0x0200020101010002, + 0x0200020101010101, 0x0200020101010202, 0x0200020101020001, 0x0200020101020100, + 0x0200020102000101, 0x0200020102010102, 0x0200020102010201, 0x0200020102020101, + 0x0200020200000000, 0x0200020200000002, 0x0200020200000200, 0x0200020200000202, + 0x0200020200010101, 0x0200020200020000, 0x0200020200020002, 0x0200020200020200, + 0x0200020200020202, 0x0200020201000101, 0x0200020201010001, 0x0200020201010100, + 0x0200020201010102, 0x0200020202000000, 0x0200020202000002, 0x0200020202000200, + 0x0200020202000202, 0x0200020202010101, 0x0200020202020000, 0x0200020202020002, + 0x0200020202020200, 0x0200020202020202, 0x0201000000000101, 0x0201000000010001, + 0x0201000000010102, 0x0201000000010200, 0x0201000000010201, 0x0201000000020101, + 0x0201000001000001, 0x0201000001000102, 0x0201000001000201, 0x0201000001010101, + 0x0201000001010200, 0x0201000001010202, 0x0201000001020201, 0x0201000001020202, + 0x0201000002000101, 0x0201000002010001, 0x0201000002010100, 0x0201000002010102, + 0x0201000002010201, 0x0201000002020101, 0x0201000100000001, 0x0201000100000100, + 0x0201000100000102, 0x0201000100000201, 0x0201000100010000, 0x0201000100010101, + 0x0201000100010200, 0x0201000100010202, 0x0201000100020001, 0x0201000100020100, + 0x0201000100020102, 0x0201000100020201, 0x0201000101000000, 0x0201000101000101, + 0x0201000101010000, 0x0201000101010001, 0x0201000101010100, 0x0201000101010101, + 0x0201000101010102, 0x0201000101010201, 0x0201000101020002, 0x0201000101020101, + 0x0201000102000100, 0x0201000102000102, 0x0201000102010002, 0x0201000102010101, + 0x0201000102010200, 0x0201000102020001, 0x0201000102020100, 0x0201000102020102, + 0x0201000102020201, 0x0201000200000101, 0x0201000200010001, 0x0201000200010100, + 0x0201000200010201, 0x0201000200020101, 0x0201000201000100, 0x0201000201000102, + 0x0201000201000201, 0x0201000201010000, 0x0201000201010002, 0x0201000201010101, + 0x0201000201010200, 0x0201000201020102, 0x0201000201020201, 0x0201000202000101, + 0x0201000202010100, 0x0201000202010102, 0x0201000202020201, 0x0201010000000001, + 0x0201010000000100, 0x0201010000000102, 0x0201010000010000, 0x0201010000010101, + 0x0201010000010200, 0x0201010000020102, 0x0201010001000000, 0x0201010001000202, + 0x0201010001010001, 0x0201010001010100, 0x0201010001010101, 0x0201010001010102, + 0x0201010001010200, 0x0201010001010201, 0x0201010001020000, 0x0201010001020001, + 0x0201010001020002, 0x0201010001020101, 0x0201010002000100, 0x0201010002000102, + 0x0201010002010002, 0x0201010002010100, 0x0201010002010101, 0x0201010002010200, + 0x0201010002020001, 0x0201010002020201, 0x0201010100000000, 0x0201010100000101, + 0x0201010100000200, 0x0201010100000202, 0x0201010100010000, 0x0201010100010001, + 0x0201010100010100, 0x0201010100010101, 0x0201010100010102, 0x0201010100010201, + 0x0201010100020001, 0x0201010100020101, 0x0201010100020201, 0x0201010100020202, + 0x0201010101000001, 0x0201010101000100, 0x0201010101000101, 0x0201010101000102, + 0x0201010101000201, 0x0201010101010000, 0x0201010101010001, 0x0201010101010002, + 0x0201010101010100, 0x0201010101010101, 0x0201010101010102, 0x0201010101010200, + 0x0201010101010201, 0x0201010101010202, 0x0201010101020001, 0x0201010101020100, + 0x0201010101020101, 0x0201010101020102, 0x0201010101020201, 0x0201010102000001, + 0x0201010102000101, 0x0201010102000200, 0x0201010102010001, 0x0201010102010002, + 0x0201010102010100, 0x0201010102010101, 0x0201010102010102, 0x0201010102010201, + 0x0201010102010202, 0x0201010102020000, 0x0201010102020002, 0x0201010102020101, + 0x0201010102020200, 0x0201010102020202, 0x0201010200000001, 0x0201010200000100, + 0x0201010200010000, 0x0201010200010101, 0x0201010200010201, 0x0201010200020000, + 0x0201010200020102, 0x0201010200020201, 0x0201010201000101, 0x0201010201000200, + 0x0201010201000201, 0x0201010201010001, 0x0201010201010002, 0x0201010201010101, + 0x0201010201010102, 0x0201010201010201, 0x0201010201020101, 0x0201010201020200, + 0x0201010202000002, 0x0201010202000100, 0x0201010202000201, 0x0201010202000202, + 0x0201010202010002, 0x0201010202010100, 0x0201010202010101, 0x0201010202020100, + 0x0201010202020102, 0x0201010202020201, 0x0201020000000101, 0x0201020000010102, + 0x0201020000010201, 0x0201020000020101, 0x0201020001000001, 0x0201020001000102, + 0x0201020001010000, 0x0201020001010002, 0x0201020001010101, 0x0201020001010102, + 0x0201020001010202, 0x0201020001020100, 0x0201020001020101, 0x0201020002000101, + 0x0201020002010001, 0x0201020002010102, 0x0201020002010201, 0x0201020002020101, + 0x0201020100000100, 0x0201020100000102, 0x0201020100000201, 0x0201020100010000, + 0x0201020100010002, 0x0201020100010101, 0x0201020100010200, 0x0201020100010202, + 0x0201020100020000, 0x0201020100020001, 0x0201020100020100, 0x0201020100020102, + 0x0201020101000000, 0x0201020101000002, 0x0201020101000101, 0x0201020101000200, + 0x0201020101000202, 0x0201020101010001, 0x0201020101010100, 0x0201020101010101, + 0x0201020101010102, 0x0201020101010201, 0x0201020101020002, 0x0201020101020101, + 0x0201020101020102, 0x0201020101020202, 0x0201020102000001, 0x0201020102000100, + 0x0201020102010000, 0x0201020102010002, 0x0201020102010101, 0x0201020102010202, + 0x0201020102020001, 0x0201020102020102, 0x0201020200000101, 0x0201020200010101, + 0x0201020200020101, 0x0201020201000100, 0x0201020201000102, 0x0201020201000201, + 0x0201020201010000, 0x0201020201010101, 0x0201020201010200, 0x0201020201020001, + 0x0201020202000101, 0x0201020202010001, 0x0201020202010100, 0x0201020202010101, + 0x0201020202010102, 0x0202000000000000, 0x0202000000000002, 0x0202000000000200, + 0x0202000000000202, 0x0202000000010101, 0x0202000000020000, 0x0202000000020002, + 0x0202000000020200, 0x0202000000020202, 0x0202000001000101, 0x0202000001010001, + 0x0202000001010100, 0x0202000001010102, 0x0202000001010201, 0x0202000002000000, + 0x0202000002000002, 0x0202000002000200, 0x0202000002000202, 0x0202000002010101, + 0x0202000002020000, 0x0202000002020002, 0x0202000002020200, 0x0202000002020202, + 0x0202000100000101, 0x0202000100000201, 0x0202000100010001, 0x0202000100010100, + 0x0202000100010102, 0x0202000100010201, 0x0202000100010202, 0x0202000101000102, + 0x0202000101000201, 0x0202000101010001, 0x0202000101010101, 0x0202000101010200, + 0x0202000101010202, 0x0202000101020001, 0x0202000101020100, 0x0202000102000101, + 0x0202000102010000, 0x0202000102010002, 0x0202000102010102, 0x0202000102010201, + 0x0202000200000002, 0x0202000200000200, 0x0202000200000202, 0x0202000200010000, + 0x0202000200010201, 0x0202000200020002, 0x0202000200020200, 0x0202000200020202, + 0x0202000201000101, 0x0202000201010001, 0x0202000201010102, 0x0202000201010201, + 0x0202000201020101, 0x0202000202000000, 0x0202000202000002, 0x0202000202000200, + 0x0202000202000202, 0x0202000202010101, 0x0202000202020000, 0x0202000202020002, + 0x0202000202020200, 0x0202000202020202, 0x0202010000010201, 0x0202010000020101, + 0x0202010001000001, 0x0202010001000100, 0x0202010001010000, 0x0202010001010100, + 0x0202010001010101, 0x0202010001010200, 0x0202010001010202, 0x0202010001020001, + 0x0202010001020101, 0x0202010001020102, 0x0202010001020200, 0x0202010001020201, + 0x0202010002000101, 0x0202010100000102, 0x0202010100000201, 0x0202010100010000, + 0x0202010100010002, 0x0202010100010101, 0x0202010100010200, 0x0202010100020102, + 0x0202010100020201, 0x0202010101000002, 0x0202010101000101, 0x0202010101010001, + 0x0202010101010100, 0x0202010101010101, 0x0202010101010102, 0x0202010101010201, + 0x0202010101020101, 0x0202010101020202, 0x0202010102000001, 0x0202010102000100, + 0x0202010102000101, 0x0202010102000102, 0x0202010102000201, 0x0202010102010002, + 0x0202010102010101, 0x0202010102010200, 0x0202010200000101, 0x0202010200010001, + 0x0202010200010102, 0x0202010200010202, 0x0202010200020001, 0x0202010200020101, + 0x0202010201000100, 0x0202010201000102, 0x0202010201000202, 0x0202010201010002, + 0x0202010201010101, 0x0202010201010102, 0x0202010201010200, 0x0202010201020000, + 0x0202010201020002, 0x0202010202000102, 0x0202010202010000, 0x0202010202010101, + 0x0202010202010102, 0x0202010202010201, 0x0202010202020001, 0x0202010202020100, + 0x0202010202020102, 0x0202020000000000, 0x0202020000000002, 0x0202020000000200, + 0x0202020000000202, 0x0202020000020000, 0x0202020000020002, 0x0202020000020200, + 0x0202020000020202, 0x0202020001010001, 0x0202020001010100, 0x0202020001010102, + 0x0202020001010201, 0x0202020002000000, 0x0202020002000002, 0x0202020002000200, + 0x0202020002000202, 0x0202020002010101, 0x0202020002020000, 0x0202020002020002, + 0x0202020002020200, 0x0202020002020202, 0x0202020100000101, 0x0202020100010100, + 0x0202020100010201, 0x0202020100020001, 0x0202020100020101, 0x0202020101000001, + 0x0202020101010000, 0x0202020101010101, 0x0202020101010202, 0x0202020101020001, + 0x0202020101020102, 0x0202020101020201, 0x0202020102010000, 0x0202020102010102, + 0x0202020200000000, 0x0202020200000002, 0x0202020200000200, 0x0202020200000202, + 0x0202020200020000, 0x0202020200020002, 0x0202020200020200, 0x0202020200020202, + 0x0202020201010001, 0x0202020201010100, 0x0202020201010102, 0x0202020202000000, + 0x0202020202000002, 0x0202020202000200, 0x0202020202000202, 0x0202020202010101, + 0x0202020202020000, 0x0202020202020002, 0x0202020202020200, 0x0202020202020202, +}; +#endif + #ifndef HAVE_FANCY_SIMD const uint64_t keven_signs[128] = { 0x0101010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0x010101010101ffff, @@ -2745,6 +3264,92 @@ static void mul_mat_q4_0_r4_q8_1_avx2(int n, const void * vx, size_t bx, const D } } +template <int nrc_y> +static void mul_mat_iq1_s_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + GGML_ASSERT(nrc_x%4 == 0); + Q8<nrc_y, block_q8_1_x4> q8(info); + int nb = n / 32; + GGML_ASSERT(nb%4 == 0); + __m256i qx[4]; + __m256 acc[nrc_y] = {}; + auto m1 = _mm256_set1_epi16(1); + auto ms = _mm_set1_epi16(-32768); + float d8[8*nrc_y]; + union { __m256i vec; uint16_t val[16]; } helper; + struct aux_iq1_s_r4 { + uint8_t qs[16]; + uint64_t qh; + }; + for (int ix= 0; ix < nrc_x; ix += 4) { + auto dptr = (const ggml_half *)((const char *)vx + ix*bx); + auto d1 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)dptr)); + auto x = (const aux_iq1_s_r4 *)(dptr + 4); + for (int ib = 0; ib < nb/4; ++ib) { + for (int iy = 0; iy < nrc_y; ++iy) { + _mm256_storeu_ps(d8 + 8*iy, _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q8.y[iy][ib].d))); + } + for (int k = 0; k < 4; ++k) { + auto idxh = _mm256_set1_epi64x(x[4*ib+k].qh); + auto sas = _mm256_castsi256_si128(idxh); + auto scales4 = _mm_and_si128(_mm_srli_epi16(sas, 12), _mm_set1_epi16(7)); + scales4 = _mm_or_si128(_mm_slli_epi16(scales4, 1), _mm_set1_epi16(1)); + auto signs = _mm_or_si128(_mm_cmpeq_epi16(_mm_and_si128(sas, ms), ms), _mm256_castsi256_si128(m1)); + signs = _mm_add_epi16(_mm_set1_epi16(-8), signs); + auto delta4 = _mm_mul_ps(_mm_set1_ps(0.0625f), _mm_cvtepi32_ps(_mm_cvtepi16_epi32( + _mm_mullo_epi16(scales4, signs)))); + auto delta = _mm256_set_m128(delta4, delta4); + scales4 = _mm_unpacklo_epi16(scales4, scales4); // 0,0, 1,1, 2,2, 3,3 + auto scales = MM256_SET_M128I(scales4, scales4); + auto idxl = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)x[4*ib+k].qs)); + idxh = _mm256_sllv_epi64(idxh, _mm256_set_epi64x(0, 2, 5, 8)); + idxh = _mm256_srlv_epi64(idxh, _mm256_set_epi64x(1, 0, 0, 0)); + helper.vec = _mm256_or_si256(idxl, _mm256_and_si256(_mm256_set1_epi16(0x0700), idxh)); + qx[0] = _mm256_set_epi64x(iq1s_grid_us[helper.val[ 9]], iq1s_grid_us[helper.val[ 8]], + iq1s_grid_us[helper.val[ 1]], iq1s_grid_us[helper.val[ 0]]); + qx[1] = _mm256_set_epi64x(iq1s_grid_us[helper.val[13]], iq1s_grid_us[helper.val[12]], + iq1s_grid_us[helper.val[ 5]], iq1s_grid_us[helper.val[ 4]]); + qx[2] = _mm256_set_epi64x(iq1s_grid_us[helper.val[11]], iq1s_grid_us[helper.val[10]], + iq1s_grid_us[helper.val[ 3]], iq1s_grid_us[helper.val[ 2]]); + qx[3] = _mm256_set_epi64x(iq1s_grid_us[helper.val[15]], iq1s_grid_us[helper.val[14]], + iq1s_grid_us[helper.val[ 7]], iq1s_grid_us[helper.val[ 6]]); + for (int iy = 0; iy < nrc_y; ++iy) { + auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ib].qs + k); +#ifdef HAVE_FANCY_SIMD + // 0,0, 1,1, 0,0, 1,1 as int32_t + auto sumi1 = _mm256_dpbusd_epi32(_mm256_dpbusd_epi32(_mm256_setzero_si256(), + qx[0], _mm256_shuffle_epi32(y, 0x44)), qx[1], _mm256_shuffle_epi32(y, 0xee)); + // 2,2, 3,3, 2,2, 3,3 as int32_t + auto sumi2 = _mm256_dpbusd_epi32(_mm256_dpbusd_epi32(_mm256_setzero_si256(), + qx[2], _mm256_shuffle_epi32(y, 0x44)), qx[3], _mm256_shuffle_epi32(y, 0xee)); + auto sumi = _mm256_packs_epi32(sumi1, sumi2); +#else + // 4 x row 0, 4 x row 1, 4 x row 0, 4 x row 1 + auto sumi1 = _mm256_add_epi16(_mm256_maddubs_epi16(qx[0], _mm256_shuffle_epi32(y, 0x44)), + _mm256_maddubs_epi16(qx[1], _mm256_shuffle_epi32(y, 0xee))); + // 4 x row 2, 4 x row 3, 4 x row 2, 4 x row 3 + auto sumi2 = _mm256_add_epi16(_mm256_maddubs_epi16(qx[2], _mm256_shuffle_epi32(y, 0x44)), + _mm256_maddubs_epi16(qx[3], _mm256_shuffle_epi32(y, 0xee))); + // 0,0, 1,1, 0,0, 1,1 as int32_t + sumi1 = _mm256_madd_epi16(m1, sumi1); + // 2,2, 3,3, 2,2, 3,3 as int32_t + sumi2 = _mm256_madd_epi16(m1, sumi2); + // 0,0, 1,1, 2,2, 3,3, 0,0, 1,1, 2,2, 3,3 as int16_t + auto sumi = _mm256_packs_epi32(sumi1, sumi2); +#endif + sumi = _mm256_madd_epi16(scales, sumi); + acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d8[8*iy+k+0]), _mm256_cvtepi32_ps(sumi), acc[iy]); + acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d8[8*iy+k+4]), delta, acc[iy]); + } + } + } + for (int iy = 0; iy < nrc_y; ++iy) { + auto sumf = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1)); + info.store(ix, iy, _mm_mul_ps(d1, sumf)); + acc[iy] = _mm256_setzero_ps(); + } + } +} + #ifdef HAVE_FANCY_SIMD template <int nrc_y> static void mul_mat_q4_0_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { @@ -7042,14 +7647,14 @@ struct Q8_0_x4_Unpacker_512 { auto scales = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)x[i].d)); for (int j = 0; j < 4; ++j) { qx[j] = _mm256_loadu_si256((const __m256i *)x[i].qs + j); - qx[j] = _mm256_xor_si256(qx[j], _mm256_set1_epi8(0x80)); + qx[j] = _mm256_xor_si256(qx[j], _mm256_set1_epi8(-128)); } return _mm256_set_m128(_mm_mul_ps(scales, min), scales); } inline auto set_block(int i) { auto q8 = (const block_q8_0 *)(x + i); qx[0] = _mm256_loadu_si256((const __m256i *)q8->qs); - qx[0] = _mm256_xor_si256(qx[0], _mm256_set1_epi8(0x80)); + qx[0] = _mm256_xor_si256(qx[0], _mm256_set1_epi8(-128)); float d = GGML_FP16_TO_FP32(q8->d); return std::make_pair(d, -128.f*d); } @@ -8202,6 +8807,21 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) { mm.funcs[7] = mul_mat_q8_0_r4_q8_1<8>; expected_typeB = GGML_TYPE_Q8_1_X4; break; + case GGML_TYPE_IQ1_S_R4: + assert (ne00 % QK4_NL == 0); + mm.funcs[0] = mul_mat_iq1_s_r4_q8_1<1>; + mm.funcs[1] = mul_mat_iq1_s_r4_q8_1<2>; + mm.funcs[2] = mul_mat_iq1_s_r4_q8_1<3>; + mm.funcs[3] = mul_mat_iq1_s_r4_q8_1<4>; + mm.funcs[4] = mul_mat_iq1_s_r4_q8_1<5>; + mm.funcs[5] = mul_mat_iq1_s_r4_q8_1<6>; + mm.funcs[6] = mul_mat_iq1_s_r4_q8_1<7>; + mm.funcs[7] = mul_mat_iq1_s_r4_q8_1<8>; +#ifdef HAVE_FANCY_SIMD + mm.func16 = mul_mat_iq1_s_r4_q8_1<16>; +#endif + expected_typeB = GGML_TYPE_Q8_1_X4; + break; default: return false; @@ -11079,6 +11699,78 @@ static void mul_mat_iq2_xs_r4_q8_k(int n, const void * vx, size_t bx, const Data } template <int nrc_y> +static void mul_mat_iq1_s_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + GGML_ASSERT(nrc_x%4 == 0); + Q8<nrc_y, block_q8_1_x4> q8(info); + int nb = n / 32; + GGML_ASSERT(nb%4 == 0); + int8x16_t qx[8]; + int32x4_t acc[nrc_y] = {}; + auto ms = vdup_n_u16(0x8000); + float d8[8*nrc_y]; + for (int ix= 0; ix < nrc_x; ix += 4) { + auto dptr = (const ggml_half *)((const char *)vx + ix*bx); + auto d1 = vcvt_f32_f16(vld1_f16((const float16_t *)dptr)); + auto x = (const block_iq1_s_r4 *)(dptr + 4); + for (int ib = 0; ib < nb/4; ++ib) { + for (int iy = 0; iy < nrc_y; ++iy) { + auto scales = vld1q_f16((const float16_t *)q8.y[iy][ib].d); + vst1q_f32(d8+8*iy+0, vcvt_f32_f16(vget_low_f16(scales))); + vst1q_f32(d8+8*iy+4, vcvt_f32_f16(vget_high_f16(scales))); + } + for (int k = 0; k < 4; ++k) { + auto sas = vld1_u16(x[4*ib+k].qh); + auto scales4 = vand_u16(vshr_n_u16(sas, 12), vdup_n_u16(7)); + scales4 = vorr_u16(vshl_n_u16(scales4, 1), vdup_n_u16(1)); + auto signs = vorr_u16(vceq_u16(vand_u16(sas, ms), ms), vdup_n_u16(1)); + auto delta4 = vmulq_f32(vdupq_n_f32(IQ1S_DELTA), vcvtq_f32_s32(vmull_s16(signs, scales4))); + qx[0] = vreinterpretq_s8_u64(uint64x2_t{iq1s_grid[x[4*ib+k].qs[ 0] | ((x[4*ib+k].qh[0] << 8) & 0x0700)], + iq1s_grid[x[4*ib+k].qs[ 1] | ((x[4*ib+k].qh[1] << 8) & 0x0700)]}); + qx[1] = vreinterpretq_s8_u64(uint64x2_t{iq1s_grid[x[4*ib+k].qs[ 8] | ((x[4*ib+k].qh[0] << 2) & 0x0700)], + iq1s_grid[x[4*ib+k].qs[ 9] | ((x[4*ib+k].qh[1] << 2) & 0x0700)]}); + qx[2] = vreinterpretq_s8_u64(uint64x2_t{iq1s_grid[x[4*ib+k].qs[ 4] | ((x[4*ib+k].qh[0] << 5) & 0x0700)], + iq1s_grid[x[4*ib+k].qs[ 5] | ((x[4*ib+k].qh[1] << 5) & 0x0700)]}); + qx[3] = vreinterpretq_s8_u64(uint64x2_t{iq1s_grid[x[4*ib+k].qs[12] | ((x[4*ib+k].qh[0] >> 1) & 0x0700)], + iq1s_grid[x[4*ib+k].qs[13] | ((x[4*ib+k].qh[1] >> 1) & 0x0700)]}); + qx[4] = vreinterpretq_s8_u64(uint64x2_t{iq1s_grid[x[4*ib+k].qs[ 2] | ((x[4*ib+k].qh[2] << 8) & 0x0700)], + iq1s_grid[x[4*ib+k].qs[ 3] | ((x[4*ib+k].qh[3] << 8) & 0x0700)]}); + qx[5] = vreinterpretq_s8_u64(uint64x2_t{iq1s_grid[x[4*ib+k].qs[10] | ((x[4*ib+k].qh[2] << 2) & 0x0700)], + iq1s_grid[x[4*ib+k].qs[11] | ((x[4*ib+k].qh[3] << 2) & 0x0700)]}); + qx[6] = vreinterpretq_s8_u64(uint64x2_t{iq1s_grid[x[4*ib+k].qs[ 6] | ((x[4*ib+k].qh[2] << 5) & 0x0700)], + iq1s_grid[x[4*ib+k].qs[ 7] | ((x[4*ib+k].qh[3] << 5) & 0x0700)]}); + qx[7] = vreinterpretq_s8_u64(uint64x2_t{iq1s_grid[x[4*ib+k].qs[14] | ((x[4*ib+k].qh[2] >> 1) & 0x0700)], + iq1s_grid[x[4*ib+k].qs[15] | ((x[4*ib+k].qh[3] >> 1) & 0x0700)]}); + auto scales = vmovl_u16(scales4); + for (int iy = 0; iy < nrc_y; ++iy) { + auto sumi1 = vdupq_n_s32(0); + auto sumi2 = vdupq_n_s32(0); + auto y = vld1_s8_x4(q8.y[iy][ib].qs + 32*k); + auto y1 = vcombine_s8(y.val[0], y.val[0]); + auto y2 = vcombine_s8(y.val[1], y.val[1]); + sumi1 = ggml_vdotq_s32(sumi1, qx[0], y1); + sumi2 = ggml_vdotq_s32(sumi2, qx[4], y1); + sumi1 = ggml_vdotq_s32(sumi1, qx[2], y2); + sumi2 = ggml_vdotq_s32(sumi2, qx[6], y2); + y1 = vcombine_s8(y.val[2], y.val[2]); + y2 = vcombine_s8(y.val[3], y.val[3]); + sumi1 = ggml_vdotq_s32(sumi1, qx[1], y1); + sumi2 = ggml_vdotq_s32(sumi2, qx[5], y1); + sumi1 = ggml_vdotq_s32(sumi1, qx[3], y2); + sumi2 = ggml_vdotq_s32(sumi2, qx[7], y2); + auto sumi = vmulq_s32(scales, vpaddq_s32(sumi1, sumi2)); + acc[iy] = vfmaq_f32(acc[iy], vdupq_n_f32(d8[8*iy+k+0]), vcvtq_f32_s32(sumi)); + acc[iy] = vfmaq_f32(acc[iy], vdupq_n_f32(d8[8*iy+k+4]), delta4); + } + } + } + for (int iy = 0; iy < nrc_y; ++iy) { + info.store(ix, iy, vmulq_f32(d1, acc[iy])); + acc[iy] = vdupq_n_f32(0.f); + } + } +} + +template <int nrc_y> static void mul_mat_iq2_s_r4_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { GGML_ASSERT(nrc_x%4 == 0); Q8<nrc_y, block_q8_K> q8(info); @@ -12697,6 +13389,11 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& m, int /*Ny*/) { m.func16 = mul_mat_iq2_s_r4_q8_k<16>; expected_Btype = GGML_TYPE_Q8_K; break; + case GGML_TYPE_IQ1_S_R4: + SET_MUL_MAT_FUNCTIONS(m, mul_mat_iq1_s_r4_q8_1); + m.func16 = mul_mat_iq1_s_r4_q8_1<16>; + expected_Btype = GGML_TYPE_Q8_1_X4; + break; case GGML_TYPE_IQ3_XXS_R4: SET_MUL_MAT_FUNCTIONS(m, mul_mat_iq3_xxs_r4_q8_k); m.func16 = mul_mat_iq3_xxs_r4_q8_k<16>; @@ -12995,7 +13692,7 @@ struct F16 { using Data = float16x8_t; constexpr static int block_size = 8; //constexpr static int num_registers = 32; - constexpr static int q_step = 8; + //constexpr static int q_step = 8; static inline Data zero() { return vdupq_n_f16(0); } static inline Data load(const char * ptr, int i) { return vld1q_f16((const float16_t *)ptr + block_size*i); } static inline Data load(const float16_t * ptr, int i) { return vld1q_f16(ptr + block_size*i); } diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index c1e7771f..a8553b43 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -6087,6 +6087,112 @@ void vec_dot_iq3_s_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t GGML_UNUSED(by); } +void quantize_row_iq1_s_r4_ref(const float * x, block_iq1_s_r4 * y, int64_t k) { + quantize_iq1_s_r4(x, y, 4, k/4, nullptr); +} + +void quantize_row_iq1_s_r4(const float * x, void * y, int64_t k) { + quantize_iq1_s_r4(x, y, 4, k/4, nullptr); +} + +size_t quantize_iq1_s_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { + constexpr int kBlockSize = 32; + GGML_ASSERT(nrows%4 == 0); + GGML_ASSERT(n_per_row%kBlockSize == 0); + int nblock = n_per_row/kBlockSize; + float weight[kBlockSize]; + int8_t L[kBlockSize]; + float pairs[2*kBlockSize]; + float sumx[kBlockSize+1], sumw[kBlockSize+1]; + float max[4]; + uint16_t index[4]; + int shift; + float invd[4]; + std::vector<float> scales(4*nblock); + auto row_size = ggml_row_size(GGML_TYPE_IQ1_S_R4, n_per_row); + char * cy = (char *)dst; + for (int row = 0; row < nrows; row += 4) { + ggml_half * dptr = (ggml_half *)cy; + auto y = (block_iq1_s_r4 *)(dptr + 4); + for (int k = 0; k < 4; ++k) max[k] = 0; + for (int ibl = 0; ibl < nblock; ++ibl) { + if (imatrix) { + for (int j = 0; j < kBlockSize; ++j) weight[j] = imatrix[kBlockSize*ibl + j]; + } + for (int k = 0; k < 4; ++k) { + auto xb = src + k*n_per_row + kBlockSize*ibl; + float sumx2 = 0; + for (int j = 0; j < kBlockSize; ++j) sumx2 += xb[j]*xb[j]; + float sigma2 = 1.5f*sumx2/kBlockSize; + if (imatrix) { + for (int j = 0; j < kBlockSize; ++j) weight[j] = imatrix[kBlockSize*ibl + j]*sqrt(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < kBlockSize; ++j) weight[j] = sqrt(sigma2 + xb[j]*xb[j]); + } + iq1s_process_1block(kBlockSize, xb, weight, L, scales.data() + 4*ibl + k, index, &shift, pairs, sumx, sumw); + max[k] = std::max(max[k], scales[4*ibl+k]); + uint16_t h = 0; + for (int i = 0; i < 4; ++i) { + y[ibl].qs[4*i + k] = index[i] & 255; + h |= (index[i] >> 8) << 3*i; + } + if (shift < 0) h |= 0x8000; + y[ibl].qh[k] = h; + } + } + for (int k = 0; k < 4; ++k) { + dptr[k] = GGML_FP32_TO_FP16(1.0625f*max[k]/15);; + invd[k] = max[k] ? 15/max[k] : 0.f; + } + for (int ibl = 0; ibl < nblock; ++ibl) { + for (int k = 0; k < 4; ++k) { + int ls = nearest_int(0.5f*(scales[4*ibl+k]*invd[k] - 1)); + ls = std::max(0, std::min(7, ls)); + y[ibl].qh[k] |= (ls << 12); + } + } + cy += 4*row_size; + src += 4*n_per_row; + } + return nrows*row_size; +} + +void dequantize_row_iq1_s_r4(const block_iq1_s_r4 * x, float * y, int64_t n) { + auto dptr = (const ggml_half *)x; + x = (const block_iq1_s_r4 *)(dptr + 4); + float d[4]; + for (int k = 0; k < 4; ++k) d[k] = GGML_FP16_TO_FP32(dptr[k]); + int n_per_row = n/4; + GGML_ASSERT(n_per_row%32 == 0); + int nblock = n_per_row/32; + float * yk[4]; + for (int k = 0; k < 4; ++k) yk[k] = y + k*n_per_row; + for (int ib = 0; ib < nblock; ++ib) { + for (int k = 0; k < 4; ++k) { + float shift = x[ib].qh[k] & 0x8000 ? -IQ1S_DELTA : IQ1S_DELTA; + float dl = d[k]*(2*((x[ib].qh[k] >> 12) & 7) + 1); + for (int i = 0; i < 4; ++i) { + auto idx = x[ib].qs[4*i+k] | (((x[ib].qh[k] >> 3*i) & 7) << 8); + auto grid = (const int8_t *)(iq1s_grid + idx); + for (int j = 0; j < 8; ++j) yk[k][32*ib + 8*i + j] = dl*(grid[j] + shift); + } + } + } +} + +void vec_dot_iq1_s_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { +#if GGML_USE_IQK_MULMAT + if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ1_S_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { + return; + } +#endif + GGML_ASSERT(n%QK4_NL == 0); + GGML_ASSERT(nrc == 1); + GGML_UNUSED(bs); + GGML_UNUSED(bx); + GGML_UNUSED(by); +} + //================================================ namespace { diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h index 1a991787..9a3c5dc6 100644 --- a/ggml/src/iqk/iqk_quantize.h +++ b/ggml/src/iqk/iqk_quantize.h @@ -199,6 +199,12 @@ size_t quantize_iq3_s_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT d void dequantize_row_iq3_s_r4(const block_iq3_s_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq3_s_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void quantize_row_iq1_s_r4_ref(const float * GGML_RESTRICT x, block_iq1_s_r4 * GGML_RESTRICT y, int64_t k); +void quantize_row_iq1_s_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +size_t quantize_iq1_s_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +void dequantize_row_iq1_s_r4(const block_iq1_s_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void vec_dot_iq1_s_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); + void quantize_row_q8_k_r8_ref(const float * GGML_RESTRICT x, block_q8_k_r8 * GGML_RESTRICT y, int64_t k); void quantize_row_q8_k_r8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); size_t quantize_q8_k_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); diff --git a/include/llama.h b/include/llama.h index c21671c6..0f6d15ac 100644 --- a/include/llama.h +++ b/include/llama.h @@ -192,6 +192,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 = 219, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_XS_R4 = 220, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 = 223, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ1_S_R4 = 224, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_NL_R4 = 225, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_S_R4 = 226, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_M_R4 = 229, // except 1d tensors diff --git a/src/llama.cpp b/src/llama.cpp index 570c056c..943b945a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3954,6 +3954,7 @@ struct llama_model_loader { case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break; case GGML_TYPE_IQ3_XXS_R4: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4; break; case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break; + case GGML_TYPE_IQ1_S_R4:ftype = LLAMA_FTYPE_MOSTLY_IQ1_S_R4;break; case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break; case GGML_TYPE_IQ1_BN: ftype = LLAMA_FTYPE_MOSTLY_IQ1_BN; break; case GGML_TYPE_IQ2_BN: ftype = LLAMA_FTYPE_MOSTLY_IQ2_BN; break; @@ -4688,6 +4689,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4: return "IQ3_XXS_R4 - 3.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ1_S_R4: return "IQ1_S_R4 - 1.5 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_NL_R4:return "IQ4_NL_R4 - 4.5 bpw"; @@ -15966,7 +15968,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K || ftype == LLAMA_FTYPE_MOSTLY_IQ2_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_K_R4 || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M_R4) { + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_M_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4) { new_type = !qs.has_output ? GGML_TYPE_IQ4_K : GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4) { @@ -15987,7 +15990,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else { if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4) { + ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4 || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4) { new_type = GGML_TYPE_Q2_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M_R4) { @@ -16064,6 +16068,41 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_BF16; } } + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4) { + if (name.find("attn_v.weight") != std::string::npos) { + if (qs.model.hparams.n_expert >= 4 || qs.model.hparams.n_gqa() >= 4) new_type = GGML_TYPE_IQ4_K_R4; + else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_K_R4; + else new_type = GGML_TYPE_Q2_K_R4; + ++qs.i_attention_wv; + } + else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k") != std::string::npos) { + new_type = GGML_TYPE_Q4_K_R4; + } + else if (qs.model.hparams.n_expert >= 8 && (name.find("blk.0.ffn_down") != std::string::npos || + name.find("blk.0.ffn_gate") != std::string::npos || + name.find("blk.0.ffn_up") != std::string::npos)) { + new_type = GGML_TYPE_IQ3_K_R4; + } + else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q") != std::string::npos) { + new_type = GGML_TYPE_Q4_K_R4; + } + else if (name.find("attn_qkv.weight") != std::string::npos) { + new_type = GGML_TYPE_IQ2_K_R4; + } + else if (name.find("_shexp.weight") != std::string::npos) { + new_type = GGML_TYPE_IQ4_K_R4; + } + else if (name.find("ffn_down") != std::string::npos) { + auto [i_layer, n_layer] = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); + if (qs.params->ffn_down_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_down_type; + else if (i_layer < n_layer/8) { + new_type = GGML_TYPE_Q2_K_R4; + } + ++qs.i_ffn_down; + } + else if (name.find("attn_output.weight") != std::string::npos) { + new_type = qs.model.hparams.n_expert >= 4 ? GGML_TYPE_Q5_K_R4 : GGML_TYPE_IQ2_K_R4; + } } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4 || @@ -16095,6 +16134,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_Q5_K; } else { if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4) new_type = GGML_TYPE_IQ2_K_R4; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || is_iq2_m) new_type = GGML_TYPE_IQ3_S; } } @@ -16539,6 +16579,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break; case LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4: default_type = GGML_TYPE_IQ3_XXS_R4; break; case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break; + case LLAMA_FTYPE_MOSTLY_IQ1_S_R4:default_type = GGML_TYPE_IQ1_S_R4;break; case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break; case LLAMA_FTYPE_MOSTLY_IQ1_BN: default_type = GGML_TYPE_IQ1_BN; break; case LLAMA_FTYPE_MOSTLY_IQ2_BN: default_type = GGML_TYPE_IQ2_BN; break; @@ -16892,6 +16933,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_type == GGML_TYPE_IQ2_S || new_type == GGML_TYPE_IQ2_S_R4|| new_type == GGML_TYPE_IQ1_S || + new_type == GGML_TYPE_IQ1_S_R4|| (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) || (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0))) { LLAMA_LOG_ERROR("\n\n============================================================\n"); @@ -17011,6 +17053,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ3_S; else chunk_size_multiplier = 4; } + else if (new_type == GGML_TYPE_IQ1_S_R4) { + if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ1_S; + else chunk_size_multiplier = 4; + } else if (new_type == GGML_TYPE_BF16_R16) { if (tensor->ne[1] % 16 != 0) new_type = GGML_TYPE_BF16; else chunk_size_multiplier = 16; |