-rw-r--r--  examples/quantize/quantize.cpp |   2
-rw-r--r--  ggml/include/ggml.h            |   2
-rw-r--r--  ggml/src/ggml-common.h         |   8
-rw-r--r--  ggml/src/ggml-quants.c         | 403
-rw-r--r--  ggml/src/ggml-quants.h         |   4
-rw-r--r--  ggml/src/ggml.c                |  27
-rw-r--r--  ggml/src/iqk/iqk_mul_mat.cpp   | 197
-rw-r--r--  ggml/src/iqk/iqk_quantize.cpp  | 117
-rw-r--r--  ggml/src/iqk/iqk_quantize.h    |   6
-rw-r--r--  include/llama.h                |   1
-rw-r--r--  src/llama.cpp                  |  16
11 files changed, 553 insertions(+), 230 deletions(-)
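This patch adds IQ1_M_R4, a 4-row-interleaved variant of IQ1_M: each block_iq1_m_r4 packs 32 quants from each of 4 consecutive rows into 28 bytes (16 bytes of low grid-index bits, 8 bytes of high index bits plus per-group shift flags, 4 bytes of 4-bit block scales), and every row additionally carries one fp16 super-scale (row_meta_size = 2 in the type traits). A minimal sketch, not part of the patch, that checks the bits-per-weight arithmetic implied by those sizes:

#include <stdio.h>

int main(void) {
    /* sizes taken from the static_assert and type traits in the diff below:
       sizeof(block_iq1_m_r4) == 28, blck_size == 32, 4 interleaved rows */
    const int block_bytes      = 28;
    const int quants_per_block = 4 * 32;               /* 128 quants */
    double bpw = 8.0 * block_bytes / quants_per_block; /* bits per weight */
    printf("payload: %.2f bpw\n", bpw);                /* 1.75, same as IQ1_M */
    /* the fp16 row scale adds 16/n_per_row bpw on top, negligible
       for typical row sizes */
    return 0;
}

Note that the type traits entry declares type_size = sizeof(block_iq1_m_r4)/4, i.e. 7 bytes per 32-quant block, because ggml accounts type_size per individual row even though 4 rows share each interleaved block.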
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 1c847e6b..7bdd8597 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -30,6 +30,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = { { "IQ2_M_R4", LLAMA_FTYPE_MOSTLY_IQ2_M_R4, " 2.7 bpw quantization", }, { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", }, { "IQ1_S_R4", LLAMA_FTYPE_MOSTLY_IQ1_S_R4, " 1.5 bpw quantization", }, + { "IQ1_M_R4", LLAMA_FTYPE_MOSTLY_IQ1_M_R4, " 1.75 bpw quantization", }, { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, { "IQ1_BN", LLAMA_FTYPE_MOSTLY_IQ1_BN, " 1.62 bpw quantization (Bitnet)", }, { "IQ2_BN", LLAMA_FTYPE_MOSTLY_IQ2_BN, " 2.00 bpw quantization (Bitnet)", }, @@ -512,6 +513,7 @@ int main(int argc, char ** argv) { params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4 || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4 || + params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M_R4 || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M)) { fprintf(stderr, "\n==========================================================================================================\n"); fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n"); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 9668dc32..77ac33a9 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -432,6 +432,7 @@ extern "C" { GGML_TYPE_IQ3_S_R4 = 221, GGML_TYPE_IQ2_S_R4 = 222, GGML_TYPE_IQ4_XS_R4 = 223, + GGML_TYPE_IQ1_M_R4 = 229, GGML_TYPE_BF16_R16 = 230, GGML_TYPE_Q6_0_R4 = 233, GGML_TYPE_IQ2_BN_R4 = 335, @@ -516,6 +517,7 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ3_S_R4 = 220, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_S_R4 = 221, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_XS_R4 = 222, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ1_M_R4 = 223, // except 1d tensors GGML_FTYPE_MOSTLY_BF16_R16 = 224, // except 1d tensors GGML_FTYPE_MOSTLY_Q6_0_R4 = 227, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_BN_R4 = 329, // except 1d tensors diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 14813161..679353be 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -499,6 +499,14 @@ typedef struct { } block_iq1_m; static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding"); +// 1.75 bpw - blocks of 32 with 4 interleaved rows = 128 quants +typedef struct { + uint8_t qs[16]; // grid index, low 8 bits + uint8_t qh[ 8]; // grid index, high 3 bits + grid shift bits (for two groups of 8) + uint8_t scales[4]; // 4-bit block scales +} block_iq1_m_r4; +static_assert(sizeof(block_iq1_m_r4) == 28, "wrong iq1_m_r4 block size/padding"); + // // Bitnet and TriLM - implemented as 1.625 bpw // diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 3c4711f3..d32a583f 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -14145,85 +14145,6 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy int best_shift; iq1s_process_1block(block_size, xb, weight, L, &scales[ib], index, &best_shift, pairs, sumx, sumw); -// float max = fabsf(xb[0]); -// for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i])); -// if (max < GROUP_MAX_EPS_IQ1_S) { -// scales[ib] = 0; -// memset(L, 1, block_size); -// continue; -// } -// // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem. 
-// // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two -// // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights -// // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and -// // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale -// // for each possible and score for each split. -// for (int j = 0; j < block_size; ++j) { -// pairs[2*j] = xb[j]; -// idx[2*j] = j; -// } -// qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper); -// { -// sumx[0] = sumw[0] = 0; -// for (int j = 0; j < block_size; ++j) { -// int i = idx[2*j]; -// sumx[j+1] = sumx[j] + weight[i]*xb[i]; -// sumw[j+1] = sumw[j] + weight[i]; -// } -// } -// float best_score = -FLT_MIN, scale = max; -// int besti1 = -1, besti2 = -1, best_shift = 0; -// for (int i1 = 0; i1 <= block_size; ++i1) { -// for (int i2 = i1; i2 <= block_size; ++i2) { -// float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2]; -// float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2]; -// if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { -// scale = sumqx/sumq2; best_score = scale*sumqx; -// besti1 = i1; besti2 = i2; best_shift = 1; -// } -// sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2]; -// sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2]; -// if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { -// scale = sumqx/sumq2; best_score = scale*sumqx; -// besti1 = i1; besti2 = i2; best_shift = -1; -// } -// } -// } -// GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0); -// for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0; -// for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1; -// for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2; -// if (scale < 0) { -// for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j]; -// scale = -scale; best_shift = -best_shift; -// } -// bool all_on_grid = true; -// const float * xx = best_shift == 1 ? 
x_p : x_m; -// for (int k = 0; k < block_size/8; ++k) { -// uint16_t u = 0; -// for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j); -// int grid_index = kmap_q2xs[u]; -// if (grid_index < 0) { -// all_on_grid = false; -// const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; -// grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S); -// GGML_ASSERT(grid_index >= 0); -// } -// index[k] = grid_index; -// } -// if (!all_on_grid) { -// float sumqx = 0, sumq2 = 0; -// for (int k = 0; k < block_size/8; ++k) { -// const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]); -// for (int j = 0; j < 8; ++j) { -// float w = weight[8*k + j]; -// float q = xx[(pg[j] - 1)/2]; -// sumqx += w*q*xb[8*k+j]; -// sumq2 += w*q*q; -// } -// } -// if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2; -// } uint16_t h = 0; for (int k = 0; k < block_size/8; ++k) { y[ibl].qs[(block_size/8)*ib + k] = index[k] & 255; @@ -14232,10 +14153,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy y[ibl].qh[ib] = h; GGML_ASSERT(scales[ib] >= 0); max_scale = MAX(max_scale, scales[ib]); - //GGML_ASSERT(scale >= 0); - //scales[ib] = scale; shifts[ib] = best_shift; - //max_scale = MAX(max_scale, scale); } if (!max_scale) { @@ -14287,6 +14205,166 @@ void quantize_row_iq1_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, quantize_row_iq1_s_ref(x, (block_iq1_s *)y, k); } +void iq1m_process_1block(const float * xb, const float * weight, int8_t * L, float * the_scale, uint16_t * the_index, int * the_shift, + float * pairs) { + + const int block_size = IQ1M_BLOCK_SIZE; + + const float x_p[3] = {-1 + IQ1M_DELTA, IQ1M_DELTA, 1 + IQ1M_DELTA}; + const float x_m[3] = {-1 - IQ1M_DELTA, -IQ1M_DELTA, 1 - IQ1M_DELTA}; + + float sumqx[4], sumq2[4]; + + const int gindex = iq2_data_index(GGML_TYPE_IQ1_M); + + const uint64_t * kgrid_q2xs = iq2_data[gindex].grid; + const int * kmap_q2xs = iq2_data[gindex].map; + const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours; + + GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?"); + + // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem. + // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two + // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights + // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and + // Wi = sum[weight[j], j = 0...i], and use these to quickly get the optimum scale + // for each possible split and the corresponding score. 
+ int * idx = (int *)(pairs + 1); + for (int j = 0; j < block_size; ++j) { + pairs[2*j] = xb[j]; + idx[2*j] = j; + } + qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper); + float best_score = -FLT_MIN, scale = 0.f; + int besti1 = -1, besti2 = -1, best_k = -1; + // 0: +, + + // 1: +, - + // 2: -, + + // 3: -, - + for (int i1 = 0; i1 <= block_size; ++i1) { + for (int i2 = i1; i2 <= block_size; ++i2) { + memset(sumqx, 0, 4*sizeof(float)); + memset(sumq2, 0, 4*sizeof(float)); + for (int j = 0; j < i1; ++j) { + int i = idx[2*j]; + if (i < block_size/2) { + sumqx[0] += weight[i]*x_p[0]*xb[i]; + sumqx[1] += weight[i]*x_p[0]*xb[i]; + sumqx[2] += weight[i]*x_m[0]*xb[i]; + sumqx[3] += weight[i]*x_m[0]*xb[i]; + sumq2[0] += weight[i]*x_p[0]*x_p[0]; + sumq2[1] += weight[i]*x_p[0]*x_p[0]; + sumq2[2] += weight[i]*x_m[0]*x_m[0]; + sumq2[3] += weight[i]*x_m[0]*x_m[0]; + } else { + sumqx[0] += weight[i]*x_p[0]*xb[i]; + sumqx[2] += weight[i]*x_p[0]*xb[i]; + sumqx[1] += weight[i]*x_m[0]*xb[i]; + sumqx[3] += weight[i]*x_m[0]*xb[i]; + sumq2[0] += weight[i]*x_p[0]*x_p[0]; + sumq2[2] += weight[i]*x_p[0]*x_p[0]; + sumq2[1] += weight[i]*x_m[0]*x_m[0]; + sumq2[3] += weight[i]*x_m[0]*x_m[0]; + } + } + for (int j = i1; j < i2; ++j) { + int i = idx[2*j]; + if (i < block_size/2) { + sumqx[0] += weight[i]*x_p[1]*xb[i]; + sumqx[1] += weight[i]*x_p[1]*xb[i]; + sumqx[2] += weight[i]*x_m[1]*xb[i]; + sumqx[3] += weight[i]*x_m[1]*xb[i]; + sumq2[0] += weight[i]*x_p[1]*x_p[1]; + sumq2[1] += weight[i]*x_p[1]*x_p[1]; + sumq2[2] += weight[i]*x_m[1]*x_m[1]; + sumq2[3] += weight[i]*x_m[1]*x_m[1]; + } else { + sumqx[0] += weight[i]*x_p[1]*xb[i]; + sumqx[2] += weight[i]*x_p[1]*xb[i]; + sumqx[1] += weight[i]*x_m[1]*xb[i]; + sumqx[3] += weight[i]*x_m[1]*xb[i]; + sumq2[0] += weight[i]*x_p[1]*x_p[1]; + sumq2[2] += weight[i]*x_p[1]*x_p[1]; + sumq2[1] += weight[i]*x_m[1]*x_m[1]; + sumq2[3] += weight[i]*x_m[1]*x_m[1]; + } + } + for (int j = i2; j < block_size; ++j) { + int i = idx[2*j]; + if (i < block_size/2) { + sumqx[0] += weight[i]*x_p[2]*xb[i]; + sumqx[1] += weight[i]*x_p[2]*xb[i]; + sumqx[2] += weight[i]*x_m[2]*xb[i]; + sumqx[3] += weight[i]*x_m[2]*xb[i]; + sumq2[0] += weight[i]*x_p[2]*x_p[2]; + sumq2[1] += weight[i]*x_p[2]*x_p[2]; + sumq2[2] += weight[i]*x_m[2]*x_m[2]; + sumq2[3] += weight[i]*x_m[2]*x_m[2]; + } else { + sumqx[0] += weight[i]*x_p[2]*xb[i]; + sumqx[2] += weight[i]*x_p[2]*xb[i]; + sumqx[1] += weight[i]*x_m[2]*xb[i]; + sumqx[3] += weight[i]*x_m[2]*xb[i]; + sumq2[0] += weight[i]*x_p[2]*x_p[2]; + sumq2[2] += weight[i]*x_p[2]*x_p[2]; + sumq2[1] += weight[i]*x_m[2]*x_m[2]; + sumq2[3] += weight[i]*x_m[2]*x_m[2]; + } + } + for (int k = 0; k < 4; ++k) { + if (sumq2[k] > 0 && sumqx[k]*sumqx[k] > best_score*sumq2[k]) { + scale = sumqx[k]/sumq2[k]; best_score = scale*sumqx[k]; + besti1 = i1; besti2 = i2; best_k = k; + } + } + } + } + GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_k >= 0); + for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0; + for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1; + for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2; + if (scale < 0) { + for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j]; + scale = -scale; + best_k = 3 - best_k; + } + bool all_on_grid = true; + const float * xx; + for (int k = 0; k < block_size/8; ++k) { + if (k == 0) xx = best_k < 2 ? x_p : x_m; + else xx = best_k%2 == 0 ? 
x_p : x_m; + uint16_t u = 0; + for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j); + int grid_index = kmap_q2xs[u]; + if (grid_index < 0) { + all_on_grid = false; + const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; + grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S); + GGML_ASSERT(grid_index >= 0); + } + the_index[k] = grid_index; + } + if (!all_on_grid) { + float sumqx_f = 0, sumq2_f = 0; + for (int k = 0; k < block_size/8; ++k) { + if (k == 0) xx = best_k < 2 ? x_p : x_m; + else xx = best_k%2 == 0 ? x_p : x_m; + const int8_t * pg = (const int8_t *)(kgrid_q2xs + the_index[k]); + for (int j = 0; j < 8; ++j) { + float w = weight[8*k + j]; + float q = xx[(pg[j] - 1)/2]; + sumqx_f += w*q*xb[8*k+j]; + sumq2_f += w*q*q; + } + } + if (sumqx_f > 0 && sumq2_f > 0) scale = sumqx_f/sumq2_f; + } + *the_scale = scale; + *the_shift = best_k; +} + static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights, float * scales, float * weight, @@ -14301,7 +14379,6 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy const int * kmap_q2xs = iq2_data[gindex].map; const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours; - //GGML_ASSERT(quant_weights && "missing quantization weights"); GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?"); GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?"); GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?"); @@ -14317,10 +14394,6 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy const float x_m[3] = {-1 - IQ1M_DELTA, -IQ1M_DELTA, 1 - IQ1M_DELTA}; const uint8_t masks[4] = {0x00, 0x80, 0x08, 0x88}; - int * idx = (int *)(pairs + 1); - - float sumqx[4], sumq2[4]; - iq1m_scale_t s; const float * xx; @@ -14351,147 +14424,15 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy memset(L, 1, block_size); continue; } - // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem. - // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two - // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights - // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and - // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale - // for each possible and score for each split. 
- for (int j = 0; j < block_size; ++j) { - pairs[2*j] = xb[j]; - idx[2*j] = j; - } - qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper); - float best_score = -FLT_MIN, scale = max; - int besti1 = -1, besti2 = -1, best_k = -1; - // 0: +, + - // 1: +, - - // 2: -, + - // 3: -, - - for (int i1 = 0; i1 <= block_size; ++i1) { - for (int i2 = i1; i2 <= block_size; ++i2) { - memset(sumqx, 0, 4*sizeof(float)); - memset(sumq2, 0, 4*sizeof(float)); - for (int j = 0; j < i1; ++j) { - int i = idx[2*j]; - if (i < block_size/2) { - sumqx[0] += weight[i]*x_p[0]*xb[i]; - sumqx[1] += weight[i]*x_p[0]*xb[i]; - sumqx[2] += weight[i]*x_m[0]*xb[i]; - sumqx[3] += weight[i]*x_m[0]*xb[i]; - sumq2[0] += weight[i]*x_p[0]*x_p[0]; - sumq2[1] += weight[i]*x_p[0]*x_p[0]; - sumq2[2] += weight[i]*x_m[0]*x_m[0]; - sumq2[3] += weight[i]*x_m[0]*x_m[0]; - } else { - sumqx[0] += weight[i]*x_p[0]*xb[i]; - sumqx[2] += weight[i]*x_p[0]*xb[i]; - sumqx[1] += weight[i]*x_m[0]*xb[i]; - sumqx[3] += weight[i]*x_m[0]*xb[i]; - sumq2[0] += weight[i]*x_p[0]*x_p[0]; - sumq2[2] += weight[i]*x_p[0]*x_p[0]; - sumq2[1] += weight[i]*x_m[0]*x_m[0]; - sumq2[3] += weight[i]*x_m[0]*x_m[0]; - } - } - for (int j = i1; j < i2; ++j) { - int i = idx[2*j]; - if (i < block_size/2) { - sumqx[0] += weight[i]*x_p[1]*xb[i]; - sumqx[1] += weight[i]*x_p[1]*xb[i]; - sumqx[2] += weight[i]*x_m[1]*xb[i]; - sumqx[3] += weight[i]*x_m[1]*xb[i]; - sumq2[0] += weight[i]*x_p[1]*x_p[1]; - sumq2[1] += weight[i]*x_p[1]*x_p[1]; - sumq2[2] += weight[i]*x_m[1]*x_m[1]; - sumq2[3] += weight[i]*x_m[1]*x_m[1]; - } else { - sumqx[0] += weight[i]*x_p[1]*xb[i]; - sumqx[2] += weight[i]*x_p[1]*xb[i]; - sumqx[1] += weight[i]*x_m[1]*xb[i]; - sumqx[3] += weight[i]*x_m[1]*xb[i]; - sumq2[0] += weight[i]*x_p[1]*x_p[1]; - sumq2[2] += weight[i]*x_p[1]*x_p[1]; - sumq2[1] += weight[i]*x_m[1]*x_m[1]; - sumq2[3] += weight[i]*x_m[1]*x_m[1]; - } - } - for (int j = i2; j < block_size; ++j) { - int i = idx[2*j]; - if (i < block_size/2) { - sumqx[0] += weight[i]*x_p[2]*xb[i]; - sumqx[1] += weight[i]*x_p[2]*xb[i]; - sumqx[2] += weight[i]*x_m[2]*xb[i]; - sumqx[3] += weight[i]*x_m[2]*xb[i]; - sumq2[0] += weight[i]*x_p[2]*x_p[2]; - sumq2[1] += weight[i]*x_p[2]*x_p[2]; - sumq2[2] += weight[i]*x_m[2]*x_m[2]; - sumq2[3] += weight[i]*x_m[2]*x_m[2]; - } else { - sumqx[0] += weight[i]*x_p[2]*xb[i]; - sumqx[2] += weight[i]*x_p[2]*xb[i]; - sumqx[1] += weight[i]*x_m[2]*xb[i]; - sumqx[3] += weight[i]*x_m[2]*xb[i]; - sumq2[0] += weight[i]*x_p[2]*x_p[2]; - sumq2[2] += weight[i]*x_p[2]*x_p[2]; - sumq2[1] += weight[i]*x_m[2]*x_m[2]; - sumq2[3] += weight[i]*x_m[2]*x_m[2]; - } - } - for (int k = 0; k < 4; ++k) { - if (sumq2[k] > 0 && sumqx[k]*sumqx[k] > best_score*sumq2[k]) { - scale = sumqx[k]/sumq2[k]; best_score = scale*sumqx[k]; - besti1 = i1; besti2 = i2; best_k = k; - } - } - } - } - GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_k >= 0); - for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0; - for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1; - for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2; - if (scale < 0) { - for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j]; - scale = -scale; - best_k = best_k == 0 ? 3 : best_k == 1 ? 2 : best_k == 2 ? 1 : 0; - } - bool all_on_grid = true; - for (int k = 0; k < block_size/8; ++k) { - if (k == 0) xx = best_k < 2 ? x_p : x_m; - else xx = best_k%2 == 0 ? 
x_p : x_m; - uint16_t u = 0; - for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j); - int grid_index = kmap_q2xs[u]; - if (grid_index < 0) { - all_on_grid = false; - const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; - grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S); - GGML_ASSERT(grid_index >= 0); - } - index[k] = grid_index; - } - if (!all_on_grid) { - float sumqx_f = 0, sumq2_f = 0; - for (int k = 0; k < block_size/8; ++k) { - if (k == 0) xx = best_k < 2 ? x_p : x_m; - else xx = best_k%2 == 0 ? x_p : x_m; - const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]); - for (int j = 0; j < 8; ++j) { - float w = weight[8*k + j]; - float q = xx[(pg[j] - 1)/2]; - sumqx_f += w*q*xb[8*k+j]; - sumq2_f += w*q*q; - } - } - if (sumqx_f > 0 && sumq2_f > 0) scale = sumqx_f/sumq2_f; - } + + int best_k = -1; + iq1m_process_1block(xb, weight, L, &scales[ib], index, &best_k, pairs); + y[ibl].qs[2*ib + 0] = index[0] & 255; y[ibl].qs[2*ib + 1] = index[1] & 255; y[ibl].qh[ib] = (index[0] >> 8) | ((index[1] >> 8) << 4); - GGML_ASSERT(scale >= 0); - scales[ib] = scale; shifts[ib] = best_k; - max_scale = MAX(max_scale, scale); + max_scale = MAX(max_scale, scales[ib]); } if (!max_scale) { @@ -14553,6 +14494,19 @@ size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t n return nrow * nblock * sizeof(block_iq1_m); } +void quantize_row_iq1_m_ref (const float * GGML_RESTRICT x, block_iq1_m * GGML_RESTRICT y, int64_t k) { + int nblock = k/QK_K; + float qw[QK_K]; + for (int j = 0; j < QK_K; ++j) qw[j] = 1; + for (int ibl = 0; ibl < nblock; ++ibl) { + quantize_iq1_m(x + ibl*QK_K, &y[ibl], 1, QK_K, qw); + } +} + +void quantize_row_iq1_m (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { + quantize_row_iq1_m_ref(x, (block_iq1_m *)y, k); +} + // ============================ 4-bit non-linear quants static const int8_t iq4nl_index[241] = { @@ -15246,6 +15200,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte case GGML_TYPE_IQ3_S_R4: break; case GGML_TYPE_IQ2_S_R4: break; case GGML_TYPE_IQ1_S_R4: break; + case GGML_TYPE_IQ1_M_R4: break; case GGML_TYPE_Q4_0_R4: break; case GGML_TYPE_Q5_0_R4: break; case GGML_TYPE_Q6_0_R4: break; diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 4753f342..7c8e2110 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -43,6 +43,7 @@ void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGM void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k); void quantize_row_iq1_bn_ref (const float * GGML_RESTRICT x, block_iq1_bn * GGML_RESTRICT y, int64_t k); void quantize_row_iq1_s_ref (const float * GGML_RESTRICT x, block_iq1_s * GGML_RESTRICT y, int64_t k); +void quantize_row_iq1_m_ref (const float * GGML_RESTRICT x, block_iq1_m * GGML_RESTRICT y, int64_t k); void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); @@ -68,6 +69,7 @@ void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_iq1_bn (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_iq1_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_iq1_m (const float * 
GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); // Dequantization void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); @@ -152,6 +154,8 @@ void iq3xs_free_impl(int grid_size); void iq1s_process_1block(int block_size, const float * xb, const float * weight, int8_t * L, float * the_scale, uint16_t * the_index, int * the_shift, float * pairs, float * sumx, float * sumw); +void iq1m_process_1block(const float * xb, const float * weight, int8_t * L, + float * the_scale, uint16_t * the_index, int * the_shift, float * pairs); #if defined(__ARM_FEATURE_SVE) extern int ggml_sve_cnt_b; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 64b7d3ce..4199a282 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1202,13 +1202,26 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .type_size = sizeof(block_iq1_m), .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq1_m, - .from_float = NULL, - .from_float_ref = NULL, + .from_float = quantize_row_iq1_m, + .from_float_ref = (ggml_from_float_t)quantize_row_iq1_m_ref, .vec_dot = ggml_vec_dot_iq1_m_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, .row_meta_size = 0, }, + [GGML_TYPE_IQ1_M_R4] = { + .type_name = "iq1_m_r4", + .blck_size = 32, + .type_size = sizeof(block_iq1_m_r4)/4, + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq1_m_r4, + .from_float = quantize_row_iq1_m_r4, + .from_float_ref = (ggml_from_float_t)quantize_row_iq1_m_r4_ref, + .vec_dot = vec_dot_iq1_m_r4_q8_k, + .vec_dot_type = GGML_TYPE_Q8_0_X4, + .nrows = 1, + .row_meta_size = 2, + }, [GGML_TYPE_IQ1_BN] = { .type_name = "iq1_bn", .blck_size = QK_IQ1BN, @@ -4401,6 +4414,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break; case GGML_FTYPE_MOSTLY_IQ2_S_R4: wtype = GGML_TYPE_IQ2_S_R4; break; case GGML_FTYPE_MOSTLY_IQ1_S_R4: wtype = GGML_TYPE_IQ1_S_R4; break; + case GGML_FTYPE_MOSTLY_IQ1_M_R4: wtype = GGML_TYPE_IQ1_M_R4; break; case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break; case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break; case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break; @@ -10949,6 +10963,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S_R4: case GGML_TYPE_IQ1_S_R4: + case GGML_TYPE_IQ1_M_R4: case GGML_TYPE_Q4_0_4_4: case GGML_TYPE_Q4_0_4_8: case GGML_TYPE_Q4_0_8_8: @@ -11418,6 +11433,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S_R4: case GGML_TYPE_IQ1_S_R4: + case GGML_TYPE_IQ1_M_R4: case GGML_TYPE_Q4_0_4_4: case GGML_TYPE_Q4_0_4_8: case GGML_TYPE_Q4_0_8_8: @@ -11584,6 +11600,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S_R4: case GGML_TYPE_IQ1_S_R4: + case GGML_TYPE_IQ1_M_R4: case GGML_TYPE_Q4_0_4_4: case GGML_TYPE_Q4_0_4_8: case GGML_TYPE_Q4_0_8_8: @@ -14823,6 +14840,7 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S_R4: case GGML_TYPE_IQ1_S_R4: + case GGML_TYPE_IQ1_M_R4: case GGML_TYPE_Q4_0_4_4: case GGML_TYPE_Q4_0_4_8: case GGML_TYPE_Q4_0_8_8: @@ -15229,6 +15247,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S_R4: case GGML_TYPE_IQ1_S_R4: + case GGML_TYPE_IQ1_M_R4: case GGML_TYPE_Q4_0_4_4: case GGML_TYPE_Q4_0_4_8: case GGML_TYPE_Q4_0_8_8: @@ -15529,6 +15548,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S_R4: case 
GGML_TYPE_IQ1_S_R4: + case GGML_TYPE_IQ1_M_R4: case GGML_TYPE_Q4_0_4_4: case GGML_TYPE_Q4_0_4_8: case GGML_TYPE_Q4_0_8_8: @@ -16158,6 +16178,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S_R4: case GGML_TYPE_IQ1_S_R4: + case GGML_TYPE_IQ1_M_R4: case GGML_TYPE_Q8_K: case GGML_TYPE_Q8_K64: case GGML_TYPE_Q8_K16: @@ -22914,6 +22935,7 @@ void ggml_quantize_init(enum ggml_type type) { case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break; + case GGML_TYPE_IQ1_M_R4:iq2xs_init_impl(GGML_TYPE_IQ1_M); break; case GGML_TYPE_IQ1_S_R4:iq2xs_init_impl(GGML_TYPE_IQ1_S); break; case GGML_TYPE_IQ3_XXS_R4: case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break; @@ -22998,6 +23020,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_S_R4:result = quantize_iq2_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ1_S_R4:result = quantize_iq1_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ1_M_R4:result = quantize_iq1_m_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ1_BN: result = quantize_iq1_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp index ea8e8274..57024602 100644 --- a/ggml/src/iqk/iqk_mul_mat.cpp +++ b/ggml/src/iqk/iqk_mul_mat.cpp @@ -260,6 +260,7 @@ struct MulMat { case GGML_TYPE_IQ2_S_R4: case GGML_TYPE_IQ3_XXS_R4: case GGML_TYPE_IQ1_S_R4: + case GGML_TYPE_IQ1_M_R4: case GGML_TYPE_IQ3_S_R4: return 4; case GGML_TYPE_IQ4_NL_R4: case GGML_TYPE_Q5_0_R4: @@ -295,6 +296,7 @@ struct MulMat { case GGML_TYPE_IQ3_XXS_R4: case GGML_TYPE_IQ3_S_R4: case GGML_TYPE_IQ1_S_R4: + case GGML_TYPE_IQ1_M_R4: case GGML_TYPE_IQ2_BN_R4: return 4; case GGML_TYPE_IQ4_XS_R4: case GGML_TYPE_Q4_0_R4: @@ -3609,6 +3611,102 @@ static void mul_mat_iq1_s_r4_q8_1(int n, const void * vx, size_t bx, const DataI } } +template <int nrc_y> +static void mul_mat_iq1_m_r4_q8_0(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + GGML_ASSERT(nrc_x%4 == 0); + Q8<nrc_y, block_q8_0_x4> q8(info); + int nb = n / 32; + GGML_ASSERT(nb%4 == 0); + auto shuffle0 = _mm256_set_epi64x(0x0909090909090909, 0x0808080808080808, 0x0101010101010101, 0x0000000000000000); + auto step = _mm256_set1_epi8(2); +#ifndef HAVE_FANCY_SIMD + auto m1 = _mm256_set1_epi16(1); +#endif + __m256i qx[4]; + __m256 acc[nrc_y] = {}; + auto ms = _mm_set1_epi8(0x08); + float d8[4*nrc_y]; + union { __m256i vec; uint16_t val[16]; } helper; + for (int ix= 0; ix < nrc_x; ix += 4) { + auto dptr = (const ggml_half *)((const char *)vx + ix*bx); + auto d1 = _mm_mul_ps(_mm_set1_ps(0.125f), _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)dptr))); + auto x = (const block_iq1_m_r4 *)(dptr + 4); + for (int ib = 0; ib < nb/4; ++ib) { + for (int iy = 0; iy < nrc_y; ++iy) { + _mm_storeu_ps(d8 + 4*iy, _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)q8.y[iy][ib].d))); + } + for (int k = 0; k < 4; ++k) { + auto qh = (const uint32_t 
*)x[4*ib+k].qh; + auto idxh = _mm_set_epi32(qh[1] >> 4, qh[1], qh[0] >> 4, qh[0]); + auto scales4 = _mm_set1_epi32(((const uint32_t *)x[4*ib+k].scales)[0]); + scales4 = _mm_and_si128(_mm_srlv_epi32(scales4, _mm_set_epi32(4, 0, 4, 0)), _mm_set1_epi8(0xf)); + scales4 = _mm_cvtepu8_epi16(scales4); + auto scales = MM256_SET_M128I(_mm_unpackhi_epi16(scales4, scales4), _mm_unpacklo_epi16(scales4, scales4)); + + auto signs128 = _mm_or_si128(_mm_cmpeq_epi8(_mm_and_si128(idxh, ms), ms), _mm_set1_epi8(1)); + signs128 = _mm_add_epi8(_mm_set1_epi8(-8), signs128); + auto signs = MM256_SET_M128I(signs128, signs128); + auto idxl = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)x[4*ib+k].qs)); + idxh = _mm_and_si128(idxh, _mm_set1_epi8(0x07)); + helper.vec = _mm256_or_si256(idxl, _mm256_slli_epi16(_mm256_cvtepu8_epi16(idxh), 8)); + qx[0] = _mm256_set_epi64x(iq1s_grid_us[helper.val[ 9]], iq1s_grid_us[helper.val[ 8]], + iq1s_grid_us[helper.val[ 1]], iq1s_grid_us[helper.val[ 0]]); + qx[1] = _mm256_set_epi64x(iq1s_grid_us[helper.val[13]], iq1s_grid_us[helper.val[12]], + iq1s_grid_us[helper.val[ 5]], iq1s_grid_us[helper.val[ 4]]); + qx[2] = _mm256_set_epi64x(iq1s_grid_us[helper.val[11]], iq1s_grid_us[helper.val[10]], + iq1s_grid_us[helper.val[ 3]], iq1s_grid_us[helper.val[ 2]]); + qx[3] = _mm256_set_epi64x(iq1s_grid_us[helper.val[15]], iq1s_grid_us[helper.val[14]], + iq1s_grid_us[helper.val[ 7]], iq1s_grid_us[helper.val[ 6]]); + qx[0] = _mm256_add_epi8(_mm256_slli_epi16(qx[0], 3), _mm256_shuffle_epi8(signs, shuffle0)); + auto shuffle = _mm256_add_epi8(shuffle0, step); + qx[2] = _mm256_add_epi8(_mm256_slli_epi16(qx[2], 3), _mm256_shuffle_epi8(signs, shuffle)); + shuffle = _mm256_add_epi8(shuffle, step); + qx[1] = _mm256_add_epi8(_mm256_slli_epi16(qx[1], 3), _mm256_shuffle_epi8(signs, shuffle)); + shuffle = _mm256_add_epi8(shuffle, step); + qx[3] = _mm256_add_epi8(_mm256_slli_epi16(qx[3], 3), _mm256_shuffle_epi8(signs, shuffle)); + auto s0 = _mm256_sign_epi8(qx[0], qx[0]); + auto s1 = _mm256_sign_epi8(qx[1], qx[1]); + auto s2 = _mm256_sign_epi8(qx[2], qx[2]); + auto s3 = _mm256_sign_epi8(qx[3], qx[3]); + for (int iy = 0; iy < nrc_y; ++iy) { + auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ib].qs + k); + auto y1 = _mm256_shuffle_epi32(y, 0x44); + auto y2 = _mm256_shuffle_epi32(y, 0xee); +#ifdef HAVE_FANCY_SIMD + // 0,0, 1,1, 0,0, 1,1 as int32_t + auto sumi1 = _mm256_dpbusd_epi32(_mm256_dpbusd_epi32(_mm256_setzero_si256(), + s0, _mm256_sign_epi8(y1, qx[0])), s1, _mm256_sign_epi8(y2, qx[1])); + // 2,2, 3,3, 2,2, 3,3 as int32_t + auto sumi2 = _mm256_dpbusd_epi32(_mm256_dpbusd_epi32(_mm256_setzero_si256(), + s2, _mm256_sign_epi8(y1, qx[2])), s3, _mm256_sign_epi8(y2, qx[3])); + auto sumi = _mm256_packs_epi32(sumi1, sumi2); +#else + // 4 x row 0, 4 x row 1, 4 x row 0, 4 x row 1 + auto sumi1 = _mm256_add_epi16(_mm256_maddubs_epi16(s0, _mm256_sign_epi8(y1, qx[0])), + _mm256_maddubs_epi16(s1, _mm256_sign_epi8(y2, qx[1]))); + // 4 x row 2, 4 x row 3, 4 x row 2, 4 x row 3 + auto sumi2 = _mm256_add_epi16(_mm256_maddubs_epi16(s2, _mm256_sign_epi8(y1, qx[2])), + _mm256_maddubs_epi16(s3, _mm256_sign_epi8(y2, qx[3]))); + // 0,0, 1,1, 0,0, 1,1 as int32_t + sumi1 = _mm256_madd_epi16(m1, sumi1); + // 2,2, 3,3, 2,2, 3,3 as int32_t + sumi2 = _mm256_madd_epi16(m1, sumi2); + // 0,0, 1,1, 2,2, 3,3, 0,0, 1,1, 2,2, 3,3 as int16_t + auto sumi = _mm256_packs_epi32(sumi1, sumi2); +#endif + sumi = _mm256_madd_epi16(scales, sumi); + acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d8[4*iy+k]), _mm256_cvtepi32_ps(sumi), acc[iy]); + } + } 
+ } + for (int iy = 0; iy < nrc_y; ++iy) { + auto sumf = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1)); + info.store(ix, iy, _mm_mul_ps(d1, sumf)); + acc[iy] = _mm256_setzero_ps(); + } + } +} + #ifdef HAVE_FANCY_SIMD template <int nrc_y> static void mul_mat_q4_0_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { @@ -9081,6 +9179,21 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) { #endif expected_typeB = GGML_TYPE_Q8_1_X4; break; + case GGML_TYPE_IQ1_M_R4: + assert (ne00 % QK4_NL == 0); + mm.funcs[0] = mul_mat_iq1_m_r4_q8_0<1>; + mm.funcs[1] = mul_mat_iq1_m_r4_q8_0<2>; + mm.funcs[2] = mul_mat_iq1_m_r4_q8_0<3>; + mm.funcs[3] = mul_mat_iq1_m_r4_q8_0<4>; + mm.funcs[4] = mul_mat_iq1_m_r4_q8_0<5>; + mm.funcs[5] = mul_mat_iq1_m_r4_q8_0<6>; + mm.funcs[6] = mul_mat_iq1_m_r4_q8_0<7>; + mm.funcs[7] = mul_mat_iq1_m_r4_q8_0<8>; +#ifdef HAVE_FANCY_SIMD + mm.func16 = mul_mat_iq1_m_r4_q8_0<16>; +#endif + expected_typeB = GGML_TYPE_Q8_0_X4; + break; default: return false; @@ -12093,6 +12206,85 @@ static void mul_mat_iq1_s_r4_q8_1(int n, const void * vx, size_t bx, const DataI } template <int nrc_y> +static void mul_mat_iq1_m_r4_q8_0(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + GGML_ASSERT(nrc_x%4 == 0); + Q8<nrc_y, block_q8_0_x4> q8(info); + int nb = n / 32; + GGML_ASSERT(nb%4 == 0); + int8x16_t qx[8]; + int32x4_t acc[nrc_y] = {}; + auto shuffle0 = uint32x4_t{0x00000000, 0x01010101, 0x02020202, 0x03030303}; + auto step = vdupq_n_u8(4); + auto ms = vdupq_n_u8(0x08); + auto mask = vdupq_n_s8(0x18); + float d8[4*nrc_y]; + for (int ix= 0; ix < nrc_x; ix += 4) { + auto dptr = (const ggml_half *)((const char *)vx + ix*bx); + auto d1 = vmulq_f32(vdupq_n_f32(0.125f), vcvt_f32_f16(vld1_f16((const float16_t *)dptr))); + auto x = (const block_iq1_m_r4 *)(dptr + 4); + for (int ib = 0; ib < nb/4; ++ib) { + for (int iy = 0; iy < nrc_y; ++iy) { + auto scales = vld1_f16((const float16_t *)q8.y[iy][ib].d); + vst1q_f32(d8+4*iy, vcvt_f32_f16(scales)); + } + for (int k = 0; k < 4; ++k) { + auto scales4 = vdup_n_u32(((const uint32_t *)x[4*ib+k].scales)[0]); + scales4 = vand_u8(vshl_u32(scales4, int32x2_t{0, -4}), vdup_n_u8(0xf)); + auto scales16 = vmovl_u8(scales4); + auto scales1 = vmovl_u16(vget_low_u16(scales16)); + auto scales2 = vmovl_u16(vget_high_u16(scales16)); + auto qh = (const uint32_t *)x[4*ib+k].qh; + auto idxh = uint32x4_t{qh[0], qh[0] >> 4, qh[1], qh[1] >> 4}; + auto signs = vreinterpretq_s8_u8(vorrq_u8(vceqq_u8(vandq_u8(idxh, ms), ms), vdupq_n_u8(1))); + signs = vaddq_s8(signs, vdupq_n_s8(-8)); + qx[0] = vreinterpretq_s8_u32(uint32x4_t{iq1s_grid_us[x[4*ib+k].qs[ 0] | ((x[4*ib+k].qh[0] << 8) & 0x0700)], + iq1s_grid_us[x[4*ib+k].qs[ 1] | ((x[4*ib+k].qh[1] << 8) & 0x0700)], + iq1s_grid_us[x[4*ib+k].qs[ 2] | ((x[4*ib+k].qh[2] << 8) & 0x0700)], + iq1s_grid_us[x[4*ib+k].qs[ 3] | ((x[4*ib+k].qh[3] << 8) & 0x0700)]}); + qx[2] = vreinterpretq_s8_u32(uint32x4_t{iq1s_grid_us[x[4*ib+k].qs[ 4] | ((x[4*ib+k].qh[0] << 4) & 0x0700)], + iq1s_grid_us[x[4*ib+k].qs[ 5] | ((x[4*ib+k].qh[1] << 4) & 0x0700)], + iq1s_grid_us[x[4*ib+k].qs[ 6] | ((x[4*ib+k].qh[2] << 4) & 0x0700)], + iq1s_grid_us[x[4*ib+k].qs[ 7] | ((x[4*ib+k].qh[3] << 4) & 0x0700)]}); + qx[4] = vreinterpretq_s8_u32(uint32x4_t{iq1s_grid_us[x[4*ib+k].qs[ 8] | ((x[4*ib+k].qh[4] << 8) & 0x0700)], + iq1s_grid_us[x[4*ib+k].qs[ 9] | ((x[4*ib+k].qh[5] << 8) & 0x0700)], + iq1s_grid_us[x[4*ib+k].qs[10] | ((x[4*ib+k].qh[6] << 8) & 0x0700)], + 
iq1s_grid_us[x[4*ib+k].qs[11] | ((x[4*ib+k].qh[7] << 8) & 0x0700)]}); + qx[6] = vreinterpretq_s8_u32(uint32x4_t{iq1s_grid_us[x[4*ib+k].qs[12] | ((x[4*ib+k].qh[4] << 4) & 0x0700)], + iq1s_grid_us[x[4*ib+k].qs[13] | ((x[4*ib+k].qh[5] << 4) & 0x0700)], + iq1s_grid_us[x[4*ib+k].qs[14] | ((x[4*ib+k].qh[6] << 4) & 0x0700)], + iq1s_grid_us[x[4*ib+k].qs[15] | ((x[4*ib+k].qh[7] << 4) & 0x0700)]}); + auto shuffle = shuffle0; + for (int j = 0; j < 4; ++j) { + auto s = vqtbl1q_s8(signs, shuffle); + qx[2*j+1] = vaddq_s8(s, vandq_s8(vshrq_n_s8(qx[2*j+0], 1), mask)); + qx[2*j+0] = vaddq_s8(s, vandq_s8(vshlq_n_s8(qx[2*j+0], 3), mask)); + shuffle = vaddq_u8(shuffle, step); + } + for (int iy = 0; iy < nrc_y; ++iy) { + auto y = vld1q_s8_x2(q8.y[iy][ib].qs + 32*k); + auto sumi1 = vdupq_n_s32(0); + auto sumi2 = vdupq_n_s32(0); + sumi1 = vdotq_laneq_s32(sumi1, vreinterpretq_s8_u8(qx[0]), y.val[0], 0); + sumi1 = vdotq_laneq_s32(sumi1, vreinterpretq_s8_u8(qx[1]), y.val[0], 1); + sumi1 = vdotq_laneq_s32(sumi1, vreinterpretq_s8_u8(qx[2]), y.val[0], 2); + sumi1 = vdotq_laneq_s32(sumi1, vreinterpretq_s8_u8(qx[3]), y.val[0], 3); + sumi2 = vdotq_laneq_s32(sumi2, vreinterpretq_s8_u8(qx[4]), y.val[1], 0); + sumi2 = vdotq_laneq_s32(sumi2, vreinterpretq_s8_u8(qx[5]), y.val[1], 1); + sumi2 = vdotq_laneq_s32(sumi2, vreinterpretq_s8_u8(qx[6]), y.val[1], 2); + sumi2 = vdotq_laneq_s32(sumi2, vreinterpretq_s8_u8(qx[7]), y.val[1], 3); + auto sumi = vmlaq_s32(vmlaq_s32(vdupq_n_s32(0), sumi1, scales1), sumi2, scales2); + acc[iy] = vfmaq_f32(acc[iy], vdupq_n_f32(d8[4*iy+k]), vcvtq_f32_s32(sumi)); + } + } + } + for (int iy = 0; iy < nrc_y; ++iy) { + info.store(ix, iy, vmulq_f32(d1, acc[iy])); + acc[iy] = vdupq_n_f32(0.f); + } + } +} + +template <int nrc_y> static void mul_mat_iq2_s_r4_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { GGML_ASSERT(nrc_x%4 == 0); Q8<nrc_y, block_q8_K> q8(info); @@ -13717,6 +13909,11 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& m, int /*Ny*/) { m.func16 = mul_mat_iq1_s_r4_q8_1<16>; expected_Btype = GGML_TYPE_Q8_1_X4; break; + case GGML_TYPE_IQ1_M_R4: + SET_MUL_MAT_FUNCTIONS(m, mul_mat_iq1_m_r4_q8_0); + m.func16 = mul_mat_iq1_m_r4_q8_0<16>; + expected_Btype = GGML_TYPE_Q8_0_X4; + break; case GGML_TYPE_IQ3_XXS_R4: SET_MUL_MAT_FUNCTIONS(m, mul_mat_iq3_xxs_r4_q8_k); m.func16 = mul_mat_iq3_xxs_r4_q8_k<16>; diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index a8553b43..e741a8ea 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -6193,6 +6193,123 @@ void vec_dot_iq1_s_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t GGML_UNUSED(by); } +void quantize_row_iq1_m_r4_ref(const float * x, block_iq1_m_r4 * y, int64_t k) { + quantize_iq1_m_r4(x, y, 4, k/4, nullptr); +} + +void quantize_row_iq1_m_r4(const float * x, void * y, int64_t k) { + quantize_iq1_m_r4(x, y, 4, k/4, nullptr); +} + +size_t quantize_iq1_m_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { + constexpr int kBlockSize = 32; + GGML_ASSERT(nrows%4 == 0); + GGML_ASSERT(n_per_row%kBlockSize == 0); + int nblock = n_per_row/kBlockSize; + float weight[kBlockSize]; + int8_t L[kBlockSize]; + float pairs[2*kBlockSize]; + float max[4]; + uint16_t index[4]; + int shift1, shift2; + float invd[4]; + const uint8_t masks[4] = {0x00, 0x80, 0x08, 0x88}; + std::vector<float> scales(8*nblock); + auto row_size = ggml_row_size(GGML_TYPE_IQ1_M_R4, n_per_row); + char * cy = (char *)dst; + for (int row = 0; row < nrows; 
row += 4) { + ggml_half * dptr = (ggml_half *)cy; + auto y = (block_iq1_m_r4 *)(dptr + 4); + for (int k = 0; k < 4; ++k) max[k] = 0; + for (int ibl = 0; ibl < nblock; ++ibl) { + for (int k = 0; k < 4; ++k) { + auto xb = src + k*n_per_row + kBlockSize*ibl; + float sumx2 = 0; + for (int j = 0; j < kBlockSize; ++j) sumx2 += xb[j]*xb[j]; + if (!sumx2) { + scales[8*ibl+2*k+0] = scales[8*ibl+2*k+1] = 0; + continue; + } + float sigma2 = 1.5f*sumx2/kBlockSize; + if (imatrix) { + for (int j = 0; j < kBlockSize; ++j) weight[j] = imatrix[kBlockSize*ibl + j]*sqrt(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < kBlockSize; ++j) weight[j] = sqrt(sigma2 + xb[j]*xb[j]); + } + iq1m_process_1block(xb+ 0, weight+ 0, L, scales.data() + 8*ibl + 2*k+0, index+0, &shift1, pairs); + iq1m_process_1block(xb+16, weight+16, L, scales.data() + 8*ibl + 2*k+1, index+2, &shift2, pairs); + max[k] = std::max(max[k], std::max(scales[8*ibl+2*k+0], scales[8*ibl+2*k+1])); + for (int i = 0; i < 4; ++i) { + y[ibl].qs[4*i + k] = index[i] & 255; + } + for (int i = 0; i < 2; ++i) { + y[ibl].qh[4*i+k] = (index[2*i+0] >> 8) | ((index[2*i+1] >> 8) << 4); + } + y[ibl].qh[0+k] |= masks[shift1]; + y[ibl].qh[4+k] |= masks[shift2]; + } + } + for (int k = 0; k < 4; ++k) { + dptr[k] = GGML_FP32_TO_FP16(1.0625f*max[k]/15); + invd[k] = max[k] ? 15/max[k] : 0.f; + } + for (int ibl = 0; ibl < nblock; ++ibl) { + for (int k = 0; k < 4; ++k) { + int ls1 = nearest_int(scales[8*ibl+2*k+0]*invd[k]); + int ls2 = nearest_int(scales[8*ibl+2*k+1]*invd[k]); + ls1 = std::max(0, std::min(15, ls1)); + ls2 = std::max(0, std::min(15, ls2)); + y[ibl].scales[k] = ls1 | (ls2 << 4); + } + } + cy += 4*row_size; + src += 4*n_per_row; + } + return nrows*row_size; +} + +void dequantize_row_iq1_m_r4(const block_iq1_m_r4 * x, float * y, int64_t n) { + auto dptr = (const ggml_half *)x; + x = (const block_iq1_m_r4 *)(dptr + 4); + float d[4]; + for (int k = 0; k < 4; ++k) d[k] = GGML_FP16_TO_FP32(dptr[k]); + int n_per_row = n/4; + GGML_ASSERT(n_per_row%32 == 0); + int nblock = n_per_row/32; + float dl[2]; + float * yk[4]; + for (int k = 0; k < 4; ++k) yk[k] = y + k*n_per_row; + for (int ib = 0; ib < nblock; ++ib) { + for (int k = 0; k < 4; ++k) { + dl[0] = d[k]*(x[ib].scales[k] & 0xf); + dl[1] = d[k]*(x[ib].scales[k] >> 4); + for (int i = 0; i < 2; ++i) { + auto idx1 = x[ib].qs[8*i+k+0] | ((x[ib].qh[4*i+k] & 0x07) << 8); + auto idx2 = x[ib].qs[8*i+k+4] | ((x[ib].qh[4*i+k] & 0x70) << 4); + auto grid1 = (const int8_t *)(iq1s_grid + idx1); + auto grid2 = (const int8_t *)(iq1s_grid + idx2); + auto delta1 = x[ib].qh[4*i+k] & 0x08 ? -IQ1M_DELTA : IQ1M_DELTA; + auto delta2 = x[ib].qh[4*i+k] & 0x80 ?
-IQ1M_DELTA : IQ1M_DELTA; + for (int j = 0; j < 8; ++j) yk[k][32*ib + 16*i + j + 0] = dl[i]*(grid1[j] + delta1); + for (int j = 0; j < 8; ++j) yk[k][32*ib + 16*i + j + 8] = dl[i]*(grid2[j] + delta2); + } + } + } +} + +void vec_dot_iq1_m_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { +#if GGML_USE_IQK_MULMAT + if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ1_M_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { + return; + } +#endif + GGML_ASSERT(n%QK4_NL == 0); + GGML_ASSERT(nrc == 1); + GGML_UNUSED(bs); + GGML_UNUSED(bx); + GGML_UNUSED(by); +} + //================================================ namespace { diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h index 9a3c5dc6..0dbb88bd 100644 --- a/ggml/src/iqk/iqk_quantize.h +++ b/ggml/src/iqk/iqk_quantize.h @@ -205,6 +205,12 @@ size_t quantize_iq1_s_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT d void dequantize_row_iq1_s_r4(const block_iq1_s_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq1_s_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void quantize_row_iq1_m_r4_ref(const float * GGML_RESTRICT x, block_iq1_m_r4 * GGML_RESTRICT y, int64_t k); +void quantize_row_iq1_m_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +size_t quantize_iq1_m_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +void dequantize_row_iq1_m_r4(const block_iq1_m_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void vec_dot_iq1_m_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); + void quantize_row_q8_k_r8_ref(const float * GGML_RESTRICT x, block_q8_k_r8 * GGML_RESTRICT y, int64_t k); void quantize_row_q8_k_r8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); size_t quantize_q8_k_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); diff --git a/include/llama.h b/include/llama.h index 0f6d15ac..3f25b296 100644 --- a/include/llama.h +++ b/include/llama.h @@ -197,6 +197,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ3_S_R4 = 226, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_M_R4 = 229, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_XS_R4 = 230, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ1_M_R4 = 231, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q6_0_R4 = 335, // except 1d tensors LLAMA_FTYPE_MOSTLY_BF16_R16 = 232, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_BN_R4 = 337, // except 1d tensors diff --git a/src/llama.cpp b/src/llama.cpp index 943b945a..117f59be 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3955,6 +3955,7 @@ struct llama_model_loader { case GGML_TYPE_IQ3_XXS_R4: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4; break; case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break; case GGML_TYPE_IQ1_S_R4:ftype = LLAMA_FTYPE_MOSTLY_IQ1_S_R4;break; + case GGML_TYPE_IQ1_M_R4:ftype = LLAMA_FTYPE_MOSTLY_IQ1_M_R4;break; case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break; case GGML_TYPE_IQ1_BN: ftype = LLAMA_FTYPE_MOSTLY_IQ1_BN; break; case GGML_TYPE_IQ2_BN: ftype = LLAMA_FTYPE_MOSTLY_IQ2_BN; break; @@ -4690,6 +4691,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4: return "IQ3_XXS_R4 - 3.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 
bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_S_R4: return "IQ1_S_R4 - 1.5 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ1_M_R4: return "IQ1_M_R4 - 1.75 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_NL_R4:return "IQ4_NL_R4 - 4.5 bpw"; @@ -15969,7 +15971,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K || ftype == LLAMA_FTYPE_MOSTLY_IQ2_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_K_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_M_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4) { + ftype == LLAMA_FTYPE_MOSTLY_IQ2_M_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4 || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_M_R4) { new_type = !qs.has_output ? GGML_TYPE_IQ4_K : GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4) { @@ -15991,7 +15994,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4 || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4) { + ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M_R4) { new_type = GGML_TYPE_Q2_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M_R4) { @@ -16068,7 +16071,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_BF16; } } - } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4) { + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M_R4) { if (name.find("attn_v.weight") != std::string::npos) { if (qs.model.hparams.n_expert >= 4 || qs.model.hparams.n_gqa() >= 4) new_type = GGML_TYPE_IQ4_K_R4; else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_K_R4; @@ -16134,7 +16137,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_Q5_K; } else { if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4) new_type = GGML_TYPE_IQ2_K_R4; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || is_iq2_m) new_type = GGML_TYPE_IQ3_S; } } @@ -16580,6 +16582,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4: default_type = GGML_TYPE_IQ3_XXS_R4; break; case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break; case LLAMA_FTYPE_MOSTLY_IQ1_S_R4:default_type = GGML_TYPE_IQ1_S_R4;break; + case LLAMA_FTYPE_MOSTLY_IQ1_M_R4:default_type = GGML_TYPE_IQ1_M_R4;break; case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break; case LLAMA_FTYPE_MOSTLY_IQ1_BN: default_type = GGML_TYPE_IQ1_BN; break; case LLAMA_FTYPE_MOSTLY_IQ2_BN: default_type = GGML_TYPE_IQ2_BN; break; @@ -16934,6 +16937,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_type == GGML_TYPE_IQ2_S_R4|| new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ1_S_R4|| + new_type == GGML_TYPE_IQ1_M_R4|| (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, 
"token_embd.weight") && strcmp(tensor->name, "output.weight")) || (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0))) { LLAMA_LOG_ERROR("\n\n============================================================\n"); @@ -17057,6 +17061,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ1_S; else chunk_size_multiplier = 4; } + else if (new_type == GGML_TYPE_IQ1_M_R4) { + if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ1_M; + else chunk_size_multiplier = 4; + } else if (new_type == GGML_TYPE_BF16_R16) { if (tensor->ne[1] % 16 != 0) new_type = GGML_TYPE_BF16; else chunk_size_multiplier = 16; |