diff options
Diffstat (limited to 'ggml/src/ggml-quants.c')
-rw-r--r-- | ggml/src/ggml-quants.c | 289 |
1 files changed, 203 insertions, 86 deletions
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 391d9e2e..3c4711f3 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -13991,6 +13991,105 @@ static int iq1_sort_helper(const void * left, const void * right) { return *l < *r ? -1 : *l > *r ? 1 : 0; } +void iq1s_process_1block(int block_size, const float * xb, const float * weight, int8_t * L, float * the_scale, uint16_t * the_index, int * the_shift, + float * pairs, float * sumx, float * sumw) { + float max = fabsf(xb[0]); + for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i])); + if (max < GROUP_MAX_EPS_IQ1_S) { + *the_scale = 0; + *the_shift = 1; + for (int k = 0; k < block_size/8; ++k) the_index[k] = 0; + return; + } + const int gindex = iq2_data_index(GGML_TYPE_IQ1_S); + const uint64_t * kgrid_q2xs = iq2_data[gindex].grid; + const int * kmap_q2xs = iq2_data[gindex].map; + const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours; + + GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?"); + + const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA}; + const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA}; + + // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem. + // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two + // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights + // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and + // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale + // for each possible and score for each split. + int * idx = (int *)(pairs + 1); + for (int j = 0; j < block_size; ++j) { + pairs[2*j] = xb[j]; + idx[2*j] = j; + } + qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper); + { + sumx[0] = sumw[0] = 0; + for (int j = 0; j < block_size; ++j) { + int i = idx[2*j]; + sumx[j+1] = sumx[j] + weight[i]*xb[i]; + sumw[j+1] = sumw[j] + weight[i]; + } + } + float best_score = -FLT_MIN, scale = max; + int besti1 = -1, besti2 = -1, best_shift = 0; + for (int i1 = 0; i1 <= block_size; ++i1) { + for (int i2 = i1; i2 <= block_size; ++i2) { + float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2]; + float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2]; + if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { + scale = sumqx/sumq2; best_score = scale*sumqx; + besti1 = i1; besti2 = i2; best_shift = 1; + } + sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2]; + sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2]; + if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { + scale = sumqx/sumq2; best_score = scale*sumqx; + besti1 = i1; besti2 = i2; best_shift = -1; + } + } + } + GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0); + for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0; + for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1; + for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2; + if (scale < 0) { + for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j]; + scale = -scale; best_shift = -best_shift; + } + bool all_on_grid = true; + const float * xx = best_shift == 1 ? x_p : x_m; + for (int k = 0; k < block_size/8; ++k) { + uint16_t u = 0; + for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j); + int grid_index = kmap_q2xs[u]; + if (grid_index < 0) { + all_on_grid = false; + const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; + grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S); + GGML_ASSERT(grid_index >= 0); + } + the_index[k] = grid_index; + } + if (!all_on_grid) { + float sumqx = 0, sumq2 = 0; + for (int k = 0; k < block_size/8; ++k) { + const int8_t * pg = (const int8_t *)(kgrid_q2xs + the_index[k]); + for (int j = 0; j < 8; ++j) { + float w = weight[8*k + j]; + float q = xx[(pg[j] - 1)/2]; + sumqx += w*q*xb[8*k+j]; + sumq2 += w*q*q; + } + } + if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2; + } + *the_scale = scale; + *the_shift = best_shift; +} + #define IQ1S_BLOCK_SIZE 32 #define IQ1M_BLOCK_SIZE 16 static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights, @@ -14021,11 +14120,10 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy const int block_size = IQ1S_BLOCK_SIZE; - const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA}; - const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA}; - + //const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA}; + //const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA}; - int * idx = (int *)(pairs + 1); + //int * idx = (int *)(pairs + 1); for (int ibl = 0; ibl < nbl; ++ibl) { @@ -14044,95 +14142,100 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy const float * xb = xbl + block_size*ib; const float * qw = quant_weights + QK_K*ibl + block_size*ib; for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); - float max = fabsf(xb[0]); - for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i])); - if (max < GROUP_MAX_EPS_IQ1_S) { - scales[ib] = 0; - memset(L, 1, block_size); - continue; - } - // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem. - // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two - // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights - // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and - // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale - // for each possible and score for each split. - for (int j = 0; j < block_size; ++j) { - pairs[2*j] = xb[j]; - idx[2*j] = j; - } - qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper); - { - sumx[0] = sumw[0] = 0; - for (int j = 0; j < block_size; ++j) { - int i = idx[2*j]; - sumx[j+1] = sumx[j] + weight[i]*xb[i]; - sumw[j+1] = sumw[j] + weight[i]; - } - } - float best_score = -FLT_MIN, scale = max; - int besti1 = -1, besti2 = -1, best_shift = 0; - for (int i1 = 0; i1 <= block_size; ++i1) { - for (int i2 = i1; i2 <= block_size; ++i2) { - float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2]; - float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2]; - if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { - scale = sumqx/sumq2; best_score = scale*sumqx; - besti1 = i1; besti2 = i2; best_shift = 1; - } - sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2]; - sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2]; - if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { - scale = sumqx/sumq2; best_score = scale*sumqx; - besti1 = i1; besti2 = i2; best_shift = -1; - } - } - } - GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0); - for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0; - for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1; - for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2; - if (scale < 0) { - for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j]; - scale = -scale; best_shift = -best_shift; - } - bool all_on_grid = true; - const float * xx = best_shift == 1 ? x_p : x_m; - for (int k = 0; k < block_size/8; ++k) { - uint16_t u = 0; - for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j); - int grid_index = kmap_q2xs[u]; - if (grid_index < 0) { - all_on_grid = false; - const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; - grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S); - GGML_ASSERT(grid_index >= 0); - } - index[k] = grid_index; - } - if (!all_on_grid) { - float sumqx = 0, sumq2 = 0; - for (int k = 0; k < block_size/8; ++k) { - const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]); - for (int j = 0; j < 8; ++j) { - float w = weight[8*k + j]; - float q = xx[(pg[j] - 1)/2]; - sumqx += w*q*xb[8*k+j]; - sumq2 += w*q*q; - } - } - if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2; - } + int best_shift; + iq1s_process_1block(block_size, xb, weight, L, &scales[ib], index, &best_shift, pairs, sumx, sumw); + +// float max = fabsf(xb[0]); +// for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i])); +// if (max < GROUP_MAX_EPS_IQ1_S) { +// scales[ib] = 0; +// memset(L, 1, block_size); +// continue; +// } +// // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem. +// // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two +// // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights +// // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and +// // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale +// // for each possible and score for each split. +// for (int j = 0; j < block_size; ++j) { +// pairs[2*j] = xb[j]; +// idx[2*j] = j; +// } +// qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper); +// { +// sumx[0] = sumw[0] = 0; +// for (int j = 0; j < block_size; ++j) { +// int i = idx[2*j]; +// sumx[j+1] = sumx[j] + weight[i]*xb[i]; +// sumw[j+1] = sumw[j] + weight[i]; +// } +// } +// float best_score = -FLT_MIN, scale = max; +// int besti1 = -1, besti2 = -1, best_shift = 0; +// for (int i1 = 0; i1 <= block_size; ++i1) { +// for (int i2 = i1; i2 <= block_size; ++i2) { +// float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2]; +// float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2]; +// if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { +// scale = sumqx/sumq2; best_score = scale*sumqx; +// besti1 = i1; besti2 = i2; best_shift = 1; +// } +// sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2]; +// sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2]; +// if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { +// scale = sumqx/sumq2; best_score = scale*sumqx; +// besti1 = i1; besti2 = i2; best_shift = -1; +// } +// } +// } +// GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0); +// for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0; +// for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1; +// for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2; +// if (scale < 0) { +// for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j]; +// scale = -scale; best_shift = -best_shift; +// } +// bool all_on_grid = true; +// const float * xx = best_shift == 1 ? x_p : x_m; +// for (int k = 0; k < block_size/8; ++k) { +// uint16_t u = 0; +// for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j); +// int grid_index = kmap_q2xs[u]; +// if (grid_index < 0) { +// all_on_grid = false; +// const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; +// grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S); +// GGML_ASSERT(grid_index >= 0); +// } +// index[k] = grid_index; +// } +// if (!all_on_grid) { +// float sumqx = 0, sumq2 = 0; +// for (int k = 0; k < block_size/8; ++k) { +// const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]); +// for (int j = 0; j < 8; ++j) { +// float w = weight[8*k + j]; +// float q = xx[(pg[j] - 1)/2]; +// sumqx += w*q*xb[8*k+j]; +// sumq2 += w*q*q; +// } +// } +// if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2; +// } uint16_t h = 0; for (int k = 0; k < block_size/8; ++k) { y[ibl].qs[(block_size/8)*ib + k] = index[k] & 255; h |= (index[k] >> 8) << 3*k; } y[ibl].qh[ib] = h; - GGML_ASSERT(scale >= 0); - scales[ib] = scale; + GGML_ASSERT(scales[ib] >= 0); + max_scale = MAX(max_scale, scales[ib]); + //GGML_ASSERT(scale >= 0); + //scales[ib] = scale; shifts[ib] = best_shift; - max_scale = MAX(max_scale, scale); + //max_scale = MAX(max_scale, scale); } if (!max_scale) { @@ -14171,6 +14274,19 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t n return nrow * nblock * sizeof(block_iq1_s); } +void quantize_row_iq1_s_ref (const float * GGML_RESTRICT x, block_iq1_s * GGML_RESTRICT y, int64_t k) { + int nblock = k/QK_K; + float qw[QK_K]; + for (int j = 0; j < QK_K; ++j) qw[j] = 1; + for (int ibl = 0; ibl < nblock; ++ibl) { + quantize_iq1_s(x + ibl*QK_K, &y[ibl], 1, QK_K, qw); + } +} + +void quantize_row_iq1_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { + quantize_row_iq1_s_ref(x, (block_iq1_s *)y, k); +} + static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights, float * scales, float * weight, @@ -15129,6 +15245,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte case GGML_TYPE_IQ3_XXS_R4: break; case GGML_TYPE_IQ3_S_R4: break; case GGML_TYPE_IQ2_S_R4: break; + case GGML_TYPE_IQ1_S_R4: break; case GGML_TYPE_Q4_0_R4: break; case GGML_TYPE_Q5_0_R4: break; case GGML_TYPE_Q6_0_R4: break; |