diff options
Diffstat (limited to 'ggml/src/iqk/iqk_quantize.cpp')
-rw-r--r-- | ggml/src/iqk/iqk_quantize.cpp | 244 |
1 files changed, 244 insertions, 0 deletions
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index ca5e008a..78b25525 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -3419,6 +3419,250 @@ void vec_dot_iq4_ks_q8_k(int n, float * s, size_t bs, const void * vx, size_t b } namespace { +static void quantize_row_iq5_ks_impl(const int super_block_size, const int block_size, + int n_per_row, const float * x, char * cy, + float * all_scales, float * weight, + const int8_t * values, + const float * quant_weights, + const int ntry) { + + float * dptr = (float *)cy; + dptr[0] = 0; + block_iq5_ks * y = (block_iq5_ks *)(dptr + 1); + + const int8_t * shifted_values = values + 32; + + float amax_scale = 0; + + for (int ibl = 0; ibl < n_per_row/super_block_size; ++ibl) { + memset(&y[ibl], 0, sizeof(block_iq5_ks)); + const float * xbl = x + ibl*super_block_size; + auto scales = all_scales + ibl*(super_block_size/block_size); + float sigma2 = 0; + for (int j = 0; j < super_block_size; ++j) sigma2 += xbl[j]*xbl[j]; + sigma2 *= 2.f/super_block_size; + for (int ib = 0; ib < super_block_size/block_size; ++ib) { + const float * xb = xbl + ib*block_size; + if (quant_weights) { + const float * qw = quant_weights + ibl*super_block_size + ib*block_size; + for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j]; + } + float amax = 0, max = 0; + for (int j = 0; j < block_size; ++j) { + float ax = fabsf(xb[j]); + if (ax > amax) { + amax = ax; max = xb[j]; + } + } + if (amax < 1e-15f) { + scales[ib] = 0; + continue; + } + float d = ntry > 0 ? -max/values[0] : max/values[0]; + float id = 1/d; + float sumqx_p = 0, sumq2_p = 0; + float sumqx_m = 0, sumq2_m = 0; + for (int j = 0; j < block_size; ++j) { + float w = weight[j]; + float al = id*xb[j]; + int l = best_index_iq5nl(values, al); + float q = values[l]; + sumqx_p += w*q*xb[j]; + sumq2_p += w*q*q; + l = best_index_iq5nl(values, -al); + q = values[l]; + sumqx_m += w*q*xb[j]; + sumq2_m += w*q*q; + } + d = sumqx_p/sumq2_p; + bool is_shifted = false; + float best = d*sumqx_p; + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d*sumqx_m; + } + for (int itry = -ntry; itry <= ntry; ++itry) { + id = (itry + values[0])/max; + sumqx_p = sumq2_p = 0; + sumqx_m = sumq2_m = 0; + for (int j = 0; j < block_size; ++j) { + float w = weight[j]; + float al = id*xb[j]; + int l = best_index_iq5nl(values, al); + float q = values[l]; + sumqx_p += w*q*xb[j]; + sumq2_p += w*q*q; + l = best_index_iq5nl(values, -al); + q = values[l]; + sumqx_m += w*q*xb[j]; + sumq2_m += w*q*q; + } + if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) { + d = sumqx_p/sumq2_p; best = d * sumqx_p; is_shifted = false; + } + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = false; + } + id = (itry + shifted_values[0])/max; + sumqx_p = sumq2_p = 0; + sumqx_m = sumq2_m = 0; + for (int j = 0; j < block_size; ++j) { + float w = weight[j]; + float al = id*xb[j]; + int l = best_index_iq5nl(shifted_values, al); + float q = shifted_values[l]; + sumqx_p += w*q*xb[j]; + sumq2_p += w*q*q; + l = best_index_iq5nl(shifted_values, -al); + q = shifted_values[l]; + sumqx_m += w*q*xb[j]; + sumq2_m += w*q*q; + } + if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) { + d = sumqx_p/sumq2_p; best = d * sumqx_p; is_shifted = true; + } + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = true; + } + } + if (is_shifted) y[ibl].scales[ib] = 0x01; + scales[ib] = d; + amax_scale = std::max(amax_scale, std::abs(d)); + } + } + float d = amax_scale/127; + *dptr = d; + if (!d) return; + float id = d ? 1/d : 0.f; + float sumqx = 0, sumq2 = 0; + for (int ibl = 0; ibl < n_per_row/super_block_size; ++ibl) { + const float * xbl = x + ibl*super_block_size; + float sigma2 = 0; + for (int j = 0; j < super_block_size; ++j) sigma2 += xbl[j]*xbl[j]; + sigma2 *= 2.f/super_block_size; + auto scales = all_scales + (super_block_size/block_size)*ibl; + for (int ib = 0; ib < super_block_size/block_size; ++ib) { + const int8_t * block_values = y[ibl].scales[ib] & 0x01 ? shifted_values : values; + int l = nearest_int(0.5f*(id*scales[ib]+127.f)); + l = std::max(0, std::min(127, l)) << 1; + y[ibl].scales[ib] |= l; + l -= 127; + float dl = d * l; + float idl = dl ? 1/dl : 0.f; + const float * xb = xbl + ib*block_size; + if (quant_weights) { + const float * qw = quant_weights + ibl*super_block_size + ib*block_size; + for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j]; + } + for (int j = 0; j < block_size; ++j) { + uint8_t idx = best_index_iq5nl(block_values, idl*xb[j]); + y[ibl].qs[block_size*(ib/2) + j] |= ((idx & 0xf) << 4*(ib%2)); + y[ibl].qh[j] |= ((idx >> 4) << ib); + float w = weight[j]; + float q = block_values[idx]*l; + sumqx += w*q*xb[j]; + sumq2 += w*q*q; + } + } + } + if (sumq2 > 0) *dptr = sumqx/sumq2; +} +} + +void quantize_row_iq5_ks_ref(const float * x, block_iq5_ks * y, int64_t k) { + quantize_iq5_ks(x, (void *)y, 1, k, nullptr); +} + +void quantize_row_iq5_ks(const float * x, void * y, int64_t k) { + quantize_iq5_ks(x, (void *)y, 1, k, nullptr); +} + +size_t quantize_iq5_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { + constexpr int kBlockSize = 32; + GGML_ASSERT(n_per_row%QK_K == 0); + auto row_size = ggml_row_size(GGML_TYPE_IQ5_KS, n_per_row); + char * qrow = (char *)dst; + float weight[kBlockSize]; + std::vector<float> all_scales(n_per_row/kBlockSize); + for (int64_t row = 0; row < nrows; ++row) { + quantize_row_iq5_ks_impl(QK_K, kBlockSize, n_per_row, src, qrow, all_scales.data(), weight, iq5nl_values, imatrix, 5); + src += n_per_row; + qrow += row_size; + } + return nrows * row_size; +} + +void dequantize_row_iq5_ks(const block_iq5_ks * x, float * y, int64_t k) { + constexpr int kBlockSize = 32; + GGML_ASSERT(k%QK_K == 0); + const float * dptr = (const float *)x; + float d = *dptr; + x = (const block_iq5_ks *)(dptr + 1); + int nblock = k/QK_K; + for (int ibl = 0; ibl < nblock; ++ibl) { + auto qs = x[ibl].qs; + auto qh = x[ibl].qh; + for (int ib64 = 0; ib64 < QK_K/(2*kBlockSize); ++ib64) { + float dl1 = d * ((int)(x[ibl].scales[2*ib64+0] & 254) - 127); + float dl2 = d * ((int)(x[ibl].scales[2*ib64+1] & 254) - 127); + const int8_t * values1 = iq5nl_values + ((x[ibl].scales[2*ib64+0] & 1) << 5); + const int8_t * values2 = iq5nl_values + ((x[ibl].scales[2*ib64+1] & 1) << 5); + for (int j = 0; j < kBlockSize; ++j) { + y[j ] = dl1 * values1[(qs[j] & 0xf) | (((qh[j] >> (2*ib64+0)) & 1) << 4)]; + y[j+kBlockSize] = dl2 * values2[(qs[j] >> 4) | (((qh[j] >> (2*ib64+1)) & 1) << 4)]; + } + y += 2*kBlockSize; + qs += kBlockSize; + } + } +} + +void vec_dot_iq5_ks_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { + constexpr int kBlockSize = 32; +#if GGML_USE_IQK_MULMAT + if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ5_KS, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { + return; + } +#endif + GGML_ASSERT(n%QK_K == 0); + GGML_ASSERT(nrc == 1); + GGML_UNUSED(bs); + GGML_UNUSED(bx); + GGML_UNUSED(by); + const float * dptr = (const float *)vx; + const float d = *dptr; + const block_iq5_ks * x = (const block_iq5_ks *)(dptr + 1); + const block_q8_K * y = (const block_q8_K *)vy; + int nblock = n/QK_K; + float sumf = 0; + for (int ibl = 0; ibl < nblock; ++ibl) { + auto qy = y[ibl].qs; + auto qs = x[ibl].qs; + auto qh = x[ibl].qh; + float db = d * y[ibl].d; + for (int ib64 = 0; ib64 < QK_K/(2*kBlockSize); ++ib64) { + float dl1 = db * ((int)(x[ibl].scales[2*ib64+0] & 254) - 127); + float dl2 = db * ((int)(x[ibl].scales[2*ib64+1] & 254) - 127); + const int8_t * values1 = iq5nl_values + ((x[ibl].scales[2*ib64+0] & 1) << 5); + const int8_t * values2 = iq5nl_values + ((x[ibl].scales[2*ib64+1] & 1) << 5); + int suml1 = 0; + int suml2 = 0; + for (int j = 0; j < kBlockSize; ++j) { + suml1 += qy[j ] * values1[(qs[j] & 0xf) | (((qh[j] >> (2*ib64+0)) & 1) << 4)]; + suml2 += qy[j+kBlockSize] * values2[(qs[j] >> 4) | (((qh[j] >> (2*ib64+1)) & 1) << 4)]; + } + sumf += dl1*suml1 + dl2*suml2; + y += 2*kBlockSize; + qs += kBlockSize; + } + } + *s = sumf; +} + +namespace { const uint16_t * scramble_table() { static std::mutex mutex; static std::vector<uint16_t> table; |