diff options
author | Kawrakow <iwankawrakow@gmail.com> | 2025-07-14 18:55:08 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-07-14 18:55:08 +0200 |
commit | 45fae1a14444622478774f9a417e1d417af1ca46 (patch) | |
tree | 2609ef06be5640749834d4fc691446771ab29f42 /examples/quantize-stats | |
parent | f5353047ef461e6fc9d527e09a06c9802c699929 (diff) |
Adding IQ2_KL (#602)
* Experiments for 2.6875 bpw quants
At least according to rmse, this is significantly better than
q2_K, while using only 1/16 more bits per weight.
* iq2_kl: basics
* iq2_kl: CUDA dequantize
* iq2_kl: small improvement in PPL
Also check the two neighbouring values for the block scale
and use the one that minimizes RMSE.
* iq2_kl: MMQ
Quite good: PP-512(L3-8B) = 8472 t/s.
* iq2_kl: MMVQ
We get PP-128(L3-8B) = 162 t/s.
Which means that this is not quite as good as it should be as
(almost) same bpq q2_K is at 170 t/s.
* iq2_kl: Zen4 GEMM/GEMV
Not particularly fast. I may need to think about rearranging the bits.
* iq2_kl: better Zen4
* iq2_kl: convert/repack to q8_k_r8 (AVX2)
* iq2_kl: AVX2 GEMM/GEMV
* iq2_kl: WIP NEON
The compiler started crashing!!!
* iq2_kl: NEON
Had to work around a compiler crash when using vzip2q_u8 using
vqtbl2q_u8.
* iq2_kl: convert/repack to q8_k_r8 (NEON)
* iq2_kl: Metal dequantize
* iq2_kl: Metal GEMV - pretty slow
* iq2_kl: Metal GEMV - slightly better (40 t/s -> 44.5 t/s)
* iq2_kl: Metal GEMV - slightly better (44.5 t/s -> 46.5 t/s)
* iq2_kl: Metal GEMV - slightly better (46.5 t/s -> 47.2 t/s)
* iq2_kl: slightly better Metal dequantize
PP-512 goes to 476 t/s up from 466 t/s.
* iq2_kl: slightly better Metal dequantize
PP-512 goes to 492 t/s up from 476 t/s.
* Add iq2_kl to constants.py
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'examples/quantize-stats')
-rw-r--r-- | examples/quantize-stats/quantize-stats.cpp | 529 |
1 files changed, 529 insertions, 0 deletions
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index d1598aec..02cfb25d 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -831,6 +831,477 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float * sqrt(mse_q/(n_per_row*nrows)), sqrt(tot_mse_q/tot_elements)); } +static const int8_t iq3nl_index[111] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, + 9, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 10, 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 11, 11, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 12, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 13, 13, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 14, 14, 7, 7, 7, 7, 7, 7, 7, 7, 7 +}; +static inline int best_index_iq3nl(const int8_t * values, float x) { + int ix = (int)x - values[0]; + if (ix < 0 || ix >= 111) return ix < 0 ? 0 : 7; + ix = iq3nl_index[ix]; + return ix < 8 ? ix : x - values[ix-8] < values[ix-7] - x ? ix-8 : ix-7; +} + +static void analyze_iq2kl(const char * name, int nrows, int n_per_row, const float * x_values, const float * imatrix, float& tot_mse, float& tot_elements) { + constexpr int kBlockSize = 32; + constexpr int ntry = 5; + static const int k_index[64] = {-1, 0, -2, 1, -3, -4, 2, -5, -6, -7, -8, 3, -9, 4, -10, -11, 5, 6, 7, -12, 8, 9, 10, 11, -13, -14, -15, -16, 12, 13, + -17, -18, -19, -20, 14, 15, 16, 17, 18, -21, 19, 20, 21, 22, 23, 24, -22, 25, -23, -24, 26, -25, 27, -26, 28, -27, -28, 29, -29, 30, -30, -31, 31, -32,}; + static const std::vector<std::vector<int>> k_neighbours = { + { 0, 5, 6, 1, 7, 3, 8, 14, }, + { 1, 0, 3, 7, 4, 6, 8, 2, }, + { 1, 3, 4, 2, 8, 0, 9, 7, }, + { 2, 1, 4, 3, 9, 8, 10, 11, }, + { 2, 11, 4, 10, 9, 1, 8, 3, }, + { 5, 6, 0, 7, 3, 19, 14, 1, }, + { 6, 0, 7, 5, 3, 1, 8, 14, }, + { 3, 7, 6, 1, 0, 8, 4, 12, }, + { 3, 4, 8, 9, 1, 7, 12, 10, }, + { 4, 10, 9, 2, 11, 8, 13, 3, }, + { 11, 10, 2, 4, 9, 18, 13, 8, }, + { 8, 7, 3, 12, 9, 15, 16, 13, }, + { 5, 19, 6, 20, 14, 7, 21, 15, }, + { 6, 14, 7, 20, 5, 21, 15, 19, }, + { 14, 7, 15, 6, 21, 12, 16, 22, }, + { 12, 15, 16, 8, 14, 7, 13, 22, }, + { 18, 10, 13, 17, 9, 11, 12, 24, }, + { 11, 18, 25, 10, 13, 17, 9, 24, }, + { 19, 5, 20, 6, 14, 21, 7, 26, }, + { 20, 14, 21, 6, 19, 7, 15, 26, }, + { 25, 18, 11, 10, 28, 17, 13, 24, }, + { 18, 24, 28, 25, 17, 23, 13, 16, }, + { 19, 20, 29, 26, 21, 14, 5, 22, }, + { 20, 26, 29, 21, 19, 14, 22, 30, }, + { 27, 26, 22, 23, 30, 21, 15, 24, }, + { 27, 24, 28, 23, 31, 17, 22, 16, }, + { 25, 28, 31, 18, 24, 17, 27, 23, }, + { 29, 19, 20, 26, 21, 30, 14, 22, }, + { 30, 29, 26, 27, 21, 22, 20, 23, }, + { 30, 27, 31, 26, 28, 23, 22, 24, }, + { 31, 27, 30, 28, 24, 23, 26, 22, }, + { 31, 28, 25, 24, 18, 27, 30, 17, }, + }; + //static const int k_index[64] = {-1, -2, -3, 0, -4, -5, -6, -7, -8, 1, -9, -10, -11, 2, 3, -12, -13, -14, 4, 5, 6, 7, 8, -15, 9, -16, 10, 11, 12, 13, 14, + // -17, -18, -19, 15, 16, 17, 18, 19, -20, -21, 20, 21, 22, 23, 24, 25, -22, -23, 26, 27, 28, 29, 30, 31, -24, -25, -26, -27, -28, -29, -30, -31, -32,}; + //static const std::vector<std::vector<int>> k_neighbours = { + // { 1, 4, }, + // { 1, 0, 4, 5, }, + // { 0, 1, 4, 5, 6, }, + // { 0, 2, 6, 3, 5, 7, 4, 8, }, + // { 2, 3, 0, 7, 6, 8, 5, }, + // { 3, 2, 8, 7, 6, }, + // { 3, 2, 8, 7, }, + // { 1, 9, 4, 10, }, + // { 1, 4, 0, 5, 10, 6, 11, 9, }, + // { 0, 5, 4, 6, 1, 2, 11, 7, }, + // { 2, 6, 0, 5, 7, 3, 12, 4, }, + // { 3, 8, 2, 7, 14, 13, }, + // { 9, 1, 4, 10, 15, }, + // { 1, 4, 9, 10, 5, 11, 15, 0, }, + // { 8, 3, 14, 7, 2, 13, 19, 18, }, + // { 9, 10, 4, 15, 1, 11, 20, 5, }, + // { 14, 8, 19, 13, 3, 7, 18, 25, }, + // { 9, 20, 15, 10, 21, 26, 4, 27, }, + // { 15, 20, 9, 10, 21, 16, 26, 4, }, + // { 19, 14, 25, 18, 8, 13, 24, 31, }, + // { 20, 26, 9, 21, 15, 27, 10, }, + // { 25, 19, 31, 24, 14, 18, 30, 13, }, + // { 26, 20, 27, 21, 15, }, + // { 31, 25, 30, 19, 24, 18, }, + // { 26, 20, 27, 21, }, + // { 26, 27, 20, 21, 28, 22, }, + // { 27, 26, 28, 21, 20, 22, 29, 23, }, + // { 28, 27, 29, 22, 21, 23, 26, 30, }, + // { 29, 28, 30, 23, 22, 24, 27, 31, }, + // { 30, 29, 31, 24, 23, 25, 28, 22, }, + // { 31, 30, 25, 24, 29, 23, }, + // { 31, 25, 30, 24, }, + //}; + auto values = iq3nl_values; + std::vector<std::pair<int8_t, int8_t>> grid(32); + for (int j = 0; j < 64; ++j) { + if (int i = k_index[j]; i >= 0) { + int i1 = j/8, i2 = j%8; + grid[i] = {values[i1], values[i2]}; + } + } + auto index = [&grid, values] (float id, float x1, float x2, float w1, float w2) { + float sx1 = id*x1; + float sx2 = id*x2; + int l1 = best_index_iq3nl(values, sx1); + int l2 = best_index_iq3nl(values, sx2); + int i = k_index[8*l1 + l2]; + if (i >= 0) return i; + auto& neigh = k_neighbours[-i-1]; + // d*q - x1 = d*(q - x1/d) + float best = std::numeric_limits<float>::max(); + int ibest = -1; + //printf("sx1 = %g, sx2 = %g, l1 = %d, l2 = %d, %d neighbours\n", sx1, sx2, l1, l2, int(neigh.size())); + for (auto& n : neigh) { + //printf(" neigh %d,%d: %d %d\n", grid[n].first, grid[n].second, values[grid[n].first ], values[grid[n].second]); + float diff1 = grid[n].first - sx1; + float diff2 = grid[n].second - sx2; + float score = w1*diff1*diff1 + w2*diff2*diff2; + if (score < best) { + best = score; ibest = n; + } + } + GGML_ASSERT(ibest >= 0); + return ibest; + }; + auto compute_1row = [&] (const float * xr) { + float weight[kBlockSize]; + int nblock = n_per_row/kBlockSize; + int last_ibl = -1; + float sigma2 = 0; + float mse = 0, sum_x2 = 0; + for (int ib = 0; ib < nblock; ++ib) { + auto xb = xr + ib*kBlockSize; + int ibl = ib/8; + if (ibl != last_ibl) { + int n = std::min(256, n_per_row - ib*kBlockSize); + float sumx2 = 0; + for (int j = 0; j < n; ++j) sumx2 += xb[j]*xb[j]; + sigma2 = 2*sumx2/n; + last_ibl = ibl; + } + if (imatrix) { + auto qw = imatrix + ib*kBlockSize; + for (int j = 0; j < kBlockSize; ++j) weight[j] = qw[j]*sqrt(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < kBlockSize; ++j) weight[j] = std::abs(xb[j]); //xb[j]*xb[j]; + } + float amax = 0, max = 0; + for (int j = 0; j < kBlockSize; ++j) { + float ax = std::abs(xb[j]); + if (ax > amax) { + amax = ax; max = xb[j]; + } + } + if (!amax) { + continue; + } + float d = ntry > 0 ? -max/values[0] : max/values[0]; + float id = 1/d; + float sumqx_p = 0, sumq2_p = 0; + float sumqx_m = 0, sumq2_m = 0; + for (int j = 0; j < kBlockSize; j += 2) { + float w1 = weight[j+0]; + float w2 = weight[j+1]; + int idx = index(id, xb[j+0], xb[j+1], w1, w2); + float q1 = grid[idx].first ; + float q2 = grid[idx].second; + sumqx_p += w1*q1*xb[j] + w2*q2*xb[j+1]; + sumq2_p += w1*q1*q1 + w2*q2*q2; + idx = index(-id, xb[j+0], xb[j+1], w1, w2); + q1 = grid[idx].first ; + q2 = grid[idx].second; + sumqx_m += w1*q1*xb[j] + w2*q2*xb[j+1]; + sumq2_m += w1*q1*q1 + w2*q2*q2; + } + d = sumqx_p/sumq2_p; + float best = d*sumqx_p; + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d*sumqx_m; + } + for (int itry = -ntry; itry <= ntry; ++itry) { + id = (itry + values[0])/max; + sumqx_p = sumq2_p = 0; + sumqx_m = sumq2_m = 0; + for (int j = 0; j < kBlockSize; j += 2) { + float w1 = weight[j+0]; + float w2 = weight[j+1]; + int idx = index(id, xb[j+0], xb[j+1], w1, w2); + float q1 = grid[idx].first ; + float q2 = grid[idx].second; + sumqx_p += w1*q1*xb[j] + w2*q2*xb[j+1]; + sumq2_p += w1*q1*q1 + w2*q2*q2; + idx = index(-id, xb[j+0], xb[j+1], w1, w2); + q1 = grid[idx].first ; + q2 = grid[idx].second; + sumqx_m += w1*q1*xb[j] + w2*q2*xb[j+1]; + sumq2_m += w1*q1*q1 + w2*q2*q2; + } + if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) { + d = sumqx_p/sumq2_p; best = d * sumqx_p; + } + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d * sumqx_m; + } + } + id = 1/d; + float block_mse = 0; + for (int j = 0; j < kBlockSize; j += 2) { + int idx = index(id, xb[j+0], xb[j+1], weight[j], weight[j+1]); + float q1 = grid[idx].first ; + float q2 = grid[idx].second; + float diff1 = d*q1 - xb[j+0]; + float diff2 = d*q2 - xb[j+1]; + block_mse += diff1*diff1 + diff2*diff2; + sum_x2 += xb[j+0]*xb[j+0] + xb[j+1]*xb[j+1]; + } + mse += block_mse; + } + return std::make_pair(mse, sum_x2); + }; + std::mutex mutex; + int counter = 0; + float mse = 0, sum_x2 = 0; + auto compute = [&mutex, &counter, &compute_1row, &mse, &sum_x2, x_values, nrows, n_per_row] () { + float local_mse = 0, local_x2 = 0; + while (true) { + std::unique_lock<std::mutex> lock(mutex); + int row = counter++; + if (row >= nrows) { + mse += local_mse; sum_x2 += local_x2; + return; + } + lock.unlock(); + auto [row_mse, row_x2] = compute_1row(x_values + row*n_per_row); + local_mse += row_mse; + local_x2 += row_x2; + } + }; + int nthread = std::thread::hardware_concurrency()/2; + std::vector<std::thread> workers(nthread-1); + for (auto& w : workers) w = std::thread(compute); + compute(); + for (auto& w : workers) w.join(); + //float weight[kBlockSize]; + //int nblock = n_per_row/kBlockSize; + //int last_ibl = -1; + //float sigma2 = 0; + //auto shifted_values = values + 8; + //float mse = 0, sum_x2 = 0; + //for (int row = 0; row < nrows; ++row) { + // auto xr = x_values + row*n_per_row; + // for (int ib = 0; ib < nblock; ++ib) { + // auto xb = xr + ib*kBlockSize; + // int ibl = ib/8; + // if (ibl != last_ibl) { + // int n = std::min(256, n_per_row - ib*kBlockSize); + // float sumx2 = 0; + // for (int j = 0; j < n; ++j) sumx2 += xb[j]*xb[j]; + // sigma2 = 2*sumx2/n; + // last_ibl = ibl; + // } + // if (imatrix) { + // auto qw = imatrix + ib*kBlockSize; + // for (int j = 0; j < kBlockSize; ++j) weight[j] = qw[j]*sqrt(sigma2 + xb[j]*xb[j]); + // } else { + // for (int j = 0; j < kBlockSize; ++j) weight[j] = xb[j]*xb[j]; + // } + // float amax = 0, max = 0; + // for (int j = 0; j < kBlockSize; ++j) { + // float ax = std::abs(xb[j]); + // if (ax > amax) { + // amax = ax; max = xb[j]; + // } + // } + // if (!amax) { + // continue; + // } + // float d = ntry > 0 ? -max/values[0] : max/values[0]; + // float id = 1/d; + // float sumqx_p = 0, sumq2_p = 0; + // float sumqx_m = 0, sumq2_m = 0; + // for (int j = 0; j < kBlockSize; j += 2) { + // float w1 = weight[j+0]; + // float w2 = weight[j+1]; + // int idx = index(id, xb[j+0], xb[j+1], w1, w2); + // float q1 = grid[idx].first ; + // float q2 = grid[idx].second; + // sumqx_p += w1*q1*xb[j] + w2*q2*xb[j+1]; + // sumq2_p += w1*q1*q1 + w2*q2*q2; + // idx = index(-id, xb[j+0], xb[j+1], w1, w2); + // q1 = grid[idx].first ; + // q2 = grid[idx].second; + // sumqx_m += w1*q1*xb[j] + w2*q2*xb[j+1]; + // sumq2_m += w1*q1*q1 + w2*q2*q2; + // } + // d = sumqx_p/sumq2_p; + // float best = d*sumqx_p; + // if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + // d = sumqx_m/sumq2_m; best = d*sumqx_m; + // } + // for (int itry = -ntry; itry <= ntry; ++itry) { + // id = (itry + values[0])/max; + // sumqx_p = sumq2_p = 0; + // sumqx_m = sumq2_m = 0; + // for (int j = 0; j < kBlockSize; j += 2) { + // float w1 = weight[j+0]; + // float w2 = weight[j+1]; + // int idx = index(id, xb[j+0], xb[j+1], w1, w2); + // float q1 = grid[idx].first ; + // float q2 = grid[idx].second; + // sumqx_p += w1*q1*xb[j] + w2*q2*xb[j+1]; + // sumq2_p += w1*q1*q1 + w2*q2*q2; + // idx = index(-id, xb[j+0], xb[j+1], w1, w2); + // q1 = grid[idx].first ; + // q2 = grid[idx].second; + // sumqx_m += w1*q1*xb[j] + w2*q2*xb[j+1]; + // sumq2_m += w1*q1*q1 + w2*q2*q2; + // } + // if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) { + // d = sumqx_p/sumq2_p; best = d * sumqx_p; + // } + // if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + // d = sumqx_m/sumq2_m; best = d * sumqx_m; + // } + // } + // id = 1/d; + // float block_mse = 0; + // for (int j = 0; j < kBlockSize; j += 2) { + // int idx = index(id, xb[j+0], xb[j+1], weight[j], weight[j+1]); + // float q1 = grid[idx].first ; + // float q2 = grid[idx].second; + // float diff1 = d*q1 - xb[j+0]; + // float diff2 = d*q2 - xb[j+1]; + // block_mse += diff1*diff1 + diff2*diff2; + // sum_x2 += xb[j+0]*xb[j+0] + xb[j+1]*xb[j+1]; + // } + // mse += block_mse; + // } + //} + tot_mse += mse; + tot_elements += sum_x2; + printf("%s: %g, %g %g\n", name, sqrt(mse/(nrows*n_per_row)), sqrt(mse/sum_x2), sqrt(tot_mse/tot_elements)); +} + + +static void analyze_iq3ks(const char * name, int nrows, int n_per_row, const float * x_values, const float * imatrix, float& tot_mse, float& tot_elements, + std::vector<int64_t>& Htot) { + constexpr int kBlockSize = 32; + constexpr int ntry = 5; + float weight[kBlockSize]; + int nblock = n_per_row/kBlockSize; + int last_ibl = -1; + float sigma2 = 0; + auto values = iq3nl_values; + auto shifted_values = values + 8; + std::vector<int64_t> H(64, 0); + float mse = 0; + for (int row = 0; row < nrows; ++row) { + auto xr = x_values + row*n_per_row; + for (int ib = 0; ib < nblock; ++ib) { + auto xb = xr + ib*kBlockSize; + int ibl = ib/8; + if (ibl != last_ibl) { + int n = std::min(256, n_per_row - ib*kBlockSize); + float sumx2 = 0; + for (int j = 0; j < n; ++j) sumx2 += xb[j]*xb[j]; + sigma2 = 2*sumx2/n; + last_ibl = ibl; + } + if (imatrix) { + auto qw = imatrix + ib*kBlockSize; + for (int j = 0; j < kBlockSize; ++j) weight[j] = qw[j]*sqrt(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < kBlockSize; ++j) weight[j] = xb[j]*xb[j]; + } + float amax = 0, max = 0; + for (int j = 0; j < kBlockSize; ++j) { + float ax = std::abs(xb[j]); + if (ax > amax) { + amax = ax; max = xb[j]; + } + } + if (!amax) { + continue; + } + float d = ntry > 0 ? -max/values[0] : max/values[0]; + float id = 1/d; + float sumqx_p = 0, sumq2_p = 0; + float sumqx_m = 0, sumq2_m = 0; + for (int j = 0; j < kBlockSize; ++j) { + float w = weight[j]; + float al = id*xb[j]; + int l = best_index_iq3nl(values, al); + float q = values[l]; + sumqx_p += w*q*xb[j]; + sumq2_p += w*q*q; + l = best_index_iq3nl(values, -al); + q = values[l]; + sumqx_m += w*q*xb[j]; + sumq2_m += w*q*q; + } + d = sumqx_p/sumq2_p; + bool is_shifted = false; + float best = d*sumqx_p; + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d*sumqx_m; + } + for (int itry = -ntry; itry <= ntry; ++itry) { + id = (itry + values[0])/max; + sumqx_p = sumq2_p = 0; + sumqx_m = sumq2_m = 0; + for (int j = 0; j < kBlockSize; ++j) { + float w = weight[j]; + float al = id*xb[j]; + int l = best_index_iq3nl(values, al); + float q = values[l]; + sumqx_p += w*q*xb[j]; + sumq2_p += w*q*q; + l = best_index_iq3nl(values, -al); + q = values[l]; + sumqx_m += w*q*xb[j]; + sumq2_m += w*q*q; + } + if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) { + d = sumqx_p/sumq2_p; best = d * sumqx_p; is_shifted = false; + } + if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = false; + } + //id = (itry + shifted_values[0])/max; + //sumqx_p = sumq2_p = 0; + //sumqx_m = sumq2_m = 0; + //for (int j = 0; j < kBlockSize; ++j) { + // float w = weight[j]; + // float al = id*xb[j]; + // int l = best_index_iq3nl(shifted_values, al); + // float q = shifted_values[l]; + // sumqx_p += w*q*xb[j]; + // sumq2_p += w*q*q; + // l = best_index_iq3nl(shifted_values, -al); + // q = shifted_values[l]; + // sumqx_m += w*q*xb[j]; + // sumq2_m += w*q*q; + //} + //if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) { + // d = sumqx_p/sumq2_p; best = d * sumqx_p; is_shifted = true; + //} + //if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { + // d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = true; + //} + } + auto block_values = is_shifted ? shifted_values : values; + id = 1/d; + float block_mse = 0; + for (int j = 0; j < kBlockSize; j += 2) { + int l1 = best_index_iq3nl(block_values, id*xb[j+0]); + int l2 = best_index_iq3nl(block_values, id*xb[j+1]); + float diff1 = d*block_values[l1] - xb[j+0]; + float diff2 = d*block_values[l2] - xb[j+1]; + block_mse += diff1*diff1 + diff2*diff2; + ++H[8*l1+l2]; + } + mse += block_mse; + } + } + tot_mse += mse; + tot_elements += nrows*n_per_row; + printf("%s: %g %f\n", name, sqrt(mse/(nrows*n_per_row)), sqrt(tot_mse/tot_elements)); + + if (Htot.empty()) Htot = std::move(H); + else { + if (Htot.size() != H.size()) printf("Oops: inconsistent H sizes %zu vs %zu\n", H.size(), Htot.size()); + else for (int j = 0; j < (int)H.size(); ++j) Htot[j] += H[j]; + } +} + static void analyze_iq4ks(const char * name, int nrows, int n_per_row, const float * values, float& tot_mse, float& tot_elements) { int row_size = ggml_row_size(GGML_TYPE_IQ4_KS, n_per_row); int nblock = n_per_row/QK_K; @@ -929,6 +1400,40 @@ static void analyze_iq4ks(const ggml_tensor * t, float& tot_mse, float& tot_mse_ } } +static void analyze_iq2kl(const ggml_tensor * t, float& tot_mse, float& tot_elements) { + if (!ggml_is_contiguous(t) || (t->type != GGML_TYPE_F32 && t->type != GGML_TYPE_F16 && t->type != GGML_TYPE_BF16)) { + return; + } + if (t->type == GGML_TYPE_F32) { + analyze_iq2kl(t->name, t->ne[1], t->ne[0], (const float *)t->data, nullptr, tot_mse, tot_elements); + } else { + std::vector<float> aux(t->ne[0]*t->ne[1]); + if (t->type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)t->data, aux.data(), aux.size()); + } else { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)t->data, aux.data(), aux.size()); + } + analyze_iq2kl(t->name, t->ne[1], t->ne[0], aux.data(), nullptr, tot_mse, tot_elements); + } +} + +static void analyze_iq3ks(const ggml_tensor * t, float& tot_mse, float& tot_elements, std::vector<int64_t>& Htot) { + if (!ggml_is_contiguous(t) || (t->type != GGML_TYPE_F32 && t->type != GGML_TYPE_F16 && t->type != GGML_TYPE_BF16)) { + return; + } + if (t->type == GGML_TYPE_F32) { + analyze_iq3ks(t->name, t->ne[1], t->ne[0], (const float *)t->data, nullptr, tot_mse, tot_elements, Htot); + } else { + std::vector<float> aux(t->ne[0]*t->ne[1]); + if (t->type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)t->data, aux.data(), aux.size()); + } else { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)t->data, aux.data(), aux.size()); + } + analyze_iq3ks(t->name, t->ne[1], t->ne[0], aux.data(), nullptr, tot_mse, tot_elements, Htot); + } +} + static void print_fp_stats(const char * msg, const uint64_t * counts) { printf("===== %s\n", msg); uint64_t tot = 0; for (int i = 0; i < 32; ++i) tot += counts[i]; @@ -1109,6 +1614,30 @@ int main(int argc, char ** argv) { std::vector<float> output_scratch; if (analyze) { + float tot_mse = 0, tot_elements = 0; + //std::vector<int64_t> Htot; + for (const auto& kv_tensor : tensors) { + if (!layer_included(params, kv_tensor.first)) { + continue; + } + if (kv_tensor.second->ne[0] == 1 || kv_tensor.second->ne[1] == 1) { + // we never quantize those + continue; + } + //analyze_iq3ks(kv_tensor.second, tot_mse, tot_elements, Htot); + analyze_iq2kl(kv_tensor.second, tot_mse, tot_elements); + } + //if (!Htot.empty()) { + // printf("=============================== pair histogram\n"); + // for (int i = 0; i < (int)Htot.size(); ++i) { + // int i1 = i/8, i2 = i%8; + // printf("%d %d %d %g\n", i, i1, i2, 1.*Htot[i]); + // } + //} + return 0; + } + + if (analyze) { float tot_mse = 0, tot_mse_q = 0, tot_elements = 0; for (const auto& kv_tensor : tensors) { if (!layer_included(params, kv_tensor.first)) { |