summaryrefslogtreecommitdiff
path: root/examples/quantize-stats/quantize-stats.cpp
diff options
context:
space:
mode:
author: Kawrakow <iwankawrakow@gmail.com> 2024-10-16 15:18:26 +0300
committer: GitHub <noreply@github.com> 2024-10-16 15:18:26 +0300
commit: 76b97c80645362ac65a2e33043fd8d46bdaf8c56 (patch)
tree: b2b8ab9efb91a6ce4dd9d0fccbc9e11141ca1d80 /examples/quantize-stats/quantize-stats.cpp
parent: 993ca95e9e3108f0352fa2a3384cab0775c7f7c1 (diff)
Adding IQ4_KSS: 4.0 bpw quants (#89)
* iq4_kss: WIP
* iq4_kss: CUDA dequantize works
  So we can run perplexity. Sadly, the result does not look good on the bpw vs quantization error plot.
* iq4_kss: slightly better quantization
* iq4_kss: another small quantization improvement
* iq4_kss: CUDA works
  TG-128 performance is very decent with 131 t/s for LLaMA-3.1-8B. In comparison, we have 123 t/s for q4_0 and 128 t/s for iq4_ks. I.e., the reduced model size more than offsets the additional bit fiddling required for iq4_kss.
* iq4_kss: new bit arrangement - CUDA and Zen4 work
  Did not lose performance on CUDA. Zen4 is decent, but not great: PP-512(LLaMA-3.1-8B) = 163 t/s. TG-128 is of course better than other 4-bit quants due to smaller model size. We get 14.5 t/s @ 8 threads.
* iq4_kss: ARM_NEON. Predictably very slow
* iq4_kss: Metal
  PP is not too bad - just 10% slower than q4_0. But TG is 30% slower, i.e., predictably bad.
* iq4_kss: somewhat faster Metal dot product
  45.75 t/s -> 48.75 t/s. Still 22% slower than q4_0
* iq4_kss: AVX2
  Bad, but better than I expected. PP-512(LLaMA-3.1-8B) = 167 t/s on the Ryzen-5950X. I.e., with 32 AVX2 threads we get the performance of 16 Zen4 threads.
* iq4_kss: very slightly faster Metal dot product
  48.7 t/s -> 49.3 t/s

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'examples/quantize-stats/quantize-stats.cpp')
-rw-r--r-- examples/quantize-stats/quantize-stats.cpp 50
1 files changed, 34 insertions, 16 deletions
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 34d05bf2..ff4e9bd4 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -256,6 +256,8 @@ static void analyze_iq4ks(const char * name, int nrows, int n_per_row, const flo
float mse0 = 0, mse = 0;
auto compute = [&mutex, &counter, &mse0, &mse, values, row_size, nblock, nrows, n_per_row, chunk] () {
std::vector<char> Q(row_size);
+ float diff[4];
+ float xv[4];
float lmse0 = 0, lmse = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
@@ -282,25 +284,41 @@ static void analyze_iq4ks(const char * name, int nrows, int n_per_row, const flo
for (int j = 0; j < 16; j += 2) {
uint16_t v0 = *(const uint16_t *)(qs + j);
int non = popcount(v0);
- float diff1 = xb[j+ 0] - dl*values[qs[j+0] & 0xf];
- float diff2 = xb[j+16] - dl*values[qs[j+0] >> 4];
- float diff3 = xb[j+ 1] - dl*values[qs[j+1] & 0xf];
- float diff4 = xb[j+17] - dl*values[qs[j+1] >> 4];
- lmse0 += diff1*diff1 + diff2*diff2 + diff3*diff3 + diff4*diff4;
+ xv[0] = xb[j+ 0]; xv[1] = xb[j+16]; xv[2] = xb[j+ 1]; xv[3] = xb[j+17];
+ diff[0] = xv[0] - dl*values[qs[j+0] & 0xf];
+ diff[1] = xv[1] - dl*values[qs[j+0] >> 4];
+ diff[2] = xv[2] - dl*values[qs[j+1] & 0xf];
+ diff[3] = xv[3] - dl*values[qs[j+1] >> 4];
+ float diff4 = diff[0]*diff[0] + diff[1]*diff[1] + diff[2]*diff[2] + diff[3]*diff[3];
+ lmse0 += diff4;
if (non%2 == 0) {
- lmse += diff1*diff1 + diff2*diff2 + diff3*diff3 + diff4*diff4;
+ lmse += diff4;
} else {
float best = std::numeric_limits<float>::max();
- for (int k = 0; k < 16; k += 4) {
- uint16_t v = v0 ^ (1 << k);
- uint8_t v1 = v;
- uint8_t v2 = v >> 8;
- diff1 = xb[j+ 0] - dl*values[v1 & 0xf];
- diff2 = xb[j+16] - dl*values[v1 >> 4];
- diff3 = xb[j+ 1] - dl*values[v2 & 0xf];
- diff4 = xb[j+17] - dl*values[v2 >> 4];
- float score = diff1*diff1 + diff2*diff2 + diff3*diff3 + diff4*diff4;
- if (score < best) best = score;
+ //for (int k = 0; k < 16; k += 4) {
+ // uint16_t v = v0 ^ (1 << k);
+ // uint8_t v1 = v;
+ // uint8_t v2 = v >> 8;
+ // diff1 = xb[j+ 0] - dl*values[v1 & 0xf];
+ // diff2 = xb[j+16] - dl*values[v1 >> 4];
+ // diff3 = xb[j+ 1] - dl*values[v2 & 0xf];
+ // diff4 = xb[j+17] - dl*values[v2 >> 4];
+ // float score = diff1*diff1 + diff2*diff2 + diff3*diff3 + diff4*diff4;
+ // if (score < best) best = score;
+ //}
+ for (int k = 0; k < 4; ++k) {
+ uint16_t v = (v0 >> 4*k) & 0xf;
+ auto pc = popcount(v);
+ if (v > 0 && popcount(v-1u) != pc) {
+ float this_diff = xv[k] - dl*values[v-1u];
+ float score = diff4 - diff[k]*diff[k] + this_diff*this_diff;
+ if (score < best) best = score;
+ }
+ if (v < 15 && popcount(v + 1u) != pc) {
+ float this_diff = xv[k] - dl*values[v+1u];
+ float score = diff4 - diff[k]*diff[k] + this_diff*this_diff;
+ if (score < best) best = score;
+ }
}
lmse += best;
}