diff options
author | Iwan Kawrakow <iwan.kawrakow@gmail.com> | 2024-06-20 18:39:31 +0300 |
---|---|---|
committer | Iwan Kawrakow <iwan.kawrakow@gmail.com> | 2024-06-22 12:02:52 +0300 |
commit | f0325c5826c55bb9796485d49bc971a17735e96a (patch) | |
tree | e70069ee59e64f3882468cc65f09831ae266d744 /iqk-quantize.cpp | |
parent | e05cca9ef652eee7b42927485a3821b14e3c565f (diff) |
bitnet(scale in a separate tensor): more CPU improvements
It seems it is enough to have 4 scales per row for Q8.
I get PPL = 8.5470 with this, which is slightly higher than
the 8.5430 we get with 1 scale per 128 activations, but still
OK, I think.
With this, we get the following performance:
| System    | quant | PP-512        | TG-128       | quant | PP-512        | TG-128       |
| M2 Max    | iq2bn | 229.02 ± 0.37 | 78.75 ± 0.61 | iq1bn | 146.67 ± 2.85 | 33.12 ± 0.03 |
| Ryzen7950 | iq2bn | 379.36 ± 1.03 | 49.08 ± 0.18 | iq1bn | 247.12 ± 1.53 | 32.80 ± 0.02 |
| Ryzen5975 | iq2bn | 465.28 ± 0.57 | 39.17 ± 0.02 | iq1bn | 325.86 ± 0.46 | 26.60 ± 0.10 |
Diffstat (limited to 'iqk-quantize.cpp')
-rw-r--r-- | iqk-quantize.cpp | 60 |
1 file changed, 17 insertions, 43 deletions
diff --git a/iqk-quantize.cpp b/iqk-quantize.cpp index 1a672803..40eff93f 100644 --- a/iqk-quantize.cpp +++ b/iqk-quantize.cpp @@ -355,8 +355,8 @@ void ggml_vec_dot_iq2_bn_q8_K64(int n, float * s, size_t bs, const void * vx, si } void quantize_row_q8_K64_reference(const float * x, block_q8_K64 * y, int64_t k) { - assert(k % 64 == 0); - const int64_t nb = k / 64; + //assert(k % 64 == 0); + //const int64_t nb = k / 64; // Check if a row-wise scale works. It almost does, PPL is only ~0.02 higher //float amax = 0; @@ -374,50 +374,24 @@ void quantize_row_q8_K64_reference(const float * x, block_q8_K64 * y, int64_t k) // x += 64; //} - block_q8_K128 * yp = (block_q8_K128 *)y; - for (int i = 0; i < nb/2; i++) { - float max = 0; - float amax = 0; - for (int j = 0; j < 128; ++j) { - float ax = fabsf(x[j]); - if (ax > amax) { - amax = ax; max = x[j]; + float aux[4] = {0.f, 0.f, 0.f, 0.f}; + for (int j = 0; j < k; j += 16) { + for (int i = 0; i < 4; ++i) { + for (int l = 0; l < 4; ++l) { + float ax = fabsf(x[j+4*i+l]); + aux[i] = std::max(aux[i], ax); } } - if (!amax) { - yp[i].d = 0; - memset(yp[i].qs, 0, 128); - x += 128; - continue; - } - const float iscale = -127.f/max; - for (int j = 0; j < 128; ++j) { - int v = nearest_int(iscale*x[j]); - yp[i].qs[j] = MIN(127, v); - } - yp[i].d = 1/iscale; - x += 128; } - int i = 2*(nb/2); - if (i < nb) { - float max = 0; - float amax = 0; - for (int j = 0; j < 64; ++j) { - float ax = fabsf(x[j]); - if (ax > amax) { - amax = ax; max = x[j]; - } - } - if (!amax) { - yp[i/2].d = 0; - memset(yp[i/2].qs, 0, 64); - } else { - const float iscale = -127.f/max; - for (int j = 0; j < 64; ++j) { - int v = nearest_int(iscale*x[j]); - yp[i/2].qs[j] = MIN(127, v); - } - yp[i/2].d = 1/iscale; + float * dptr = (float *)y; + for (int i = 0; i < 4; ++i) { + dptr[i] = aux[i]/127; + aux[i] = dptr[i] > 0 ? 
1/dptr[i] : 0.f; + } + auto qs = (int8_t *)(dptr + 4); + for (int j = 0; j < k; j += 16) { + for (int i = 0; i < 4; ++i) { + for (int l = 0; l < 4; ++l) qs[j+4*i+l] = nearest_int(aux[i]*x[j+4*i+l]); } } } |