From e05cca9ef652eee7b42927485a3821b14e3c565f Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Thu, 20 Jun 2024 15:20:50 +0300 Subject: bitnet(scale in a separate tensor): CPU improvements Arrange Q8 quants in blocks of 128 and adapt iqk_mul_mat to deal with that. This improves PP speed by a few percent. --- iqk-quantize.cpp | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) (limited to 'iqk-quantize.cpp') diff --git a/iqk-quantize.cpp b/iqk-quantize.cpp index 6622d5ba..1a672803 100644 --- a/iqk-quantize.cpp +++ b/iqk-quantize.cpp @@ -374,29 +374,51 @@ void quantize_row_q8_K64_reference(const float * x, block_q8_K64 * y, int64_t k) // x += 64; //} - for (int i = 0; i < nb; i++) { - + block_q8_K128 * yp = (block_q8_K128 *)y; + for (int i = 0; i < nb/2; i++) { float max = 0; float amax = 0; - for (int j = 0; j < 64; ++j) { + for (int j = 0; j < 128; ++j) { float ax = fabsf(x[j]); if (ax > amax) { amax = ax; max = x[j]; } } if (!amax) { - y[i].d = 0; - memset(y[i].qs, 0, 64); - x += 64; + yp[i].d = 0; + memset(yp[i].qs, 0, 128); + x += 128; continue; } const float iscale = -127.f/max; - for (int j = 0; j < 64; ++j) { + for (int j = 0; j < 128; ++j) { int v = nearest_int(iscale*x[j]); - y[i].qs[j] = MIN(127, v); + yp[i].qs[j] = MIN(127, v); + } + yp[i].d = 1/iscale; + x += 128; + } + int i = 2*(nb/2); + if (i < nb) { + float max = 0; + float amax = 0; + for (int j = 0; j < 64; ++j) { + float ax = fabsf(x[j]); + if (ax > amax) { + amax = ax; max = x[j]; + } + } + if (!amax) { + yp[i/2].d = 0; + memset(yp[i/2].qs, 0, 64); + } else { + const float iscale = -127.f/max; + for (int j = 0; j < 64; ++j) { + int v = nearest_int(iscale*x[j]); + yp[i/2].qs[j] = MIN(127, v); + } + yp[i/2].d = 1/iscale; } - y[i].d = 1/iscale; - x += 64; } } -- cgit v1.2.3