diff options
author | Kawrakow <iwankawrakow@gmail.com> | 2025-01-10 15:06:00 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-01-10 15:06:00 +0200 |
commit | b1363b6177661556750c110cf876e044e61af365 (patch) | |
tree | 5314e735bffc0eba02dd6c028e01cdd5fc863b02 /ggml/src/iqk/iqk_quantize.cpp | |
parent | 3e6851621c54e8424196810f2798811f069bcff1 (diff) |
Falcon3 changes (#168)
* Add Falcon3 pre-tokenizer (same as llama3)
* q8_k16: use integer arithmetic to sum row values
The existing implementation that just sums up the f32 quantizations
works fine for the original BitNet models and also for the TriLM
ternary models. But for Falcon3 I see a significant difference between
the CPU and the GPU perplexity. If I use the q8_K16 int8_t quants to sum
up the values in a row, then the CPU-GPU PPL difference becomes much
smaller, and we get a lower PPL than Microsoft BitNet, which claims
to be "lossless".
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src/iqk/iqk_quantize.cpp')
-rw-r--r-- | ggml/src/iqk/iqk_quantize.cpp | 13 |
1 files changed, 7 insertions, 6 deletions
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index 095235cd..cdb564f5 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -553,7 +553,7 @@ void quantize_row_q8_K16(const float * x, void * vy, int64_t nk) { float * dptr = (float *)vy; int8_t * qy = (int8_t *)(dptr + 5); int n64 = nk / 64; -#ifdef __AVX2__ +#ifdef z__AVX2__ __m256 sign_bit = _mm256_set1_ps(-0.f); __m256 vmax[4] = {}; __m256 vsum[4] = {}; @@ -594,7 +594,7 @@ void quantize_row_q8_K16(const float * x, void * vy, int64_t nk) { qy += 32; } } -#elif defined __ARM_NEON +#elif defined z__ARM_NEON static const uint8_t k_shuffle[16] = {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}; auto shuffle = vld1q_u8(k_shuffle); float32x4_t vmax[4] = {}; @@ -640,16 +640,17 @@ void quantize_row_q8_K16(const float * x, void * vy, int64_t nk) { dptr[k] = amax[k]/127; amax[k] = dptr[k] > 0 ? 1/dptr[k] : 0.f; } - double sumf = 0; + int sumi[4] = {}; for (int i64 = 0; i64 < n64; ++i64) { for (int k = 0; k < 4; ++k) { for (int j = 0; j < 16; ++j) { - sumf += x[64*i64 + 16*k + j]; - qy[64*i64 + 16*k + j] = nearest_int(amax[k]*x[64*i64 + 16*k + j]); + int ix = nearest_int(amax[k]*x[64*i64 + 16*k + j]); + sumi[k] += ix; + qy[64*i64 + 16*k + j] = ix; } } } - dptr[4] = sumf; + dptr[4] = dptr[0]*sumi[0] + dptr[1]*sumi[1] + dptr[2]*sumi[2] + dptr[3]*sumi[3]; #endif } |