From d0b52076da0261f291b01f1ffa44884c8b2cdb1c Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Thu, 27 Mar 2025 05:49:16 +0100 Subject: Use bf16 instead of fp16 block scales for q8_1 (#292) * WIP - not working * q8_0 without bells and wistles works * It works for q8_0 * Use bf16 instead of f16,int16 * q4_0_r8 * q5_0_r4 * q6_0_r4 * Also q4_1 and q5_1 * q8_0_r8 on avx2 --------- Co-authored-by: Iwan Kawrakow --- ggml/src/iqk/iqk_quantize.cpp | 67 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 14 deletions(-) (limited to 'ggml/src/iqk/iqk_quantize.cpp') diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index 5e657f4a..cac1fd49 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -798,13 +798,14 @@ void quantize_row_q8_0_x4(const float * x, void * vy, int64_t k) { #endif } -void quantize_row_q8_1_x4(const float * x, void * vy, int64_t k) { +namespace { +template +void quantize_row_q8_1_x4_T(const float * x, Block * y, int64_t k) { assert(k % QK8_1 == 0); const int nb = k / QK8_1; const int nb4 = 4*(nb/4); - block_q8_1 * y = (block_q8_1 *)vy; - block_q8_1_x4 * y4 = (block_q8_1_x4 *)vy; + Block_x4 * y4 = (Block_x4 *)y; #if defined(__aarch64__) for (int i = 0; i < nb; i++) { int i4 = i/4, ir = i%4; @@ -851,10 +852,18 @@ void quantize_row_q8_1_x4(const float * x, void * vy, int64_t k) { accv = vaddq_s32(accv, vi); } - if (i < nb4) { - y4[i4].d[ir+4] = GGML_FP32_TO_FP16(d * vaddvq_s32(accv)); + if constexpr (std::is_same_v) { + if (i < nb4) { + y4[i4].d[ir+4] = GGML_FP32_TO_FP16(d * vaddvq_s32(accv)); + } else { + y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv)); + } } else { - y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv)); + if (i < nb4) { + y4[i4].s[ir] = vaddvq_s32(accv); + } else { + y[i].s = vaddvq_s32(accv); + } } } #else @@ -880,13 +889,25 @@ void quantize_row_q8_1_x4(const float * x, void * vy, int64_t k) { const float max_scalar = _mm_cvtss_f32( max4 ); // Quantize these floats - const float d = max_scalar / 127.f; - if (i < nb4) { - y4[i4].d[ir] = GGML_FP32_TO_FP16(d); + float d = max_scalar / 127.f; + if constexpr (std::is_same_v) { + if (i < nb4) { + y4[i4].d[ir] = GGML_FP32_TO_FP16(d); + } else { + y[i].d = GGML_FP32_TO_FP16(d); + } } else { - y[i].d = GGML_FP32_TO_FP16(d); + if (i < nb4) { + auto t = GGML_FP32_TO_BF16(d); + y4[i4].d[ir] = t.bits; + d = ggml_bf16_to_fp32(t); + } else { + auto t = GGML_FP32_TO_BF16(d); + y[i].d = t.bits; + d = ggml_bf16_to_fp32(t); + } } - const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f; + const float id = d > 0 ? 1/d : 0.f; const __m256 mul = _mm256_set1_ps( id ); // Apply the multiplier @@ -908,10 +929,19 @@ void quantize_row_q8_1_x4(const float * x, void * vy, int64_t k) { __m256i i3 = _mm256_cvtps_epi32( v3 ); // Compute the sum of the quants and set y[i].s - if (i < nb4) { - y4[i4].d[ir+4] = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)))); + int isum = hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); + if constexpr (std::is_same_v) { + if (i < nb4) { + y4[i4].d[ir+4] = GGML_FP32_TO_FP16(d * isum); + } else { + y[i].s = GGML_FP32_TO_FP16(d * isum); + } } else { - y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)))); + if (i < nb4) { + y4[i4].d[ir+4] = GGML_FP32_TO_BF16(d * isum).bits; + } else { + y[i].s = GGML_FP32_TO_BF16(d * isum).bits; + } } // Convert int32 to int16 @@ -934,6 +964,15 @@ void quantize_row_q8_1_x4(const float * x, void * vy, int64_t k) { } #endif } +} + +void quantize_row_q8_1_x4(const float * x, void * vy, int64_t k) { + quantize_row_q8_1_x4_T(x, (block_q8_1 *)vy, k); +} + +void quantize_row_q8_2_x4(const float * x, void * vy, int64_t k) { + quantize_row_q8_1_x4_T(x, (block_q8_2 *)vy, k); +} // // ============================================== iq2_K -- cgit v1.2.3