diff options
author | Kawrakow <iwankawrakow@gmail.com> | 2024-12-15 09:54:21 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-12-15 09:54:21 +0100 |
commit | 85c5a1a99569ccc00c280835fe3a69b4af02c43b (patch) | |
tree | da421487d5ddd0467b2bfd6cbbfb2666406c46f1 /ggml/src/iqk/iqk_quantize.cpp | |
parent | 20758edcae65213b2f575b6d23dfea67ad9dd0e0 (diff) |
BF16_R16 - 16 interleaved bf16 rows (#142)
* Not working bf16_r4
* Adding bf16_r8
Small performance gain compared to bf16 - 258 t/s vs 234 t/s.
I guess, this is still sub-obtimal.
* bf16_rx: Very slightly faster by interleaving 16 rows
258 t/s -> 263 t/s
* Rename bf16_r4 to bf16_r16
We are interleaving 16 rows now.
* Cleanup unused stuff
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src/iqk/iqk_quantize.cpp')
-rw-r--r-- | ggml/src/iqk/iqk_quantize.cpp | 38 |
1 files changed, 37 insertions, 1 deletions
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index de8c0d99..abe81858 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -4708,7 +4708,7 @@ static void repack_q8_k(int nrows, int n_per_row, const block_q8_K * x, block_q8 } } } - x += 4*nblock; + x += 8*nblock; y += nblock; } } @@ -4759,3 +4759,39 @@ void vec_dot_q8_k_r8_q8_k(int n, float * s, size_t bs, const void * vx, size_t b GGML_UNUSED(by); } +// +// ========================================= bf16_r4 +// +namespace { +inline ggml_bf16_t to_bf16(const float& x) { + union { float f; uint32_t u; } helper; + helper.f = x; + return ggml_bf16_t{(uint16_t)(helper.u >> 16)}; +} +inline ggml_bf16_t to_bf16(const ggml_bf16_t& x) { return x; } +template <typename T> +void repack_bf16(int nrows, int n_per_row, const T * x, ggml_bf16_t * y) { + GGML_ASSERT(nrows%16 == 0); + GGML_ASSERT(n_per_row%2 == 0); + for (int row = 0; row < nrows; row += 16) { + for (int k = 0; k < 16; ++k) { + auto x8 = x + k*n_per_row; + for (int ib = 0; ib < n_per_row/2; ++ib) { + y[32*ib + 2*k + 0] = to_bf16(x8[2*ib+0]); + y[32*ib + 2*k + 1] = to_bf16(x8[2*ib+1]); + } + } + x += 16*n_per_row; + y += 16*n_per_row; + } +} +} + +void repack_f32_bf16_r16(const void * src, void * dst, int64_t nrows, int64_t n_per_row) { + repack_bf16(nrows, n_per_row, (const float *)src, (ggml_bf16_t *)dst); +} + +void repack_bf16_bf16_r16(const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row) { + repack_bf16(nrows, n_per_row, (const ggml_bf16_t *)src, (ggml_bf16_t *)dst); +} + |