diff options
author | Kawrakow <iwankawrakow@gmail.com> | 2025-01-21 19:19:38 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-01-21 19:19:38 +0200 |
commit | 6d23495b9bb8945c6ec1c38ced4b44180fbac3c6 (patch) | |
tree | bcb66bc98778555e21c537bdaabdf3bb9a5fb18b /ggml/src | |
parent | 3c5f87225f0ddd379ab712ddb8ad0013c10167c2 (diff) |
On Zen4 repack fp16 models to bf16_r16 when run-time-repacking is requested (#174)
This massively improves performance. As this is opt-in, we do not worry
about possible precision loss in the f16 -> bf16 conversion.
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src')
-rw-r--r-- | ggml/src/iqk/iqk_quantize.cpp | 4 |
1 file changed, 3 insertions, 1 deletion
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index a2ade6a7..221bc48c 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -5260,6 +5260,7 @@ inline ggml_bf16_t to_bf16(const float& x) { helper.f = x; return ggml_bf16_t{(uint16_t)(helper.u >> 16)}; } +inline ggml_bf16_t to_bf16(const ggml_half& x) { return to_bf16(GGML_FP16_TO_FP32(x)); } inline ggml_bf16_t to_bf16(const ggml_bf16_t& x) { return x; } template <typename T> void repack_bf16(int nrows, int n_per_row, const T * x, ggml_bf16_t * y) { @@ -6082,7 +6083,8 @@ void iqk_repack_tensor(struct ggml_tensor * tensor) { { GGML_TYPE_Q8_0, { GGML_TYPE_Q8_0_R4, 4, (Repack::repack_func)repack_q8_0} }, { GGML_TYPE_Q8_K, { GGML_TYPE_Q8_K_R8, 8, (Repack::repack_func)repack_q8_k} }, #ifdef __AVX512BF16__ - { GGML_TYPE_BF16, { GGML_TYPE_BF16_R16, 16, (Repack::repack_func)repack_bf16<ggml_bf16_t>} }, + { GGML_TYPE_BF16, { GGML_TYPE_BF16_R16, 16, (Repack::repack_func)repack_bf16<ggml_bf16_t>}}, + { GGML_TYPE_F16, { GGML_TYPE_BF16_R16, 16, (Repack::repack_func)repack_bf16<ggml_half>} }, #endif }; |