From 7b1b2b2c06c1729139135c9e47611af7161de6f7 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Thu, 5 Sep 2024 07:46:47 +0300 Subject: Zen4 Flash Attention - bf16 support (#38) * Zen4 Flash Attention: WIP bf16 * Zen4 Flash Attention: bf16 seems to be working * Zen4 Flash Attention: improving bf16 * Zen4 Flash Attention: improving bf16 It is better (slightly faster) to first convert Q to bf16 before processing each block of q_step rows. This requires D*q_step*sizeof(bf16) bytes, which is at most 4 kB for the head sizes we support, so we can just allocate on the stack instead of reserving and passing a work buffer in ggml. --------- Co-authored-by: Iwan Kawrakow --- common/common.cpp | 3 +++ 1 file changed, 3 insertions(+) (limited to 'common/common.cpp') diff --git a/common/common.cpp b/common/common.cpp index c86d364f..6c298d2d 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2221,6 +2221,9 @@ static ggml_type kv_cache_type_from_str(const std::string & s) { if (s == "f16") { return GGML_TYPE_F16; } + if (s == "bf16") { + return GGML_TYPE_BF16; + } if (s == "q8_0") { return GGML_TYPE_Q8_0; } -- cgit v1.2.3