From 699c9cb7f63dd8431bce91b86e10efb41255f6c1 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Mon, 10 Mar 2025 16:16:51 +0200
Subject: Faster MoE token generation on CUDA (#248)

* This gives us ~20% TG speedup for DeepSeek on CUDA

* Slightly better

* Also do it for plain (not fused) mul_mat_id

* Guard against numerical precision issues for MLA on CUDA

---------

Co-authored-by: Iwan Kawrakow
---
 src/llama.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index 7d665072..bad8d33d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -13734,6 +13734,9 @@ struct llm_build_context {
                 }
 
                 ggml_tensor * kq = ggml_mul_mat(ctx0, kv_cache, q);
+                if (kv_cache->ne[1] < 256) {
+                    ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+                }
                 cb(kq, "kq", il);
 
                 if (!pp_opt) {
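
Note: the hunk above requests full F32 accumulation for the K*Q matmul only
when the cached sequence is short (kv_cache->ne[1] < 256), i.e. the
token-generation regime where the MLA precision issue on CUDA was observed.
Below is a minimal standalone sketch of the same ggml API usage; the tensor
shapes and memory-pool size are illustrative assumptions, not taken from the
patch:

    // Build a mul_mat node and mark it for F32 accumulation, as the patch does.
    #include "ggml.h"

    int main(void) {
        // Illustrative 16 MiB pool; any size large enough for the graph works.
        struct ggml_init_params params = { 16*1024*1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);

        // Stand-ins for the cached keys and the query; shapes are hypothetical.
        struct ggml_tensor * kv_cache = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 64);
        struct ggml_tensor * q        = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128,  8);

        struct ggml_tensor * kq = ggml_mul_mat(ctx, kv_cache, q);

        // Same guard as the patch: for short KV sequences, ask the backend
        // (e.g. CUDA) to accumulate this matmul in F32 rather than F16.
        if (kv_cache->ne[1] < 256) {
            ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
        }

        ggml_free(ctx);
        return 0;
    }

ggml_mul_mat_set_prec only tags the graph node; it is each backend's matmul
kernel that honors GGML_PREC_F32, trading a little speed for accuracy on the
marked operation alone rather than globally.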