From 699c9cb7f63dd8431bce91b86e10efb41255f6c1 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Mon, 10 Mar 2025 16:16:51 +0200
Subject: Faster MoE token generation on CUDA (#248)

* This gives us ~20% TG speedup for DeepSeek on CUDA

* Slightly better

* Also do it for plain (not fused) mul_mat_id

* Guard against numerical precision issues for MLA on CUDA

---------

Co-authored-by: Iwan Kawrakow
---
 src/llama.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index 7d665072..bad8d33d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -13734,6 +13734,9 @@ struct llm_build_context {
                 }
 
                 ggml_tensor * kq = ggml_mul_mat(ctx0, kv_cache, q);
+                if (kv_cache->ne[1] < 256) {
+                    ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+                }
                 cb(kq, "kq", il);
 
                 if (!pp_opt) {
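
Note: the hunk above requests full F32 accumulation for the K*Q matmul only
when the cached sequence is short (kv_cache->ne[1] < 256), i.e. the
token-generation regime where the MLA precision issue on CUDA was observed.
Below is a minimal standalone sketch of the same ggml API usage; the tensor
shapes and memory-pool size are illustrative assumptions, not taken from the
patch:

    // Build a mul_mat node and mark it for F32 accumulation, as the patch does.
    #include "ggml.h"

    int main(void) {
        // Illustrative 16 MiB pool; any size large enough for the graph works.
        struct ggml_init_params params = { 16*1024*1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);

        // Stand-ins for the cached keys and the query; shapes are hypothetical.
        struct ggml_tensor * kv_cache = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 64);
        struct ggml_tensor * q        = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128,  8);

        struct ggml_tensor * kq = ggml_mul_mat(ctx, kv_cache, q);

        // Same guard as the patch: for short KV sequences, ask the backend
        // (e.g. CUDA) to accumulate this matmul in F32 rather than F16.
        if (kv_cache->ne[1] < 256) {
            ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
        }

        ggml_free(ctx);
        return 0;
    }

ggml_mul_mat_set_prec only tags the graph node; it is each backend's matmul
kernel that honors GGML_PREC_F32, trading a little speed for accuracy on the
marked operation alone rather than globally.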