path: root/src/llama.cpp
author    Kawrakow <iwankawrakow@gmail.com>    2025-03-10 16:16:51 +0200
committer GitHub <noreply@github.com>          2025-03-10 16:16:51 +0200
commit    699c9cb7f63dd8431bce91b86e10efb41255f6c1 (patch)
tree      6000fd823e443f80f90ec490b1bbdf6461902924 /src/llama.cpp
parent    b096a5de7a9bdf516bb20729d5d0a3b2a12cba2f (diff)
Faster MoE token generation on CUDA (#248)
* This gives us ~20% TG speedup for DeepSeek on CUDA
* Slightly better
* Also do it for plain (not fused) mul_mat_id
* Guard against numerical precision issues for MLA on CUDA

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'src/llama.cpp')
-rw-r--r--   src/llama.cpp   3
1 file changed, 3 insertions, 0 deletions
diff --git a/src/llama.cpp b/src/llama.cpp
index 7d665072..bad8d33d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -13734,6 +13734,9 @@ struct llm_build_context {
                 }

                 ggml_tensor * kq = ggml_mul_mat(ctx0, kv_cache, q);
+                if (kv_cache->ne[1] < 256) {
+                    ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+                }
                 cb(kq, "kq", il);

                 if (!pp_opt) {
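
For readers unfamiliar with the ggml precision API used in the hunk above, here is a minimal, self-contained sketch of the same guard written outside the llama.cpp graph-building code. It only assumes ggml's public ggml_mul_mat() / ggml_mul_mat_set_prec() API; the helper name build_kq_with_prec_guard and the shape comments are invented for illustration and are not part of this commit.

    #include "ggml.h"

    // Build the K*Q attention scores node, forcing F32 accumulation for
    // short KV caches, mirroring the guard added in this commit.
    static struct ggml_tensor * build_kq_with_prec_guard(
            struct ggml_context * ctx,
            struct ggml_tensor  * kv_cache,   // hypothetical shape: [n_embd_head, n_kv, ...]
            struct ggml_tensor  * q) {        // hypothetical shape: [n_embd_head, n_tokens, ...]
        struct ggml_tensor * kq = ggml_mul_mat(ctx, kv_cache, q);

        // ne[1] is the number of cached tokens; for small KV caches the
        // commit message says the CUDA path can hit numerical precision
        // issues with MLA, so request F32 precision for this matmul node.
        if (kv_cache->ne[1] < 256) {
            ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
        }
        return kq;
    }

The flag only marks the graph node; whether the backend honors it (e.g. by accumulating in F32 instead of F16 on CUDA) is decided when the graph is executed.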