From 51029edfdf286df76f9268fc87b9514291b2fe42 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Thu, 27 Feb 2025 08:42:18 +0200
Subject: Faster MLA on CUDA (#234)

* Slight MLA TG performance improvement on CUDA

The low MLA performance on CUDA is due to the wk_b * q_nope operation.
It turns into n_head matrix multiplications with n_head separate
quantization and GEMV steps. The associated overhead is just too much
for TG, where each GEMV is very fast (a 512 x 128 GEMV is ~131 KFLOP
for DeepSeek-Lite, 4X that for DeepSeekV3/R1). The way it was done,
there was also a copy of each q_nope row before quantization, which I
have now eliminated. This results in a ~2.5% speedup.

What needs to happen instead is to launch a single computation that
quantizes all heads, and then have a kernel that does the GEMV for all
heads instead of n_head sequential GEMVs.

* Slightly better

* CUDA: Quantize non-contiguous tensors

* Much better MLA

It is a total hack, but it works.

* Cleanup

Remove duplicated GEMVs.

---------

Co-authored-by: Iwan Kawrakow
---
 src/llama.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 0276b69c..ebc7a772 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3212,7 +3212,7 @@ static bool llama_kv_cache_init(
         ggml_tensor * kv = ggml_new_tensor_2d(ctx, cache.type_k, kv_lora_rank + n_embd_head_qk_rope, kv_size);
         //ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_k, (kv_lora_rank + n_embd_head_qk_rope)*kv_size);
 #else
-        ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_v, (kv_lora_rank + n_embd_head_qk_rope)*kv_size);
+        ggml_tensor * kv = ggml_new_tensor_2d(ctx, cache.type_v, kv_lora_rank + n_embd_head_qk_rope, kv_size);
 #endif
         ggml_format_name(kv, "cache_kv_l%d", i);
         cache.kv_l.push_back(kv);
@@ -13579,6 +13579,7 @@ struct llm_build_context {
         cb(wk_b, "wk_b", il);
 
         q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
+        //if (q_nope->ne[1] <= 32) q_nope = ggml_cont(ctx0, q_nope);
         cb(q_nope, "q_nope_perm", il);
 
         struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope);
--
cgit v1.2.3
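
[Editor's note] The "single kernel for all heads" idea described in the commit
message could look roughly like the sketch below. This is a minimal,
unquantized CUDA illustration, not the kernel this commit adds: the name
gemv_all_heads, the float data path, and the launch shape are all assumptions
made for clarity. One launch computes y[h] = W[h] * x[h] for every head,
replacing n_head sequential GEMV launches.

#include <cuda_runtime.h>

// Hypothetical batched GEMV: one launch covers all heads instead of
// n_head sequential launches. W is [n_head, rows, cols] row-major,
// x is [n_head, cols], y is [n_head, rows]. For DeepSeek-Lite-like
// shapes, rows = 512 and cols = 128 per head.
__global__ void gemv_all_heads(const float * __restrict__ W,
                               const float * __restrict__ x,
                               float       * __restrict__ y,
                               int rows, int cols) {
    const int head = blockIdx.y;                          // one grid slice per head
    const int row  = blockIdx.x * blockDim.y + threadIdx.y;
    if (row >= rows) return;                              // whole warp exits together

    const float * Wh = W + (size_t)head * rows * cols + (size_t)row * cols;
    const float * xh = x + (size_t)head * cols;

    // each lane of the warp strides over the columns of one matrix row
    float sum = 0.0f;
    for (int c = threadIdx.x; c < cols; c += blockDim.x) {
        sum += Wh[c] * xh[c];
    }
    // warp-level reduction of the partial dot products
    for (int offset = warpSize/2; offset > 0; offset >>= 1) {
        sum += __shfl_down_sync(0xffffffff, sum, offset);
    }
    if (threadIdx.x == 0) {
        y[(size_t)head * rows + row] = sum;
    }
}

// Launch sketch: one warp per output row, all heads along grid.y, e.g.
//   dim3 block(32, 4);  // 32 lanes x 4 rows per block
//   dim3 grid((rows + block.y - 1) / block.y, n_head);
//   gemv_all_heads<<<grid, block>>>(W, x, y, rows, cols);

In the commit's actual pipeline, the heads would also have to be quantized in
one pass first (the "CUDA: Quantize non-contiguous tensors" step) before a
kernel along these lines could consume them.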