From 465569dff8b49a195450a0eb1974fd72a32fcebc Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Mon, 12 May 2025 07:49:00 +0300
Subject: Faster DeepSeek FA on CUDA (#408)

* New DeepSeek FlashMLA

Does not work because the RoPE portion is stored at the end in our case,
while in mainline it is stored at the beginning, and the FA kernel assumes
that.

* Rearrange MLA K cache so it fits the new CUDA FA implementation

* constexpr and minor changes

---------

Co-authored-by: Iwan Kawrakow
---
 src/llama.cpp | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 38a2b299..b4d42c84 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15224,7 +15224,8 @@ struct llm_build_context {
             cb(kv_cache_trans, "kv_cache_trans", il);
         }

-        ggml_tensor * kvr = ggml_concat(ctx0, kv_compressed, ggml_permute(ctx0, k_rope, 0, 2, 1, 3), 0);
+        //ggml_tensor * kvr = ggml_concat(ctx0, kv_compressed, ggml_permute(ctx0, k_rope, 0, 2, 1, 3), 0);
+        ggml_tensor * kvr = ggml_concat(ctx0, ggml_permute(ctx0, k_rope, 0, 2, 1, 3), kv_compressed, 0);
         cb(kvr, "kvr", il);

         auto row_size = ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank + n_embd_head_qk_rope);
@@ -15240,7 +15241,8 @@ struct llm_build_context {

         if (lctx.cparams.mla_attn > 1 && lctx.cparams.flash_attn && pp_opt) { // PP for mla=2,3

-            auto kv_cache_nope = ggml_view_2d(ctx0, kv_self.kv_l[il], kv_lora_rank, n_kv, kv_self.kv_l[il]->nb[1], 0);
+            auto kv_cache_nope = ggml_view_2d(ctx0, kv_self.kv_l[il], kv_lora_rank, n_kv, kv_self.kv_l[il]->nb[1],
+                    ggml_row_size(kv_self.kv_l[il]->type, n_embd_head_qk_rope));

             auto kv_f32_size = model.layers[il].wkv_b->ne[1] * kv_cache_nope->ne[1] * sizeof(float) / (1024*1024);
             int n_max_head = n_head;
@@ -15254,7 +15256,7 @@ struct llm_build_context {
             auto n_per_head = model.layers[il].wkv_b->ne[1] / n_head;

             auto kv_cache_rope = ggml_view_3d(ctx0, kv_self.kv_l[il], n_embd_head_qk_rope, n_kv, 1,
-                    kv_self.kv_l[il]->nb[1], kv_self.kv_l[il]->nb[2], ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank));
+                    kv_self.kv_l[il]->nb[1], kv_self.kv_l[il]->nb[2], 0); //ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank));

             // There is still an issue with one or more of the ops GGML_OP_REPEAT, GGML_OP_CONCAT, GGML_OP_CPY on CUDA when
             // the KV cache is quantized. Hence, in that case we will simply use fp16 for now.
@@ -15273,7 +15275,8 @@ struct llm_build_context {
                 }
                 cb(k_rope, "k_rope", il);

-                auto q = ggml_concat(ctx0, q_nope, q_rope, 0);
+                //auto q = ggml_concat(ctx0, q_nope, q_rope, 0);
+                auto q = ggml_concat(ctx0, q_rope, q_nope, 0);
                 q = ggml_permute(ctx0, q, 0, 2, 1, 3);
                 cb(q, "q_concat", il);

@@ -15307,7 +15310,8 @@ struct llm_build_context {
                 ggml_build_forward_expand(gf, k_nope);
                 ggml_build_forward_expand(gf, v);

-                auto k = ggml_concat(ctx0, k_nope, k_rope, 0);
+                //auto k = ggml_concat(ctx0, k_nope, k_rope, 0);
+                auto k = ggml_concat(ctx0, k_rope, k_nope, 0);
                 cb(k, "k", il);
                 ggml_build_forward_expand(gf, k);

@@ -15344,13 +15348,15 @@ struct llm_build_context {
             struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope);
             cb(q_nope2, "q_nope2", il);

-            ggml_tensor * q = ggml_concat(ctx0, q_nope2, ggml_permute(ctx0, q_rope, 0, 2, 1, 3), 0);
+            //ggml_tensor * q = ggml_concat(ctx0, q_nope2, ggml_permute(ctx0, q_rope, 0, 2, 1, 3), 0);
+            ggml_tensor * q = ggml_concat(ctx0, ggml_permute(ctx0, q_rope, 0, 2, 1, 3), q_nope2, 0);
             cb(q, "q", il);

             if (lctx.cparams.flash_attn && (lctx.cparams.mla_attn == 1 || lctx.cparams.mla_attn == 3)) {

                 ggml_tensor * kv_cache_lora = ggml_view_2d(ctx0, kv_self.kv_l[il], kv_lora_rank, n_kv,
-                        ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank + n_embd_head_qk_rope), 0);
+                        ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank + n_embd_head_qk_rope),
+                        ggml_row_size(kv_self.kv_l[il]->type, n_embd_head_qk_rope));
                 cb(kv_cache_lora, "kv_cache_lora", il);

                 kqv_compressed = ggml_flash_attn_ext(ctx0, q, kv_cache, kv_cache_lora, KQ_mask, kq_scale, hparams.f_max_alibi_bias, 0.f);
@@ -15363,7 +15369,8 @@ struct llm_build_context {

             if (lctx.cparams.mla_attn > 1) {
                 ggml_tensor * kv_cache_lora = ggml_view_2d(ctx0, kv_self.kv_l[il], kv_lora_rank, n_kv,
-                        ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank + n_embd_head_qk_rope), 0);
+                        ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank + n_embd_head_qk_rope),
+                        ggml_row_size(kv_self.kv_l[il]->type, n_embd_head_qk_rope));
                 cb(kv_cache, "kv_cache_lora", il);

                 kv_cache_trans = ggml_cont(ctx0, ggml_transpose(ctx0, kv_cache_lora));
--
cgit v1.2.3
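Note on the layout change: after this patch, each row of the per-layer MLA K cache stores the n_embd_head_qk_rope RoPE values first, followed by the kv_lora_rank latent ("nope") values. This matches the ordering that the new CUDA FA kernel (and mainline) expects, which is why every view into the cache is now offset by ggml_row_size(type, n_embd_head_qk_rope) rather than by ggml_row_size(type, kv_lora_rank). The standalone sketch below is not part of the patch; it only illustrates the new row layout with the same ggml view calls. The sizes used (kv_lora_rank = 512, n_embd_head_qk_rope = 64, the DeepSeek head dimensions) and all variable names are assumptions for illustration.

// Illustrative sketch only: the new MLA K cache row layout is [ rope | nope ].
#include "ggml.h"

int main() {
    const int64_t kv_lora_rank        = 512; // assumed latent ("nope") width
    const int64_t n_embd_head_qk_rope = 64;  // assumed RoPE width
    const int64_t n_kv                = 8;   // toy number of cached tokens

    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // One cache row holds n_embd_head_qk_rope + kv_lora_rank values,
    // with the RoPE part now at the beginning of the row.
    struct ggml_tensor * kv_cache = ggml_new_tensor_2d(ctx, GGML_TYPE_F16,
            n_embd_head_qk_rope + kv_lora_rank, n_kv);

    // RoPE part: the first n_embd_head_qk_rope values of each row -> offset 0.
    struct ggml_tensor * kv_cache_rope = ggml_view_2d(ctx, kv_cache,
            n_embd_head_qk_rope, n_kv, kv_cache->nb[1], 0);

    // NoPE (latent) part: the remaining kv_lora_rank values -> offset by the
    // RoPE width, the offset used throughout the patch.
    struct ggml_tensor * kv_cache_nope = ggml_view_2d(ctx, kv_cache,
            kv_lora_rank, n_kv, kv_cache->nb[1],
            ggml_row_size(kv_cache->type, n_embd_head_qk_rope));

    // The two views together cover each full cache row exactly once.
    GGML_ASSERT(kv_cache_rope->ne[0] + kv_cache_nope->ne[0] == kv_cache->ne[0]);

    ggml_free(ctx);
    return 0;
}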