From 51029edfdf286df76f9268fc87b9514291b2fe42 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Thu, 27 Feb 2025 08:42:18 +0200
Subject: Faster MLA on CUDA (#234)

* Slight MLA TG performance improvement on CUDA

The low MLA performance on CUDA is due to the wk_b * q_nope operation.
It turns into n_head matrix multiplications with n_head separate
quantization and GEMV steps. The associated overhead is just too much
for TG, where each GEMV is very fast (a 512 x 128 GEMV is ~131 KFLOP
for DeepSeek-Lite, 4X that for DeepSeekV3/R1). The way it was done,
there was also a copy of each q_nope row before quantization, which I
have now eliminated. This results in a ~2.5% speedup.

What needs to happen instead is to launch a single computation that
quantizes all heads, and then have a kernel that does the GEMV for all
heads instead of n_head sequential GEMVs.

* Slightly better

* CUDA: Quantize non-contiguous tensors

* Much better MLA

It is a total hack, but it works.

* Cleanup

Remove duplicated GEMVs.

---------

Co-authored-by: Iwan Kawrakow
---
 src/llama.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 0276b69c..ebc7a772 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3212,7 +3212,7 @@ static bool llama_kv_cache_init(
         ggml_tensor * kv = ggml_new_tensor_2d(ctx, cache.type_k, kv_lora_rank + n_embd_head_qk_rope, kv_size);
         //ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_k, (kv_lora_rank + n_embd_head_qk_rope)*kv_size);
 #else
-        ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_v, (kv_lora_rank + n_embd_head_qk_rope)*kv_size);
+        ggml_tensor * kv = ggml_new_tensor_2d(ctx, cache.type_v, kv_lora_rank + n_embd_head_qk_rope, kv_size);
 #endif
         ggml_format_name(kv, "cache_kv_l%d", i);
         cache.kv_l.push_back(kv);
@@ -13579,6 +13579,7 @@ struct llm_build_context {
         cb(wk_b, "wk_b", il);
 
         q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
+        //if (q_nope->ne[1] <= 32) q_nope = ggml_cont(ctx0, q_nope);
         cb(q_nope, "q_nope_perm", il);
 
         struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope);
--
cgit v1.2.3
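
[Editor's note] The "single kernel for all heads" idea described in the commit
message could look roughly like the sketch below. This is a minimal,
unquantized CUDA illustration, not the kernel this commit adds: the name
gemv_all_heads, the float data path, and the launch shape are all assumptions
made for clarity. One launch computes y[h] = W[h] * x[h] for every head,
replacing n_head sequential GEMV launches.

#include <cuda_runtime.h>

// Hypothetical batched GEMV: one launch covers all heads instead of
// n_head sequential launches. W is [n_head, rows, cols] row-major,
// x is [n_head, cols], y is [n_head, rows]. For DeepSeek-Lite-like
// shapes, rows = 512 and cols = 128 per head.
__global__ void gemv_all_heads(const float * __restrict__ W,
                               const float * __restrict__ x,
                               float       * __restrict__ y,
                               int rows, int cols) {
    const int head = blockIdx.y;                          // one grid slice per head
    const int row  = blockIdx.x * blockDim.y + threadIdx.y;
    if (row >= rows) return;                              // whole warp exits together

    const float * Wh = W + (size_t)head * rows * cols + (size_t)row * cols;
    const float * xh = x + (size_t)head * cols;

    // each lane of the warp strides over the columns of one matrix row
    float sum = 0.0f;
    for (int c = threadIdx.x; c < cols; c += blockDim.x) {
        sum += Wh[c] * xh[c];
    }
    // warp-level reduction of the partial dot products
    for (int offset = warpSize/2; offset > 0; offset >>= 1) {
        sum += __shfl_down_sync(0xffffffff, sum, offset);
    }
    if (threadIdx.x == 0) {
        y[(size_t)head * rows + row] = sum;
    }
}

// Launch sketch: one warp per output row, all heads along grid.y, e.g.
//   dim3 block(32, 4);  // 32 lanes x 4 rows per block
//   dim3 grid((rows + block.y - 1) / block.y, n_head);
//   gemv_all_heads<<<grid, block>>>(W, x, y, rows, cols);

In the commit's actual pipeline, the heads would also have to be quantized in
one pass first (the "CUDA: Quantize non-contiguous tensors" step) before a
kernel along these lines could consume them.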