author     Kawrakow <iwankawrakow@gmail.com>   2025-03-10 16:19:09 +0200
committer  GitHub <noreply@github.com>         2025-03-10 16:19:09 +0200
commit     a48e16324770bb829406d06e11be1df0c8a3b517 (patch)
tree       1f0ef5e1fd55c35acac40cca85cadc8606dd0673 /src
parent     699c9cb7f63dd8431bce91b86e10efb41255f6c1 (diff)
DeepSeek imatrix stuff (#250)
* This gives us ~20% TG speedup for DeepSeek on CUDA
* Slightly better
* Also do it for plain (not fused) mul_mat_id
* Guard against numerical precision issues for MLA on CUDA
* imatrix: wv_b <-> wkv_b

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'src')
-rw-r--r--   src/llama.cpp   18 ++++++++++++++++++
1 file changed, 18 insertions(+), 0 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index bad8d33d..ba5c5052 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -13787,6 +13787,7 @@ struct llm_build_context {
                         ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank),
                         ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank)*n_embd_head_v, 0);
                 cb(wv_b, "wv_b", il);
+                std::memcpy(wv_b->name, model.layers[il].wv_b->name, GGML_MAX_NAME);
                 kqv = ggml_mul_mat(ctx0, wv_b, kqv_compressed);
                 cb(kqv, "kqv", il);
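
[Editor's note] The added std::memcpy copies the original tensor's name onto the 3D view of wv_b, because imatrix collection matches activation statistics to tensors by name; a freshly created view carries an auto-generated name, so its statistics would otherwise never be attributed to *.attn_v_b.weight. Below is a minimal standalone sketch of this name-keyed bookkeeping; toy_tensor, collect, and the names used are illustrative stand-ins, not llama.cpp API.

#include <cstring>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Toy stand-in for a ggml tensor: only the name matters for this sketch.
struct toy_tensor { char name[64]; };

// Toy imatrix collector: accumulates squared activations keyed by tensor name,
// which is how the real collector matches statistics to weights.
static std::unordered_map<std::string, std::vector<float>> g_imatrix;

void collect(const toy_tensor & t, const std::vector<float> & act) {
    auto & acc = g_imatrix[t.name];
    acc.resize(act.size(), 0.0f);
    for (size_t i = 0; i < act.size(); ++i) acc[i] += act[i] * act[i];
}

int main() {
    toy_tensor wv_b{};      // the model weight tensor
    toy_tensor wv_b_view{}; // a view sliced out of it
    std::strcpy(wv_b.name, "blk.0.attn_v_b.weight");
    std::strcpy(wv_b_view.name, "view_17"); // auto-generated view name

    // Without this copy, activations are recorded under "view_17" and never
    // match "blk.0.attn_v_b.weight" at quantization time.
    std::memcpy(wv_b_view.name, wv_b.name, sizeof(wv_b_view.name));

    collect(wv_b_view, {0.5f, 1.0f, 2.0f});
    std::cout << "have entry for blk.0.attn_v_b.weight: "
              << g_imatrix.count("blk.0.attn_v_b.weight") << "\n";
    return 0;
}
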
@@ -17348,6 +17349,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             if (imatrix_data) {
                 auto it = imatrix_data->find(tensor->name);
                 if (it == imatrix_data->end()) {
+                    // MLA hack: most imatrix files floating around the Internet have been computed with standard attention.
+                    // This means that the imatrix file does not contain data for the *.attn_k_b.weight and *.attn_v_b.weight
+                    // tensors required by MLA. But the *.attn_v_b.weight tensors "see" the exact same activations as the
+                    // *.attn_kv_b.weight tensors used in standard attention. Hence, if we find imatrix data for
+                    // *.attn_kv_b.weight we can use it for *.attn_v_b.weight and vice versa.
+                    std::string name{tensor->name};
+                    static std::array<std::string, 2> alternatives{".attn_v_b.weight", ".attn_kv_b.weight"};
+                    for (int j = 0; j < int(alternatives.size()); ++j) {
+                        if (auto pos = name.find(alternatives[j]); pos != std::string::npos) {
+                            int j1 = (j + 1) % alternatives.size();
+                            auto alternative_name = name.substr(0, pos) + alternatives[j1];
+                            it = imatrix_data->find(alternative_name);
+                            break;
+                        }
+                    }
+                }
+                if (it == imatrix_data->end()) {
                     LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
                 } else {
                     if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
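
[Editor's note] For reference, a self-contained sketch of the fallback lookup added in the hunk above, with a plain std::unordered_map standing in for llama.cpp's imatrix_data; the helper name find_with_mla_fallback is hypothetical.

#include <array>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

using imatrix_map = std::unordered_map<std::string, std::vector<float>>;

// Try the tensor's own name first; on a miss, swap .attn_v_b.weight for
// .attn_kv_b.weight (and vice versa), since per the comment above both
// tensors see the same activations.
imatrix_map::const_iterator find_with_mla_fallback(const imatrix_map & data, const std::string & name) {
    auto it = data.find(name);
    if (it != data.end()) return it;
    static const std::array<std::string, 2> alternatives{".attn_v_b.weight", ".attn_kv_b.weight"};
    for (size_t j = 0; j < alternatives.size(); ++j) {
        if (auto pos = name.find(alternatives[j]); pos != std::string::npos) {
            return data.find(name.substr(0, pos) + alternatives[(j + 1) % alternatives.size()]);
        }
    }
    return data.end();
}

int main() {
    imatrix_map data;
    data["blk.0.attn_kv_b.weight"] = {1.0f, 2.0f}; // entry from a standard-attention imatrix
    auto it = find_with_mla_fallback(data, "blk.0.attn_v_b.weight"); // MLA tensor name
    std::cout << (it != data.end() ? "found via fallback: " + it->first : "not found") << "\n";
    return 0;
}

Note that *.attn_k_b.weight is absent from the alternatives list: the comment only justifies the substitution for *.attn_v_b.weight, which sees the same activations as *.attn_kv_b.weight.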