From f27cd405422307e02dffa8949ac30bc56b4d2900 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Mon, 12 May 2025 07:49:51 +0300
Subject: Enable faster prompt processing with mainline llama.cpp GGUFs (#409)

* Enable MLA-3 in crippled GGUFs: WIP

* Enable MLA-3 in crippled GGUFs: seems to work

* Add newly created tensors to model.tensors_by_name

Else they don't get run-time repacked.

---------

Co-authored-by: Iwan Kawrakow
---
 common/common.cpp | 1 +
 1 file changed, 1 insertion(+)

(limited to 'common/common.cpp')

diff --git a/common/common.cpp b/common/common.cpp
index ab936ee7..0dbde58f 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2334,6 +2334,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
+    mparams.mla          = params.mla_attn;
     mparams.rpc_servers  = params.rpc_servers.c_str();
     mparams.main_gpu     = params.main_gpu;
     mparams.split_mode   = params.split_mode;
-- 
cgit v1.2.3
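
For context, here is a minimal sketch of how this one-line change flows at the call site. The names gpt_params, llama_model_params, and llama_model_params_from_gpt_params all appear in the hunk above; the concrete value 3 for mla_attn is an assumption, chosen only because the commit message mentions "MLA-3".

    // Hedged usage sketch, not part of the patch. Assumes mla_attn is an
    // integer MLA mode (the commit message mentions "MLA-3", so 3 is used
    // here as an illustrative value) and that common.h declares the types
    // shown in the hunk above.
    #include "common.h"
    #include "llama.h"

    int main() {
        gpt_params params;
        params.mla_attn = 3;  // assumption: request MLA mode 3

        // After this patch, the MLA setting is forwarded to model loading
        // instead of being silently dropped.
        llama_model_params mparams = llama_model_params_from_gpt_params(params);
        // mparams.mla now carries params.mla_attn, so the loader can build
        // the extra MLA tensors even for mainline-converted GGUFs.
        return 0;
    }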