From f27cd405422307e02dffa8949ac30bc56b4d2900 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Mon, 12 May 2025 07:49:51 +0300
Subject: Enable faster prompt processing with mainline llama.cpp GGUFs (#409)

* Enable MLA-3 in crippled GGUFs: WIP

* Enable MLA-3 in crippled GGUFs: seems to work

* Add newly created tensors to model.tensors_by_name

Else they don't get run-time repacked.

---------

Co-authored-by: Iwan Kawrakow
---
 include/llama.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/llama.h')

diff --git a/include/llama.h b/include/llama.h
index f1511548..0f3ae862 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -325,6 +325,7 @@ extern "C" {
 
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
+        int32_t mla;          // MLA implementation to use (only applicable to DeepSeek models at this point)
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
         // main_gpu interpretation depends on split_mode:
--
cgit v1.2.3
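
For illustration, a minimal sketch of how a caller might set the new field through the llama.cpp C API. This is not part of the patch: llama_model_default_params() and llama_load_model_from_file() are the standard public entry points, but the value 3 (assumed here to select the MLA-3 path mentioned in the commit message) and the model path are illustrative assumptions.

#include "llama.h"
#include <stdio.h>

int main(void) {
    llama_backend_init();

    // Start from the library defaults, then opt into an MLA implementation.
    struct llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 0; // CPU-only for this sketch
    mparams.mla          = 3; // assumption: 3 selects the MLA-3 path (DeepSeek models only)

    // Hypothetical model path; replace with an actual DeepSeek GGUF.
    struct llama_model * model = llama_load_model_from_file("deepseek.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        llama_backend_free();
        return 1;
    }

    // ... create a context and run prompt processing here ...

    llama_free_model(model);
    llama_backend_free();
    return 0;
}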