From f27cd405422307e02dffa8949ac30bc56b4d2900 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Mon, 12 May 2025 07:49:51 +0300
Subject: Enable faster prompt processing with mainline llama.cpp GGUFs (#409)

* Enable MLA-3 in crippled GGUFs: WIP

* Enable MLA-3 in crippled GGUFs: seems to work

* Add newly created tensors to model.tensors_by_name

Else they don't get run-time repacked.

---------

Co-authored-by: Iwan Kawrakow
---
 include/llama.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/llama.h')

diff --git a/include/llama.h b/include/llama.h
index f1511548..0f3ae862 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -325,6 +325,7 @@ extern "C" {
 
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
+        int32_t mla;          // MLA implementation to use (only applicable to DeepSeek models at this point)
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
         // main_gpu interpretation depends on split_mode:
--
cgit v1.2.3
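
For illustration, a minimal sketch of how a caller might set the new field through the llama.cpp C API. This is not part of the patch: llama_model_default_params() and llama_load_model_from_file() are the standard public entry points, but the value 3 (assumed here to select the MLA-3 path mentioned in the commit message) and the model path are illustrative assumptions.

#include "llama.h"
#include <stdio.h>

int main(void) {
    llama_backend_init();

    // Start from the library defaults, then opt into an MLA implementation.
    struct llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 0; // CPU-only for this sketch
    mparams.mla          = 3; // assumption: 3 selects the MLA-3 path (DeepSeek models only)

    // Hypothetical model path; replace with an actual DeepSeek GGUF.
    struct llama_model * model = llama_load_model_from_file("deepseek.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        llama_backend_free();
        return 1;
    }

    // ... create a context and run prompt processing here ...

    llama_free_model(model);
    llama_backend_free();
    return 0;
}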