From a79ab8f34222e1e0142a30eaa97e78ad077abca9 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Sat, 1 Mar 2025 08:25:27 +0200
Subject: Reduce size of compute buffers (#237)

* This reduces compute buffer size for MLA

* This should accomplish it for standard attention

* Much better

* Better concat for contiguous tensors

If all the op does is to concatenate the second tensor to the first,
why would we want to have a loop?

---------

Co-authored-by: Iwan Kawrakow
---
 include/llama.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/llama.h')

diff --git a/include/llama.h b/include/llama.h
index 2b33701c..bb43aebc 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -384,6 +384,7 @@ extern "C" {
         bool offload_kqv;       // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;        // whether to use flash attention [EXPERIMENTAL]
         int  mla_attn;          // whether to use MLA attention [EXPERIMENTAL]
+        int  attn_max_batch;    // maximum batch size for attention computations [EXPERIMENTAL]
         bool fused_moe_up_gate; // whether to use fused MoE up/down op [EXPERIMENTAL]

         // Abort callback
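
For reference, a minimal sketch (not part of the patch) of how the new attn_max_batch
field could be set from application code. It assumes the long-standing llama.h entry
points llama_model_default_params(), llama_load_model_from_file(),
llama_context_default_params() and llama_new_context_with_model(); the model path and
the value 64 are purely illustrative, and the exact semantics of the field (e.g.
whether 0 means "no limit") are not defined by this diff.

// Sketch only: setting the new attn_max_batch context parameter, which sits
// alongside flash_attn and mla_attn in llama_context_params. Values and the
// model path are illustrative, not recommendations.
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // hypothetical model file
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.attn_max_batch = 64; // new in this patch: cap the batch size used for
                                 // attention computations; per the commit subject this
                                 // reduces compute buffer size (64 is an arbitrary example)

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == nullptr) {
        fprintf(stderr, "failed to create context\n");
        llama_free_model(model);
        return 1;
    }

    // ... run inference as usual ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}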