author    | Kawrakow <iwankawrakow@gmail.com> | 2025-03-01 08:25:27 +0200
committer | GitHub <noreply@github.com> | 2025-03-01 08:25:27 +0200
commit    | a79ab8f34222e1e0142a30eaa97e78ad077abca9 (patch)
tree      | 24f89079780736d697347e1ebbe6544750534e22 /common/common.h
parent    | b762db7c9264199c2d0f66e7d63e3b4884f3fc0c (diff)
Reduce size of compute buffers (#237)
* This reduces compute buffer size for MLA
* This should accomplish it for standard attention
* Much better
* Better concat for contiguous tensors
If all the op does is concatenate the second tensor to the first, why would we want a loop? (A sketch of that fast path follows below.)
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
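The "better concat" item above covers the case where both inputs are contiguous and the output is simply the second tensor's bytes appended after the first's, so the whole operation reduces to two copies. The sketch below illustrates that fast path only; `FlatTensor` and `concat_contiguous` are illustrative names, not the actual ggml tensor type or kernel.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Illustrative, simplified tensor view: 'data' points to 'nbytes'
    // of contiguous storage. Not the ggml API.
    struct FlatTensor {
        const uint8_t * data;
        size_t          nbytes;
    };

    // Fast path for concatenation along the slowest-varying dimension:
    // when both inputs are contiguous, the result is src0's bytes
    // followed by src1's bytes, so two memcpy calls replace the
    // element-by-element loop of a generic concat kernel.
    static void concat_contiguous(const FlatTensor & src0, const FlatTensor & src1, uint8_t * dst) {
        std::memcpy(dst,               src0.data, src0.nbytes);
        std::memcpy(dst + src0.nbytes, src1.data, src1.nbytes);
    }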
Diffstat (limited to 'common/common.h')
-rw-r--r-- | common/common.h | 3
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/common/common.h b/common/common.h
index ef5175f3..f35f3558 100644
--- a/common/common.h
+++ b/common/common.h
@@ -175,7 +175,8 @@ struct gpt_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
-    int mla_attn = false; // MLA 0: standard attention, 1: MLA with K and transposed V cache, 2: MLA with just K cache
+    int mla_attn = 0; // MLA 0: standard attention, 1: MLA with K and transposed V cache, 2: MLA with just K cache
+    int attn_max_batch = 0; // Max batch size to use when computing attention (only applicable if flash_attn = false)
     bool fused_moe_up_gate = false; // fused up*unary(gate) op for MoE models
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
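The new `attn_max_batch` field caps how many query rows are processed at once in the non-flash-attention path, which is what keeps the temporary KQ score buffer (and hence the compute buffer) small. The following is a rough CPU-side illustration of that idea under assumed row-major layouts; `attention_chunked` and its signature are illustrative, not the actual ggml graph-building code.

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Naive single-head attention over a KV cache of n_kv entries,
    // computed in chunks of at most attn_max_batch query rows.
    // The scratch buffer for the scores then holds at most
    // attn_max_batch * n_kv floats instead of n_tokens * n_kv,
    // which is the effect attn_max_batch is meant to have.
    static std::vector<float> attention_chunked(
            const std::vector<float> & Q,   // n_tokens x head_dim
            const std::vector<float> & K,   // n_kv     x head_dim
            const std::vector<float> & V,   // n_kv     x head_dim
            int n_tokens, int n_kv, int head_dim, int attn_max_batch) {
        const int step = attn_max_batch > 0 ? std::min(attn_max_batch, n_tokens) : n_tokens;
        const float scale = 1.0f/std::sqrt((float)head_dim);
        std::vector<float> out((size_t)n_tokens*head_dim, 0.0f);
        std::vector<float> scores((size_t)step*n_kv);  // the buffer whose size is capped
        for (int i0 = 0; i0 < n_tokens; i0 += step) {
            const int n_cur = std::min(step, n_tokens - i0);
            for (int i = 0; i < n_cur; ++i) {
                // KQ^T row for this query, tracking the max for a stable softmax
                float vmax = -1e30f, sum = 0.0f;
                for (int j = 0; j < n_kv; ++j) {
                    float s = 0.0f;
                    for (int d = 0; d < head_dim; ++d) {
                        s += Q[(size_t)(i0 + i)*head_dim + d]*K[(size_t)j*head_dim + d];
                    }
                    scores[(size_t)i*n_kv + j] = s*scale;
                    vmax = std::max(vmax, s*scale);
                }
                for (int j = 0; j < n_kv; ++j) {
                    float & e = scores[(size_t)i*n_kv + j];
                    e = std::exp(e - vmax);
                    sum += e;
                }
                // weighted sum over V gives the output row for this query
                for (int d = 0; d < head_dim; ++d) {
                    float acc = 0.0f;
                    for (int j = 0; j < n_kv; ++j) {
                        acc += scores[(size_t)i*n_kv + j]*V[(size_t)j*head_dim + d];
                    }
                    out[(size_t)(i0 + i)*head_dim + d] = acc/sum;
                }
            }
        }
        return out;
    }

With attn_max_batch = 0 the whole batch is processed in one pass, matching the default behavior declared in the header above.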