From a79ab8f34222e1e0142a30eaa97e78ad077abca9 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Sat, 1 Mar 2025 08:25:27 +0200
Subject: Reduce size of compute buffers (#237)

* This reduces compute buffer size for MLA

* This should accomplish it for standard attention

* Much better

* Better concat for contiguous tensors

If all the op does is to concatenate the second tensor to the first,
why would we want to have a loop?

---------

Co-authored-by: Iwan Kawrakow
---
 common/common.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'common/common.h')

diff --git a/common/common.h b/common/common.h
index ef5175f3..f35f3558 100644
--- a/common/common.h
+++ b/common/common.h
@@ -175,7 +175,8 @@ struct gpt_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
-    int mla_attn = false; // MLA 0: standard attention, 1: MLA with K and transposed V cache, 2: MLA with just K cache
+    int mla_attn = 0; // MLA 0: standard attention, 1: MLA with K and transposed V cache, 2: MLA with just K cache
+    int attn_max_batch = 0; // Max batch size to use when computing attention (only applicable if flash_attn = false)
     bool fused_moe_up_gate = false; // fused up*unary(gate) op for MoE models
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-- 
cgit v1.2.3
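
The last bullet in the commit message argues that when the concat op does nothing more
than append a contiguous second tensor to a contiguous first one, a per-row copy loop
is unnecessary. The sketch below illustrates that idea in isolation; it assumes a
simple row-major float container (the Tensor struct and concat_rows function are
illustrative stand-ins, not ggml's tensor API or the kernel touched by this commit).
Two memcpy calls produce the concatenated result.

// Sketch only: a stand-in row-major tensor, not ggml's ggml_tensor.
#include <cassert>
#include <cstdio>
#include <cstring>
#include <vector>

struct Tensor {
    std::vector<float> data; // contiguous, row-major storage
    size_t rows = 0, cols = 0;
};

// Concatenate b after a along the outermost (row) dimension.
// Because both inputs are contiguous and the split is along the outermost
// dimension, the result is simply a's bytes followed by b's bytes,
// so no element-wise loop is needed.
static Tensor concat_rows(const Tensor & a, const Tensor & b) {
    assert(a.cols == b.cols);
    Tensor out;
    out.rows = a.rows + b.rows;
    out.cols = a.cols;
    out.data.resize(out.rows * out.cols);
    std::memcpy(out.data.data(),                 a.data.data(), a.data.size() * sizeof(float));
    std::memcpy(out.data.data() + a.data.size(), b.data.data(), b.data.size() * sizeof(float));
    return out;
}

int main() {
    Tensor a{{1, 2, 3, 4}, 2, 2}; // 2x2
    Tensor b{{5, 6},       1, 2}; // 1x2
    Tensor c = concat_rows(a, b); // 3x2
    for (size_t i = 0; i < c.rows; ++i) {
        for (size_t j = 0; j < c.cols; ++j) std::printf("%g ", c.data[i * c.cols + j]);
        std::printf("\n");
    }
    return 0;
}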