author     Georgi Gerganov <ggerganov@gmail.com>      2023-12-03 10:58:16 +0200
committer  GitHub <noreply@github.com>                2023-12-03 10:58:16 +0200
commit     d7b800b8bc490a221acbd83c575206a907f2f6e2 (patch)
tree       c41c5d8ead5fb3cb23ea0b5bca51f92a58e0d7cf /llama.cpp
parent     5a7d3125e7c24f223659b7f0b7aa7736986e92c0 (diff)
llama : pad KV cache size (#4280)
* llama : pad KV cache size to 32
* metal : try to improve batched decoding
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  |  3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index 3f5d663c..fd905ade 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5744,8 +5744,7 @@ static int llama_decode_internal(
     // a heuristic, to avoid attending the full cache if it is not yet utilized
     // after enough generations, the benefit from this heuristic disappears
     // if we start defragmenting the cache, the benefit from this will be more important
-    //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
-    kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
+    kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
     //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);