| field     | value                                               | date                      |
|-----------|-----------------------------------------------------|---------------------------|
| author    | Georgi Gerganov <ggerganov@gmail.com>               | 2023-04-29 18:43:28 +0300 |
| committer | GitHub <noreply@github.com>                         | 2023-04-29 18:43:28 +0300 |
| commit    | 214b6a35702a489e3738acd81fad6d46182d3036 (patch)    |                           |
| tree      | dac39b6d4bb7eaf958735a0dfb5ccabcbbb0821c /llama.cpp |                           |
| parent    | 305eb5afd51325e3142c01c17431febb7c67de87 (diff)     |                           |
ggml : adjust mul_mat_f16 work memory (#1226)
* llama : minor - remove explicit int64_t cast
* ggml : reduce memory buffer for F16 mul_mat when not using cuBLAS
* ggml : add asserts to guard for incorrect wsize
Diffstat (limited to 'llama.cpp')
| -rw-r--r-- | llama.cpp | 2 |

1 file changed, 1 insertion(+), 1 deletion(-)
```diff
@@ -780,7 +780,7 @@ static bool kv_cache_init(
     const int n_embd  = hparams.n_embd;
     const int n_layer = hparams.n_layer;

-    const int64_t n_mem      = (int64_t)n_layer*n_ctx;
+    const int64_t n_mem      = n_layer*n_ctx;
     const int64_t n_elements = n_embd*n_mem;

     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
```