From 7a9b6c3a8bdc1cb75fefc826dfaa7331eb63695d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 24 Mar 2023 23:17:37 +0200
Subject: Reduce memory usage and allocate enough memory for largest context
 (#473)

* Reduce memory usage and allocate enough memory for large contexts

* Simpler scratch buffer usage

* Reenable BLAS for quantized mul_mat

* Fix number of layers in 30B and 65B

* Fix KV cache size for F32
---
 utils.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'utils.h')

diff --git a/utils.h b/utils.h
index cf914990..d469bc6a 100644
--- a/utils.h
+++ b/utils.h
@@ -14,12 +14,13 @@
 //
 
 struct gpt_params {
-    int32_t seed      = -1; // RNG seed
+    int32_t seed          = -1;  // RNG seed
     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_predict = 128; // new tokens to predict
-    int32_t repeat_last_n = 64;  // last n tokens to penalize
-    int32_t n_parts   = -1; // amount of model parts (-1 = determine from model dimensions)
-    int32_t n_ctx = 512; //context size
+    int32_t n_predict     = 128; // new tokens to predict
+    int32_t repeat_last_n = 64;  // last n tokens to penalize
+    int32_t n_parts       = -1;  // amount of model parts (-1 = determine from model dimensions)
+    int32_t n_ctx         = 512; // context size
+    int32_t n_batch       = 8;   // batch size for prompt processing
 
     // sampling parameters
     int32_t top_k = 40;
@@ -27,15 +28,13 @@ struct gpt_params {
     float   temp  = 0.80f;
     float   repeat_penalty  = 1.10f;
 
-    int32_t n_batch = 8; // batch size for prompt processing
-
     std::string model = "models/lamma-7B/ggml-model.bin"; // model path
     std::string prompt = "";
 
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
 
-    bool memory_f16 = false; // use f16 instead of f32 for memory kv
+    bool memory_f16 = true;  // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode
@@ -47,6 +46,7 @@ struct gpt_params {
     bool ignore_eos = false; // do not stop generating after eos
     bool perplexity = false; // compute perplexity over the prompt
     bool use_mlock = false; // use mlock to keep model in memory
+    bool mem_test = false; // compute maximum memory usage
 };
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
--
cgit v1.2.3
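
A note on why the memory_f16 default flip and the "Fix KV cache size for F32" bullet belong in a memory-usage commit: each KV cache element costs 2 bytes in F16 but 4 bytes in F32, so an F32 cache needs twice the allocation. The sketch below is not part of the patch; it is a minimal, hypothetical C++ illustration of how the cache footprint scales with n_ctx and memory_f16, using illustrative 7B-class model dimensions (n_layer, n_embd are assumptions, not values from this diff).

// kv_cache_size.cc -- hypothetical sketch, not from the patch.
#include <cstdint>
#include <cstdio>

int main() {
    const int32_t n_ctx      = 512;  // gpt_params::n_ctx default
    const bool    memory_f16 = true; // gpt_params::memory_f16 (new default)

    const int64_t n_layer = 32;      // illustrative: LLaMA-7B-class model
    const int64_t n_embd  = 4096;    // illustrative: LLaMA-7B-class model

    // Each of K and V stores n_embd values per token per layer.
    const int64_t n_elements = 2 * n_layer * (int64_t) n_ctx * n_embd;
    const int64_t bytes_per  = memory_f16 ? 2 : 4; // f16 vs f32 element size

    std::printf("KV cache: %.1f MiB\n",
                n_elements * bytes_per / (1024.0 * 1024.0));
    return 0;
}

With these illustrative numbers the cache comes to 256 MiB in F16 versus 512 MiB in F32 at the default n_ctx of 512, so an F32 cache sized with F16 assumptions would be allocated at half the space it actually needs.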