From 7a9b6c3a8bdc1cb75fefc826dfaa7331eb63695d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 24 Mar 2023 23:17:37 +0200
Subject: Reduce memory usage and allocate enough memory for largest context
 (#473)

* Reduce memory usage and allocate enough memory for large contexts

* Simpler scratch buffer usage

* Reenable BLAS for quantized mul_mat

* Fix number of layers in 30B and 65B

* Fix KV cache size for F32
---
 utils.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'utils.h')

diff --git a/utils.h b/utils.h
index cf914990..d469bc6a 100644
--- a/utils.h
+++ b/utils.h
@@ -14,12 +14,13 @@
 //
 
 struct gpt_params {
-    int32_t seed      = -1; // RNG seed
+    int32_t seed          = -1;  // RNG seed
     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_predict = 128; // new tokens to predict
-    int32_t repeat_last_n = 64;  // last n tokens to penalize
-    int32_t n_parts   = -1; // amount of model parts (-1 = determine from model dimensions)
-    int32_t n_ctx = 512; //context size
+    int32_t n_predict     = 128; // new tokens to predict
+    int32_t repeat_last_n = 64;  // last n tokens to penalize
+    int32_t n_parts       = -1;  // amount of model parts (-1 = determine from model dimensions)
+    int32_t n_ctx         = 512; // context size
+    int32_t n_batch       = 8;   // batch size for prompt processing
 
     // sampling parameters
     int32_t top_k = 40;
@@ -27,15 +28,13 @@ struct gpt_params {
     float   temp  = 0.80f;
     float   repeat_penalty  = 1.10f;
 
-    int32_t n_batch = 8; // batch size for prompt processing
-
     std::string model = "models/lamma-7B/ggml-model.bin"; // model path
     std::string prompt = "";
 
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
 
-    bool memory_f16 = false; // use f16 instead of f32 for memory kv
+    bool memory_f16 = true;  // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode
@@ -47,6 +46,7 @@ struct gpt_params {
     bool ignore_eos = false; // do not stop generating after eos
     bool perplexity = false; // compute perplexity over the prompt
     bool use_mlock = false; // use mlock to keep model in memory
+    bool mem_test = false; // compute maximum memory usage
 };
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
--
cgit v1.2.3
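
A note on why the memory_f16 default flip and the "Fix KV cache size for F32" bullet belong in a memory-usage commit: each KV cache element costs 2 bytes in F16 but 4 bytes in F32, so an F32 cache needs twice the allocation. The sketch below is not part of the patch; it is a minimal, hypothetical C++ illustration of how the cache footprint scales with n_ctx and memory_f16, using illustrative 7B-class model dimensions (n_layer, n_embd are assumptions, not values from this diff).

// kv_cache_size.cc -- hypothetical sketch, not from the patch.
#include <cstdint>
#include <cstdio>

int main() {
    const int32_t n_ctx      = 512;  // gpt_params::n_ctx default
    const bool    memory_f16 = true; // gpt_params::memory_f16 (new default)

    const int64_t n_layer = 32;      // illustrative: LLaMA-7B-class model
    const int64_t n_embd  = 4096;    // illustrative: LLaMA-7B-class model

    // Each of K and V stores n_embd values per token per layer.
    const int64_t n_elements = 2 * n_layer * (int64_t) n_ctx * n_embd;
    const int64_t bytes_per  = memory_f16 ? 2 : 4; // f16 vs f32 element size

    std::printf("KV cache: %.1f MiB\n",
                n_elements * bytes_per / (1024.0 * 1024.0));
    return 0;
}

With these illustrative numbers the cache comes to 256 MiB in F16 versus 512 MiB in F32 at the default n_ctx of 512, so an F32 cache sized with F16 assumptions would be allocated at half the space it actually needs.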