From 1442677f92e45a475be7b4d056e3633d1d6f813b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 4 Jun 2024 21:23:39 +0300
Subject: common : refactor cli arg parsing (#7675)

* common : gpt_params_parse do not print usage

* common : rework usage print (wip)

* common : valign

* common : rework print_usage

* infill : remove cfg support

* common : reorder args

* server : deduplicate parameters

ggml-ci

* common : add missing header

ggml-ci

* common : remove --random-prompt usages

ggml-ci

* examples : migrate to gpt_params

ggml-ci

* batched-bench : migrate to gpt_params

* retrieval : migrate to gpt_params

* common : change defaults for escape and n_ctx

* common : remove chatml and instruct params

ggml-ci

* common : passkey use gpt_params
---
 examples/batched/batched.cpp | 73 +++++++++++++++------------------------------
 1 file changed, 25 insertions(+), 48 deletions(-)

(limited to 'examples/batched/batched.cpp')

diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index 591bc6e5..62d9b144 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -7,48 +7,31 @@
 #include <string>
 #include <vector>
 
-int main(int argc, char ** argv) {
-    gpt_params params;
-
-    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
-        return 1 ;
-    }
-
-    // number of parallel batches
-    int n_parallel = 1;
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
 
-    // total length of the sequences including the prompt
-    int n_len = 32;
-
-    // number of layers to offload to the GPU
-    int n_gpu_layers = 0;
-
-    if (argc >= 2) {
-        params.model = argv[1];
-    }
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
+    LOG_TEE("\n");
+}
 
-    if (argc >= 3) {
-        params.prompt = argv[2];
-    }
+int main(int argc, char ** argv) {
+    gpt_params params;
 
-    if (argc >= 4) {
-        n_parallel = std::atoi(argv[3]);
-    }
+    params.prompt = "Hello my name is";
+    params.n_predict = 32;
 
-    if (argc >= 5) {
-        n_len = std::atoi(argv[4]);
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
+        return 1;
     }
 
-    if (argc >= 6) {
-        n_gpu_layers = std::atoi(argv[5]);
-    }
 
-    if (params.prompt.empty()) {
-        params.prompt = "Hello my name is";
-    }
+    // number of parallel batches
+    int n_parallel = params.n_parallel;
 
-    string_process_escapes(params.prompt);
+    // total length of the sequences including the prompt
+    int n_predict = 32;
 
     // init LLM
 
@@ -57,9 +40,7 @@ int main(int argc, char ** argv) {
 
     // initialize the model
 
-    llama_model_params model_params = llama_model_default_params();
-
-    model_params.n_gpu_layers = n_gpu_layers;
+    llama_model_params model_params = llama_model_params_from_gpt_params(params);
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
 
@@ -73,18 +54,14 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> tokens_list;
     tokens_list = ::llama_tokenize(model, params.prompt, true);
 
-    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
+    const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
 
     // initialize the context
 
-    llama_context_params ctx_params = llama_context_default_params();
+    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
 
-    ctx_params.seed  = 1234;
     ctx_params.n_ctx = n_kv_req;
-    ctx_params.n_batch = std::max(n_len, n_parallel);
-    ctx_params.n_seq_max = n_parallel;
-    ctx_params.n_threads       = params.n_threads;
-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    ctx_params.n_batch = std::max(n_predict, n_parallel);
 
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
 
@@ -93,9 +70,9 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    const int n_ctx    = llama_n_ctx(ctx);
+    const int n_ctx = llama_n_ctx(ctx);
 
-    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+    LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
 
     // make sure the KV cache is big enough to hold all the prompt and generated tokens
     if (n_kv_req > n_ctx) {
@@ -156,7 +133,7 @@ int main(int argc, char ** argv) {
 
     const auto t_main_start = ggml_time_us();
 
-    while (n_cur <= n_len) {
+    while (n_cur <= n_predict) {
         // prepare the next batch
         llama_batch_clear(batch);
 
@@ -192,7 +169,7 @@ int main(int argc, char ** argv) {
             //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
 
             // is it an end of generation? -> mark the stream as finished
-            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
+            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
                 i_batch[i] = -1;
                 LOG_TEE("\n");
                 if (n_parallel > 1) {
--
cgit v1.2.3