From 16bc66d9479edd5ee12ec734973554d4493c5dfa Mon Sep 17 00:00:00 2001
From: slaren
Date: Thu, 28 Sep 2023 21:42:38 +0200
Subject: llama.cpp : split llama_context_params into model and context params (#3301)

* llama.cpp : split llama_context_params into model and context params

ggml-ci

* fix metal build

* fix freq_base/scale default to model value

* llama-bench : keep the same model between tests when possible

* move n_threads to llama_context_params, add n_threads_batch

* fix mpi build

* remove kv_size(), cuda scratch fixes

* remove low-vram option

* add n_threads_batch to system info, refactor to get_system_info()

* add documentation about --threads-batch to the READMEs

* llama-bench fix

* main : fix rope freq/scale warning

* llama.cpp : add llama_get_model
  common : add llama_tokenize from model

* remove duplicated ctx/model functions

ggml-ci

* cuda : print total VRAM used
---
 examples/llama-bench/llama-bench.cpp | 157 ++++++++++++++++++++++++-----------
 1 file changed, 109 insertions(+), 48 deletions(-)

(limited to 'examples/llama-bench/llama-bench.cpp')

diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 058e34d5..93bb0c8b 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -132,7 +132,6 @@ struct cmd_params {
     std::vector<int> n_gpu_layers;
     std::vector<int> main_gpu;
     std::vector<bool> mul_mat_q;
-    std::vector<bool> low_vram;
     std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
     int reps;
     bool verbose;
@@ -149,7 +148,6 @@ static const cmd_params cmd_params_defaults = {
     /* n_gpu_layers  */ {99},
     /* main_gpu      */ {0},
     /* mul_mat_q     */ {true},
-    /* low_vram      */ {false},
     /* tensor_split  */ {{}},
     /* reps          */ 5,
     /* verbose       */ false,
@@ -167,9 +165,8 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -b, --batch-size <n>                (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
     printf("  --memory-f32 <0|1>                  (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
     printf("  -t, --threads <n>                   (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
-    printf("  -ngl N, --n-gpu-layers <n>          (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
-    printf("  -mg i, --main-gpu <i>               (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
-    printf("  -lv, --low-vram <0|1>               (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
+    printf("  -ngl, --n-gpu-layers <n>            (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+    printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf("  -mmq, --mul-mat-q <0|1>             (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
     printf("  -ts, --tensor_split <ts0/ts1/..>    \n");
     printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
@@ -255,13 +252,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
             }
             params.main_gpu = split<int>(argv[i], split_delim);
-        } else if (arg == "-lv" || arg == "--low-vram") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = split<bool>(argv[i], split_delim);
-            params.low_vram.insert(params.low_vram.end(), p.begin(), p.end());
         } else if (arg == "-mmq" || arg == "--mul-mat-q") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -336,7 +326,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
     if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
     if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
-    if (params.low_vram.empty())     { params.low_vram = cmd_params_defaults.low_vram; }
     if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
     if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
 
@@ -353,21 +342,34 @@ struct cmd_params_instance {
     int n_gpu_layers;
     int main_gpu;
     bool mul_mat_q;
-    bool low_vram;
     std::array<float, LLAMA_MAX_DEVICES> tensor_split;
 
-    llama_context_params to_llama_params() const {
-        llama_context_params lparams = llama_context_default_params();
-        lparams.n_ctx = n_prompt + n_gen;
-        lparams.n_batch = n_batch;
-        lparams.f16_kv = !f32_kv;
-        lparams.n_gpu_layers = n_gpu_layers;
-        lparams.main_gpu = main_gpu;
-        lparams.mul_mat_q = mul_mat_q;
-        lparams.low_vram = low_vram;
-        lparams.tensor_split = tensor_split.data();
+    llama_model_params to_llama_mparams() const {
+        llama_model_params mparams = llama_model_default_params();
+
+        mparams.n_gpu_layers = n_gpu_layers;
+        mparams.main_gpu = main_gpu;
+        mparams.tensor_split = tensor_split.data();
+
+        return mparams;
+    }
+
+    bool equal_mparams(const cmd_params_instance & other) const {
+        return model == other.model &&
+               n_gpu_layers == other.n_gpu_layers &&
+               main_gpu == other.main_gpu &&
+               tensor_split == other.tensor_split;
+    }
+
+    llama_context_params to_llama_cparams() const {
+        llama_context_params cparams = llama_context_default_params();
 
-        return lparams;
+        cparams.n_ctx = n_prompt + n_gen;
+        cparams.n_batch = n_batch;
+        cparams.f16_kv = !f32_kv;
+        cparams.mul_mat_q = mul_mat_q;
+
+        return cparams;
     }
 };
 
@@ -375,13 +377,12 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
     std::vector<cmd_params_instance> instances;
 
     for (const auto & m : params.model)
-    for (const auto & nb : params.n_batch)
-    for (const auto & fk : params.f32_kv)
     for (const auto & nl : params.n_gpu_layers)
     for (const auto & mg : params.main_gpu)
-    for (const auto & mmq : params.mul_mat_q)
-    for (const auto & lv : params.low_vram)
     for (const auto & ts : params.tensor_split)
+    for (const auto & nb : params.n_batch)
+    for (const auto & fk : params.f32_kv)
+    for (const auto & mmq : params.mul_mat_q)
     for (const auto & nt : params.n_threads) {
         cmd_params_instance instance = {
             /* .model        = */ m,
@@ -393,7 +394,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
             /* .n_gpu_layers = */ nl,
             /* .main_gpu     = */ mg,
             /* .mul_mat_q    = */ mmq,
-            /* .low_vram     = */ lv,
             /* .tensor_split = */ ts,
         };
         instances.push_back(instance);
@@ -404,6 +404,56 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
     std::vector<cmd_params_instance> instances;
 
+#if 1
+    // this ordering minimizes the number of times that each model needs to be reloaded
+    for (const auto & m : params.model)
+    for (const auto & nl : params.n_gpu_layers)
+    for (const auto & mg : params.main_gpu)
+    for (const auto & ts : params.tensor_split)
+    for (const auto & nb : params.n_batch)
+    for (const auto & fk : params.f32_kv)
+    for (const auto & mmq : params.mul_mat_q)
+    for (const auto & nt : params.n_threads) {
+        for (const auto & n_prompt : params.n_prompt) {
+            if (n_prompt == 0) {
+                continue;
+            }
+            cmd_params_instance instance = {
+                /* .model        = */ m,
+                /* .n_prompt     = */ n_prompt,
+                /* .n_gen        = */ 0,
+                /* .n_batch      = */ nb,
+                /* .f32_kv       = */ fk,
+                /* .n_threads    = */ nt,
+                /* .n_gpu_layers = */ nl,
+                /* .main_gpu     = */ mg,
+                /* .mul_mat_q    = */ mmq,
+                /* .tensor_split = */ ts,
+            };
+            instances.push_back(instance);
+        }
+
+        for (const auto & n_gen : params.n_gen) {
+            if (n_gen == 0) {
+                continue;
+            }
+            cmd_params_instance instance = {
+                /* .model        = */ m,
+                /* .n_prompt     = */ 0,
+                /* .n_gen        = */ n_gen,
+                /* .n_batch      = */ nb,
+                /* .f32_kv       = */ fk,
+                /* .n_threads    = */ nt,
+                /* .n_gpu_layers = */ nl,
+                /* .main_gpu     = */ mg,
+                /* .mul_mat_q    = */ mmq,
+                /* .tensor_split = */ ts,
+            };
+            instances.push_back(instance);
+        }
+    }
+#else
+    // this ordering separates the prompt and generation tests
     for (const auto & n_prompt : params.n_prompt) {
         if (n_prompt == 0) {
             continue;
@@ -419,6 +469,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
         auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0);
         instances.insert(instances.end(), instances_gen.begin(), instances_gen.end());
     }
+#endif
 
     return instances;
 }
@@ -443,7 +494,6 @@ struct test {
     int n_gpu_layers;
     int main_gpu;
     bool mul_mat_q;
-    bool low_vram;
     std::array<float, LLAMA_MAX_DEVICES> tensor_split;
     int n_prompt;
     int n_gen;
@@ -463,7 +513,6 @@ struct test {
         n_gpu_layers = inst.n_gpu_layers;
         main_gpu = inst.main_gpu;
         mul_mat_q = inst.mul_mat_q;
-        low_vram = inst.low_vram;
         tensor_split = inst.tensor_split;
         n_prompt = inst.n_prompt;
         n_gen = inst.n_gen;
@@ -524,7 +573,7 @@ struct test {
         "cpu_info", "gpu_info",
         "model_filename", "model_type", "model_size", "model_n_params",
         "n_batch", "n_threads", "f16_kv",
-        "n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
+        "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
         "n_prompt", "n_gen", "test_time",
         "avg_ns", "stddev_ns",
         "avg_ts", "stddev_ts"
@@ -543,7 +592,7 @@ struct test {
             return INT;
         }
         if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
-            field == "f16_kv" || field == "mul_mat_q" || field == "low_vram") {
+            field == "f16_kv" || field == "mul_mat_q") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -574,7 +623,7 @@ struct test {
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
-            std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
+            std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
             std::to_string(avg_ns()), std::to_string(stdev_ns()),
             std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -766,9 +815,6 @@ struct markdown_printer : public printer {
         if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
             fields.push_back("mul_mat_q");
         }
-        if (params.low_vram.size() > 1 || params.low_vram != cmd_params_defaults.low_vram) {
-            fields.push_back("low_vram");
-        }
         if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
             fields.push_back("tensor_split");
         }
@@ -889,17 +935,23 @@ struct sql_printer : public printer {
 static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
     std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx));
     int n_processed = 0;
+
+    llama_set_n_threads(ctx, n_threads, n_threads);
+
     while (n_processed < n_prompt) {
         int n_tokens = std::min(n_prompt - n_processed, n_batch);
-        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0), n_threads);
+        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
         n_processed += n_tokens;
     }
 }
 
 static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
     llama_token token = llama_token_bos(ctx);
+
+    llama_set_n_threads(ctx, n_threads, n_threads);
+
     for (int i = 0; i < n_gen; i++) {
-        llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0), n_threads);
+        llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
     }
 }
 
@@ -958,17 +1010,25 @@ int main(int argc, char ** argv) {
 
     std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
 
+    llama_model * lmodel = nullptr;
+    const cmd_params_instance * prev_inst = nullptr;
+
     for (const auto & inst : params_instances) {
-        // TODO: keep the model between tests when possible
-        llama_context_params lparams = inst.to_llama_params();
+        // keep the same model between tests when possible
+        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
+            if (lmodel) {
+                llama_free_model(lmodel);
+            }
 
-        llama_model * lmodel = llama_load_model_from_file(inst.model.c_str(), lparams);
-        if (lmodel == NULL) {
-            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
-            return 1;
+            lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
+            if (lmodel == NULL) {
+                fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
+                return 1;
+            }
+            prev_inst = &inst;
         }
 
-        llama_context * ctx = llama_new_context_with_model(lmodel, lparams);
+        llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams());
         if (ctx == NULL) {
             fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
             llama_free_model(lmodel);
@@ -1006,9 +1066,10 @@ int main(int argc, char ** argv) {
         llama_print_timings(ctx);
 
         llama_free(ctx);
-        llama_free_model(lmodel);
     }
 
+    llama_free_model(lmodel);
+
     p->print_footer();
 
     llama_backend_free();
-- 
cgit v1.2.3
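
Usage sketch (not part of the patch): after this change, per-model settings
travel in llama_model_params while per-run settings travel in
llama_context_params, so several contexts can be created against one loaded
model, which is exactly what llama-bench exploits above. The sketch strings
together the calls that appear in this diff; the model path, n_gpu_layers
value, and thread counts are illustrative placeholders, and
llama_backend_init(bool numa) reflects the llama.h signature of this era
rather than anything this patch changes.

#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init(false /* numa */);

    // model params: fixed once the model is loaded, shared by all contexts
    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99; // placeholder: offload all layers

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // context params: can vary between runs without reloading the model
    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx   = 512;
    cparams.n_batch = 512;

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == NULL) {
        fprintf(stderr, "failed to create context\n");
        llama_free_model(model);
        return 1;
    }

    // threads are now set on the context instead of being passed to llama_decode
    llama_set_n_threads(ctx, 4, 4); // placeholder thread counts

    // decode a single BOS token, as the benchmark's test_gen() does
    llama_token token = llama_token_bos(ctx);
    llama_decode(ctx, llama_batch_get_one(&token, 1, 0, 0));

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}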