author    | Georgi Gerganov <ggerganov@gmail.com>                        | 2024-06-04 21:23:39 +0300
committer | GitHub <noreply@github.com>                                  | 2024-06-04 21:23:39 +0300
commit    | 1442677f92e45a475be7b4d056e3633d1d6f813b (patch)
tree      | d9dbb111ccaedc44cba527dbddd90bedd1e04ea8 /common/common.cpp
parent    | 554c247caffed64465f372661f2826640cb10430 (diff)
common : refactor cli arg parsing (#7675)
* common : gpt_params_parse do not print usage
* common : rework usage print (wip)
* common : valign
* common : rework print_usage
* infill : remove cfg support
* common : reorder args
* server : deduplicate parameters
ggml-ci
* common : add missing header
ggml-ci
* common : remove --random-prompt usages
ggml-ci
* examples : migrate to gpt_params
ggml-ci
* batched-bench : migrate to gpt_params
* retrieval : migrate to gpt_params
* common : change defaults for escape and n_ctx
* common : remove chatml and instruct params
ggml-ci
* common : passkey use gpt_params
Diffstat (limited to 'common/common.cpp')
-rw-r--r-- | common/common.cpp | 823 |
1 file changed, 563 insertions(+), 260 deletions(-)
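Before the hunks themselves, a minimal caller-side sketch of the reworked flow (not part of this diff; the `main` entry point and the example prompt are illustrative). As the changes below show, `gpt_params_parse` no longer prints usage or calls `exit()`: on failure it restores the caller's default `params` and returns `false`, so each example prints usage itself via `gpt_params_print_usage`.

```cpp
#include "common.h"  // assumed to declare gpt_params, gpt_params_parse, gpt_params_print_usage

int main(int argc, char ** argv) {
    gpt_params params;
    params.prompt = "Hello my name is";  // example-specific defaults survive a failed parse

    if (!gpt_params_parse(argc, argv, params)) {
        // usage printing is now the caller's responsibility (also reached for -h/--help/--usage)
        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

    // ... run the example with params ...
    return 0;
}
```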
diff --git a/common/common.cpp b/common/common.cpp index df583db8..c8df9a4c 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -7,20 +7,21 @@ #include <algorithm> #include <cassert> +#include <cinttypes> #include <cmath> +#include <codecvt> +#include <cstdarg> #include <cstring> #include <ctime> #include <fstream> -#include <iterator> #include <iostream> +#include <iterator> #include <regex> #include <sstream> #include <string> #include <unordered_map> #include <unordered_set> #include <vector> -#include <cinttypes> -#include <codecvt> #if defined(__APPLE__) && defined(__MACH__) #include <sys/types.h> @@ -237,10 +238,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } } - if (params.prompt_cache_all && - (params.interactive || params.interactive_first || - params.instruct)) { - + if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } @@ -265,22 +263,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { - bool result = true; + const auto params_org = params; // the example can modify the default params + try { - if (!gpt_params_parse_ex(argc, argv, params)) { - gpt_params_print_usage(argc, argv, gpt_params()); - exit(0); + if (!gpt_params_parse_ex(argc, argv, params) || params.usage) { + params = params_org; + params.usage = true; + return false; } - } - catch (const std::invalid_argument & ex) { + } catch (const std::invalid_argument & ex) { fprintf(stderr, "%s\n", ex.what()); - gpt_params_print_usage(argc, argv, gpt_params()); - exit(1); + return false; } - return result; + + return true; } bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { + const char split_delim = ','; + llama_sampling_params & sparams = params.sparams; if (arg == "-s" || arg == "--seed") { @@ -288,7 +289,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } - // This is temporary, in the future the samplign state will be moved fully to llama_sampling_context. + // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context. 
params.seed = std::stoul(argv[i]); sparams.seed = std::stoul(argv[i]); return true; @@ -349,6 +350,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.escape = true; return true; } + if (arg == "--no-escape") { + params.escape = false; + return true; + } if (arg == "--prompt-cache") { if (++i >= argc) { invalid_param = true; @@ -403,7 +408,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } return true; } - if (arg == "-n" || arg == "--n-predict") { + if (arg == "-n" || arg == "--predict" || arg == "--n-predict") { if (++i >= argc) { invalid_param = true; return true; @@ -900,34 +905,22 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.interactive = true; return true; } - if (arg == "--interactive-specials") { - params.interactive_specials = true; - return true; - } - if (arg == "--special") { + if (arg == "-sp" || arg == "--special") { params.special = true; return true; } - if (arg == "--embedding") { + if (arg == "--embedding" || arg == "--embeddings") { params.embedding = true; return true; } - if (arg == "--interactive-first") { + if (arg == "-if" || arg == "--interactive-first") { params.interactive_first = true; return true; } - if (arg == "-ins" || arg == "--instruct") { - params.instruct = true; - return true; - } if (arg == "-cnv" || arg == "--conversation") { params.conversation = true; return true; } - if (arg == "-cml" || arg == "--chatml") { - params.chatml = true; - return true; - } if (arg == "--infill") { params.infill = true; return true; @@ -964,7 +957,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.flash_attn = true; return true; } - if (arg == "--color") { + if (arg == "-co" || arg == "--color") { params.use_color = true; return true; } @@ -972,26 +965,26 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.use_mlock = true; return true; } - if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { if (++i >= argc) { invalid_param = true; return true; } params.n_gpu_layers = std::stoi(argv[i]); if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } return true; } - if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { + if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") { if (++i >= argc) { invalid_param = true; return true; } params.n_gpu_layers_draft = std::stoi(argv[i]); if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } return true; @@ -1087,6 +1080,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa else { invalid_param = true; } return true; } + if (arg == "-v" || arg == "--verbose") { + params.verbose = true; + return true; + } if (arg == "--verbose-prompt") { params.verbose_prompt 
= true; return true; @@ -1151,24 +1148,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.ppl_stride = std::stoi(argv[i]); return true; } - if (arg == "-ptc" || arg == "--print-token-count") { + if (arg == "--ppl-output-type") { if (++i >= argc) { invalid_param = true; return true; } - params.n_print = std::stoi(argv[i]); - return true; - } - if (arg == "--check-tensors") { - params.check_tensors = true; + params.ppl_output_type = std::stoi(argv[i]); return true; } - if (arg == "--ppl-output-type") { + if (arg == "-ptc" || arg == "--print-token-count") { if (++i >= argc) { invalid_param = true; return true; } - params.ppl_output_type = std::stoi(argv[i]); + params.n_print = std::stoi(argv[i]); + return true; + } + if (arg == "--check-tensors") { + params.check_tensors = true; return true; } if (arg == "--hellaswag") { @@ -1242,19 +1239,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } return true; } - if (arg == "-h" || arg == "--help") { - gpt_params_print_usage(argc, argv, gpt_params()); - exit(0); + if (arg == "-h" || arg == "--help" || arg == "--usage" ) { + params.usage = true; + return true; } if (arg == "--version") { fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); } - if (arg == "--random-prompt") { - params.random_prompt = true; - return true; - } if (arg == "--in-prefix-bos") { params.input_prefix_bos = true; return true; @@ -1321,6 +1314,229 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } return true; } + if (arg == "--host") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.hostname = argv[i]; + return true; + } + if (arg == "--port") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.port = std::stoi(argv[i]); + return true; + } + if (arg == "--path") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.public_path = argv[i]; + return true; + } + if (arg == "--api-key") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.api_keys.push_back(argv[i]); + return true; + } + if (arg == "--api-key-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream key_file(argv[i]); + if (!key_file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + std::string key; + while (std::getline(key_file, key)) { + if (!key.empty()) { + params.api_keys.push_back(key); + } + } + key_file.close(); + return true; + } + if (arg == "--ssl-key-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.ssl_file_key = argv[i]; + return true; + } + if (arg == "--ssl-cert-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.ssl_file_cert = argv[i]; + return true; + } + if (arg == "--timeout" || arg == "-to") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.timeout_read = std::stoi(argv[i]); + params.timeout_write = std::stoi(argv[i]); + return true; + } + if (arg == "-spf" || arg == "--system-prompt-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + std::string system_prompt; + std::copy( + std::istreambuf_iterator<char>(file), + 
std::istreambuf_iterator<char>(), + std::back_inserter(system_prompt) + ); + params.system_prompt = system_prompt; + return true; + } + if (arg == "--log-format") { + if (++i >= argc) { + invalid_param = true; + return true; + } + if (std::strcmp(argv[i], "json") == 0) { + params.log_json = true; + } else if (std::strcmp(argv[i], "text") == 0) { + params.log_json = false; + } else { + invalid_param = true; + return true; + } + return true; + } + if (arg == "--no-slots") { + params.endpoint_slots = false; + return true; + } + if (arg == "--metrics") { + params.endpoint_metrics = true; + return true; + } + if (arg == "--slot-save-path") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.slot_save_path = argv[i]; + // if doesn't end with DIRECTORY_SEPARATOR, add it + if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { + params.slot_save_path += DIRECTORY_SEPARATOR; + } + return true; + } + if (arg == "--chat-template") { + if (++i >= argc) { + invalid_param = true; + return true; + } + if (!llama_chat_verify_template(argv[i])) { + fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]); + fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n"); + invalid_param = true; + return true; + } + params.chat_template = argv[i]; + return true; + } + if (arg == "-pps") { + params.is_pp_shared = true; + return true; + } + if (arg == "-npp") { + if (++i >= argc) { + invalid_param = true; + return true; + } + auto p = string_split<int>(argv[i], split_delim); + params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); + return true; + } + if (arg == "-ntg") { + if (++i >= argc) { + invalid_param = true; + return true; + } + auto p = string_split<int>(argv[i], split_delim); + params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); + return true; + } + if (arg == "-npl") { + if (++i >= argc) { + invalid_param = true; + return true; + } + auto p = string_split<int>(argv[i], split_delim); + params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); + return true; + } + if (arg == "--context-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i], std::ios::binary); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + params.context_files.push_back(argv[i]); + return true; + } + if (arg == "--chunk-size") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.chunk_size = std::stoi(argv[i]); + return true; + } + if (arg == "--chunk-separator") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.chunk_separator = argv[i]; + return true; + } + if (arg == "--junk") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_junk = std::stoi(argv[i]); + return true; + } + if (arg == "--pos") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.i_pos = std::stoi(argv[i]); + return true; + } #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters if (log_param_single_parse(argv[i])) { @@ -1348,6 +1564,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return false; } +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) 
__attribute__((format(printf, __VA_ARGS__))) +#endif +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) +#endif + void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { const llama_sampling_params & sparams = params.sparams; @@ -1359,198 +1585,290 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param } sampler_type_names.pop_back(); - printf("\n"); - printf("usage: %s [options]\n", argv[0]); - printf("\n"); - printf("options:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" --version show version and build info\n"); - printf(" -i, --interactive run in interactive mode\n"); - printf(" --special special tokens output enabled\n"); - printf(" --interactive-specials allow special tokens in user text, in interactive mode\n"); - printf(" --interactive-first run in interactive mode and wait for input right away\n"); - printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n"); - printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); - printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n"); - printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); - printf(" -r PROMPT, --reverse-prompt PROMPT\n"); - printf(" halt generation at PROMPT, return control in interactive mode\n"); - printf(" (can be specified more than once for multiple prompts).\n"); - printf(" --color colorise output to distinguish prompt and user input from generations\n"); - printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); - printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads); - printf(" -tb N, --threads-batch N\n"); - printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n"); - printf(" -td N, --threads-draft N"); - printf(" number of threads to use during generation (default: same as --threads)\n"); - printf(" -tbd N, --threads-batch-draft N\n"); - printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n"); - printf(" -p PROMPT, --prompt PROMPT\n"); - printf(" prompt to start generation with (default: empty)\n"); - printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); - printf(" --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n"); - printf(" --prompt-cache-all if specified, saves user input and generations to cache as well.\n"); - printf(" not supported with --interactive or other interactive options\n"); - printf(" --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n"); - printf(" --random-prompt start with a randomized prompt.\n"); - printf(" --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n"); - printf(" --in-prefix STRING string to prefix user inputs with (default: empty)\n"); - printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); - printf(" -f FNAME, --file FNAME\n"); - printf(" prompt file to start generation.\n"); - printf(" -bf FNAME, --binary-file FNAME\n"); - printf(" binary file containing multiple choice tasks.\n"); - printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); - printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from 
model)\n", params.n_ctx); - printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch); - printf(" -ub N, --ubatch-size N\n"); - printf(" physical maximum batch size (default: %d)\n", params.n_ubatch); - printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n"); - printf(" (default: %s)\n", sampler_type_names.c_str()); - printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str()); - printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k); - printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); - printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p); - printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z); - printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p); - printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n); - printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat); - printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present); - printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq); - printf(" --dynatemp-range N dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range); - printf(" --dynatemp-exp N dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent); - printf(" --mirostat N use Mirostat sampling.\n"); - printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); - printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat); - printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta); - printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau); - printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n"); - printf(" modifies the likelihood of token appearing in the completion,\n"); - printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"); - printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); - printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n"); - printf(" --grammar-file FNAME file to read grammar from\n"); - printf(" -j SCHEMA, --json-schema SCHEMA\n"); - printf(" JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n"); - printf(" For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n"); - printf(" --cfg-negative-prompt PROMPT\n"); - printf(" negative prompt to use for guidance. (default: empty)\n"); - printf(" --cfg-negative-prompt-file FNAME\n"); - printf(" negative prompt file to use for guidance. 
(default: empty)\n"); - printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale); - printf(" --rope-scaling {none,linear,yarn}\n"); - printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); - printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n"); - printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); - printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n"); - printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); - printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); - printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); - printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); - printf(" --pooling {none,mean,cls}\n"); - printf(" pooling type for embeddings, use model default if unspecified\n"); - printf(" -dt N, --defrag-thold N\n"); - printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold); - printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); - printf(" --penalize-nl penalize newline tokens\n"); - printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp); - printf(" --all-logits return logits for all tokens in the batch (default: disabled)\n"); - printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); - printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); - printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n"); - printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks); - printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n"); - printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks); - printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base\n"); - printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); - printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); - printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); - printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel); - printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences); - printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split); - printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); - printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled"); - printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. 
see examples/llava/README.md\n"); - printf(" --image IMAGE_FILE path to an image file. use with multimodal models. Specify multiple times for batching\n"); + struct option_info { + LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5) + option_info(const std::string & tags, const char * args, const char * desc, ...) : tags(tags), args(args), desc(desc) { + va_list args_list; + va_start(args_list, desc); + char buffer[1024]; + vsnprintf(buffer, sizeof(buffer), desc, args_list); + va_end(args_list); + this->desc = buffer; + } + + option_info(const std::string & grp) : grp(grp) {} + + std::string tags; + std::string args; + std::string desc; + std::string grp; + }; + + std::vector<option_info> options; + + // TODO: filter by tags + + options.push_back({ "general" }); + options.push_back({ "*", "-h, --help, --usage", "print usage and exit" }); + options.push_back({ "*", " --version", "show version and build info" }); + options.push_back({ "*", "-v, --verbose", "print verbose information" }); + options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" }); + options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" }); + options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" }); + options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed }); + options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads }); + options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" }); + options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" }); + options.push_back({ "speculative", "-tbd, --threads-batch-draft N", + "number of threads to use during batch and prompt processing (default: same as --threads-draft)" }); + options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft }); + options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split }); + options.push_back({ "*", "-lcs, --lookup-cache-static FNAME", + "path to static lookup cache to use for lookup decoding (not updated by generation)" }); + options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME", + "path to dynamic lookup cache to use for lookup decoding (updated by generation)" }); + + options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx }); + options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict }); + options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch }); + options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch }); + options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep }); + options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks }); + options.push_back({ "*", "-fa, 
--flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" }); + options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() }); + options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" }); + options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" }); + options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" }); + options.push_back({ "*", " --no-escape", "do not process escape sequences" }); + options.push_back({ "main", "-ptc, --print-token-count N", "print token count every N tokens (default: %d)", params.n_print }); + options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" }); + options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n" + "not supported with --interactive or other interactive options" }); + options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" }); + options.push_back({ "main", "-r, --reverse-prompt PROMPT", + "halt generation at PROMPT, return control in interactive mode\n" + "can be specified more than once for multiple prompts" }); + options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" }); + options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" }); + options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" }); + options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" }); + options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" }); + options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" }); + options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" }); + options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" }); + + options.push_back({ "sampling" }); + options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n" + "(default: %s)", sampler_type_names.c_str() }); + options.push_back({ "*", " --sampling-seq SEQUENCE", + "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() }); + options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" }); + options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? 
"true" : "false" }); + options.push_back({ "*", " --temp N", "temperature (default: %.1f)", (double)sparams.temp }); + options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k }); + options.push_back({ "*", " --top-p N", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p }); + options.push_back({ "*", " --min-p N", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p }); + options.push_back({ "*", " --tfs N", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z }); + options.push_back({ "*", " --typical N", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p }); + options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n }); + options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat }); + options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present }); + options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq }); + options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range }); + options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent }); + options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n" + "Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" + "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat }); + options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta }); + options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau }); + options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n" + "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" + "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" }); + options.push_back({ "main", " --cfg-negative-prompt PROMPT", + "negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() }); + options.push_back({ "main", " --cfg-negative-prompt-file FNAME", + "negative prompt file to use for guidance" }); + options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale }); + + options.push_back({ "grammar" }); + options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() }); + options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" }); + options.push_back({ "*", "-j, --json-schema SCHEMA", + "JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object\n" + "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" }); + + options.push_back({ "embedding" }); + options.push_back({ "embedding", " --pooling {none,mean,cls}", + "pooling type for embeddings, use model default if unspecified" }); + + options.push_back({ "context hacking" }); + options.push_back({ "*", " --rope-scaling {none,linear,yarn}", + "RoPE frequency scaling method, defaults to linear unless specified by the model" }); + options.push_back({ "*", " --rope-scale N", "RoPE context scaling factor, expands context by a factor of N" }); + options.push_back({ "*", " --rope-freq-base N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" }); + options.push_back({ "*", " --rope-freq-scale N", "RoPE frequency scaling factor, expands context by a factor of 1/N" }); + options.push_back({ "*", " --yarn-orig-ctx N", "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx }); + options.push_back({ "*", " --yarn-ext-factor N", "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor }); + options.push_back({ "*", " --yarn-attn-factor N", "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor }); + options.push_back({ "*", " --yarn-beta-slow N", "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow }); + options.push_back({ "*", " --yarn-beta-fast N", "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast }); + options.push_back({ "*", "-gan, --grp-attn-n N", "group-attention factor (default: %d)", params.grp_attn_n }); + options.push_back({ "*", "-gaw, --grp-attn-w N", "group-attention width (default: %.1f)", (double)params.grp_attn_w }); + options.push_back({ "*", "-dkvc, --dump-kv-cache", "verbose print of the KV cache" }); + options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" }); + options.push_back({ "*", "-ctk, --cache-type-k TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() }); + options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() }); + + options.push_back({ "perplexity" }); + options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? 
"true" : "false" }); + options.push_back({ "perplexity", " --hellaswag", "compute HellaSwag score over random tasks from datafile supplied with -f" }); + options.push_back({ "perplexity", " --hellaswag-tasks N", "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks }); + options.push_back({ "perplexity", " --winogrande", "compute Winogrande score over random tasks from datafile supplied with -f" }); + options.push_back({ "perplexity", " --winogrande-tasks N", "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks }); + options.push_back({ "perplexity", " --multiple-choice", "compute multiple choice score over random tasks from datafile supplied with -f" }); + options.push_back({ "perplexity", " --multiple-choice-tasks N", + "number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks }); + options.push_back({ "perplexity", " --kl-divergence", "computes KL-divergence to logits provided via --kl-divergence-base" }); + options.push_back({ "perplexity", " --ppl-stride N", "stride for perplexity calculation (default: %d)", params.ppl_stride }); + options.push_back({ "perplexity", " --ppl-output-type {0,1}", + "output type for perplexity calculation (default: %d)", params.ppl_output_type }); + + options.push_back({ "parallel" }); + options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold }); + options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel }); + options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences }); + options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" }); + + options.push_back({ "multi-modality" }); + options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" }); + options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. 
Specify multiple times for batching" }); + + options.push_back({ "backend" }); + options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" }); if (llama_supports_mlock()) { - printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); + options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" }); } if (llama_supports_mmap()) { - printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); - } - printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n"); - printf(" - distribute: spread execution evenly over all nodes\n"); - printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n"); - printf(" - numactl: use the CPU map provided by numactl\n"); - printf(" if run without this previously, it is recommended to drop the system page cache before using this\n"); - printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n"); + options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" }); + } + options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n" + " - distribute: spread execution evenly over all nodes\n" + " - isolate: only spawn threads on CPUs on the node that execution started on\n" + " - numactl: use the CPU map provided by numactl\n" + "if run without this previously, it is recommended to drop the system page cache before using this\n" + "see https://github.com/ggerganov/llama.cpp/issues/1437" }); + if (llama_supports_gpu_offload()) { - printf(" -ngl N, --n-gpu-layers N\n"); - printf(" number of layers to store in VRAM\n"); - printf(" -ngld N, --n-gpu-layers-draft N\n"); - printf(" number of layers to store in VRAM for the draft model\n"); - printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); - printf(" how to split the model across multiple GPUs, one of:\n"); - printf(" - none: use one GPU only\n"); - printf(" - layer (default): split layers and KV across GPUs\n"); - printf(" - row: split rows across GPUs\n"); - printf(" -ts SPLIT, --tensor-split SPLIT\n"); - printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); - printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); - printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); - } - printf(" --rpc SERVERS comma separated list of RPC servers\n"); - printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false"); - printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? 
"true" : "false"); - printf(" -gan N, --grp-attn-n N\n"); - printf(" group-attention factor (default: %d)\n", params.grp_attn_n); - printf(" -gaw N, --grp-attn-w N\n"); - printf(" group-attention width (default: %.1f)\n", (double)params.grp_attn_w); - printf(" -dkvc, --dump-kv-cache\n"); - printf(" verbose print of the KV cache\n"); - printf(" -nkvo, --no-kv-offload\n"); - printf(" disable KV offload\n"); - printf(" -ctk TYPE, --cache-type-k TYPE\n"); - printf(" KV cache data type for K (default: %s)\n", params.cache_type_k.c_str()); - printf(" -ctv TYPE, --cache-type-v TYPE\n"); - printf(" KV cache data type for V (default: %s)\n", params.cache_type_v.c_str()); - printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); - printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); - printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); - printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); - printf(" --control-vector FNAME\n"); - printf(" add a control vector\n"); - printf(" --control-vector-scaled FNAME S\n"); - printf(" add a control vector with user defined scaling S\n"); - printf(" --control-vector-layer-range START END\n"); - printf(" layer range to apply the control vector(s) to, start and end inclusive\n"); - printf(" -m FNAME, --model FNAME\n"); - printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH); - printf(" -md FNAME, --model-draft FNAME\n"); - printf(" draft model for speculative decoding (default: unused)\n"); - printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); - printf(" model download url (default: unused)\n"); - printf(" -hfr REPO, --hf-repo REPO\n"); - printf(" Hugging Face model repository (default: unused)\n"); - printf(" -hff FILE, --hf-file FILE\n"); - printf(" Hugging Face model file (default: unused)\n"); - printf(" -ld LOGDIR, --logdir LOGDIR\n"); - printf(" path under which to save YAML logs (no logging if unset)\n"); - printf(" -lcs FNAME, --lookup-cache-static FNAME\n"); - printf(" path to static lookup cache to use for lookup decoding (not updated by generation)\n"); - printf(" -lcd FNAME, --lookup-cache-dynamic FNAME\n"); - printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n"); - printf(" --override-kv KEY=TYPE:VALUE\n"); - printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); - printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); - printf(" -ptc N, --print-token-count N\n"); - printf(" print token count every N tokens (default: %d)\n", params.n_print); - printf(" --check-tensors check model tensor data for invalid values\n"); - printf("\n"); + options.push_back({ "*", "-ngl, --gpu-layers N", + "number of layers to store in VRAM" }); + options.push_back({ "*", "-ngld, --gpu-layers-draft N", + "number of layers to store in VRAM for the draft model" }); + options.push_back({ "*", "-sm, --split-mode SPLIT_MODE", + "how to split the model across multiple GPUs, one of:\n" + " - none: use one GPU only\n" + " - layer (default): split layers and KV across GPUs\n" + " - row: split rows across GPUs" }); + options.push_back({ "*", "-ts, --tensor-split SPLIT", + "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 
3,1" }); + options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n" + "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu }); + } + + options.push_back({ "model" }); + options.push_back({ "*", " --check-tensors", "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false" }); + options.push_back({ "*", " --override-kv KEY=TYPE:VALUE", + "advanced option to override model metadata by key. may be specified multiple times.\n" + "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" }); + options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" }); + options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" }); + options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" }); + options.push_back({ "*", " --control-vector FNAME", "add a control vector" }); + options.push_back({ "*", " --control-vector-scaled FNAME SCALE", + "add a control vector with user defined scaling SCALE" }); + options.push_back({ "*", " --control-vector-layer-range START END", + "layer range to apply the control vector(s) to, start and end inclusive" }); + options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n" + "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH }); + options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" }); + options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" }); + options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" }); + options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" }); + + options.push_back({ "retrieval" }); + options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" }); + options.push_back({ "retrieval", " --chunk-size N", "minimum length of embedded text chunks (default: %d)", params.chunk_size }); + options.push_back({ "retrieval", " --chunk-separator STRING", + "separator between chunks (default: '%s')", params.chunk_separator.c_str() }); + + options.push_back({ "passkey" }); + options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk }); + options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos }); + + options.push_back({ "bench" }); + options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? 
"true" : "false" }); + options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" }); + options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" }); + options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" }); + + options.push_back({ "server" }); + options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() }); + options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port }); + options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() }); + options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" }); + options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" }); + options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" }); + options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" }); + options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" }); + options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read }); + options.push_back({ "server", " --system-prompt-file FNAME", + "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" }); + options.push_back({ "server", " --log-format {text,json}", + "log output format: json or text (default: json)" }); + options.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled" }); + options.push_back({ "server", " --no-slots", "disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled" }); + options.push_back({ "server", " --slot-save-path PATH", "path to save slot kv cache (default: disabled)" }); + options.push_back({ "server", " --chat-template JINJA_TEMPLATE", + "set custom jinja chat template (default: template taken from model's metadata)\n" + "only commonly used templates are accepted:\n" + "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); + #ifndef LOG_DISABLE_LOGS - log_print_usage(); + options.push_back({ "logging" }); + options.push_back({ "*", " --simple-io", "use basic IO for better compatibility in subprocesses and limited consoles" }); + options.push_back({ "*", "-ld, --logdir LOGDIR", "path under which to save YAML logs (no logging if unset)" }); + options.push_back({ "logging", " --log-test", "Run simple logging test" }); + options.push_back({ "logging", " --log-disable", "Disable trace logs" }); + options.push_back({ "logging", " --log-enable", "Enable trace logs" }); + options.push_back({ "logging", " --log-file FNAME", "Specify a log filename (without extension)" }); + options.push_back({ "logging", " --log-new", "Create a separate new log file on start. " + "Each log file will have unique name: \"<name>.<ID>.log\"" }); + options.push_back({ "logging", " --log-append", "Don't truncate the old log file." 
}); #endif // LOG_DISABLE_LOGS + + printf("usage: %s [options]\n", argv[0]); + + for (const auto & o : options) { + if (!o.grp.empty()) { + printf("\n%s:\n\n", o.grp.c_str()); + continue; + } + printf(" %-32s", o.args.c_str()); + if (o.args.length() > 30) { + printf("\n%34s", ""); + } + + const auto desc = o.desc; + size_t start = 0; + size_t end = desc.find('\n'); + while (end != std::string::npos) { + printf("%s\n%34s", desc.substr(start, end - start).c_str(), ""); + start = end + 1; + end = desc.find('\n', start); + } + + printf("%s\n", desc.substr(start).c_str()); + } + printf("\n"); } std::string gpt_params_get_system_info(const gpt_params & params) { @@ -1610,24 +1928,6 @@ std::string string_get_sortable_timestamp() { return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns); } -std::string string_random_prompt(std::mt19937 & rng) { - const int r = rng() % 10; - switch (r) { - case 0: return "So"; - case 1: return "Once upon a time"; - case 2: return "When"; - case 3: return "The"; - case 4: return "After"; - case 5: return "If"; - case 6: return "import"; - case 7: return "He"; - case 8: return "She"; - case 9: return "They"; - } - - GGML_UNREACHABLE(); -} - void string_process_escapes(std::string & input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; @@ -2503,6 +2803,12 @@ bool llama_should_add_bos_token(const llama_model * model) { return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM); } +bool llama_chat_verify_template(const std::string & tmpl) { + llama_chat_message chat[] = {{"user", "test"}}; + int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0); + return res >= 0; +} + // // KV cache utils // @@ -2902,9 +3208,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str()); fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false"); yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str()); - fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false"); fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false"); - fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false"); fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false"); fprintf(stream, "keep: %d # default: 0\n", params.n_keep); fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str()); @@ -2954,7 +3258,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false"); fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false"); yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens); - fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false"); fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat); fprintf(stream, "reverse_prompt:\n"); |
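For reference, the validation added for the new `--chat-template` argument is small enough to read in isolation. This is the helper as introduced by the hunk above: it test-applies the template to a single dummy message and treats a negative return from `llama_chat_apply_template` as "unsupported" (llama.cpp does not ship a Jinja parser, so only the commonly used templates recognized by that function are accepted).

```cpp
#include <string>
#include "llama.h"  // llama_chat_message, llama_chat_apply_template

// Returns true if the supplied chat template is one that llama.cpp knows how to apply.
bool llama_chat_verify_template(const std::string & tmpl) {
    llama_chat_message chat[] = {{"user", "test"}};
    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
    return res >= 0;
}
```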