diff options
Diffstat (limited to 'common')
-rw-r--r-- | common/common.cpp | 75 | ||||
-rw-r--r-- | common/common.h | 99 |
2 files changed, 129 insertions, 45 deletions
diff --git a/common/common.cpp b/common/common.cpp index c8df9a4c..601bd216 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -273,6 +273,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } } catch (const std::invalid_argument & ex) { fprintf(stderr, "%s\n", ex.what()); + params = params_org; return false; } @@ -408,6 +409,20 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } return true; } + if (arg == "--in-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + params.in_files.push_back(argv[i]); + return true; + } if (arg == "-n" || arg == "--predict" || arg == "--n-predict") { if (++i >= argc) { invalid_param = true; @@ -1081,7 +1096,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-v" || arg == "--verbose") { - params.verbose = true; + params.verbosity = 1; + return true; + } + if (arg == "--verbosity") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.verbosity = std::stoi(argv[i]); return true; } if (arg == "--verbose-prompt") { @@ -1537,6 +1560,46 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.i_pos = std::stoi(argv[i]); return true; } + if (arg == "-o" || arg == "--output" || arg == "--output-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.out_file = argv[i]; + return true; + } + if (arg == "-ofreq" || arg == "--output-frequency") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_out_freq = std::stoi(argv[i]); + return true; + } + if (arg == "--save-frequency") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_save_freq = std::stoi(argv[i]); + return true; + } + if (arg == "--process-output") { + params.process_output = true; + return true; + } + if (arg == "--no-ppl") { + params.compute_ppl = false; + return true; + } + if (arg == "--chunk" || arg == "--from-chunk") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.i_chunk = std::stoi(argv[i]); + return true; + } #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters if (log_param_single_parse(argv[i])) { @@ -1612,6 +1675,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-h, --help, --usage", "print usage and exit" }); options.push_back({ "*", " --version", "show version and build info" }); options.push_back({ "*", "-v, --verbose", "print verbose information" }); + options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity }); options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" }); options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" }); options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" }); @@ -1637,6 +1701,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" }); options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() }); options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" }); + options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" }); options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" }); options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" }); options.push_back({ "*", " --no-escape", "do not process escape sequences" }); @@ -1804,6 +1869,14 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk }); options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos }); + options.push_back({ "imatrix" }); + options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() }); + options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq }); + options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq }); + options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" }); + options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" }); + options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk }); + options.push_back({ "bench" }); options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" }); options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" }); diff --git a/common/common.h b/common/common.h index e0a08a61..de6238e2 100644 --- a/common/common.h +++ b/common/common.h @@ -56,43 +56,42 @@ struct gpt_params { uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed int32_t n_threads = cpu_get_num_math(); - int32_t n_threads_draft = -1; - int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) - int32_t n_threads_batch_draft = -1; - int32_t n_predict = -1; // new tokens to predict - int32_t n_ctx = 0; // context size - int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) - int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_draft = 5; // number of tokens to draft during speculative decoding - int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) - int32_t n_parallel = 1; // number of parallel sequences to decode - int32_t n_sequences = 1; // number of sequences to decode - float p_split = 0.1f; // speculative decoding split probability - int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) - int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) - llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs - int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors - float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs - int32_t n_beams = 0; // if non-zero then use beam search of given width. - int32_t grp_attn_n = 1; // group-attention factor - int32_t grp_attn_w = 512; // group-attention width - int32_t n_print = -1; // print token count every n tokens (-1 = disabled) - float rope_freq_base = 0.0f; // RoPE base frequency - float rope_freq_scale = 0.0f; // RoPE frequency scaling factor + int32_t n_threads_draft = -1; + int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) + int32_t n_threads_batch_draft = -1; + int32_t n_predict = -1; // new tokens to predict + int32_t n_ctx = 0; // context size + int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_draft = 5; // number of tokens to draft during speculative decoding + int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) + int32_t n_parallel = 1; // number of parallel sequences to decode + int32_t n_sequences = 1; // number of sequences to decode + float p_split = 0.1f; // speculative decoding split probability + int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) + int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs + int32_t n_beams = 0; // if non-zero then use beam search of given width. + int32_t grp_attn_n = 1; // group-attention factor + int32_t grp_attn_w = 512; // group-attention width + int32_t n_print = -1; // print token count every n tokens (-1 = disabled) + float rope_freq_base = 0.0f; // RoPE base frequency + float rope_freq_scale = 0.0f; // RoPE frequency scaling factor float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor - float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor + float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor float yarn_beta_fast = 32.0f; // YaRN low correction dim - float yarn_beta_slow = 1.0f; // YaRN high correction dim - int32_t yarn_orig_ctx = 0; // YaRN original context length + float yarn_beta_slow = 1.0f; // YaRN high correction dim + int32_t yarn_orig_ctx = 0; // YaRN original context length float defrag_thold = -1.0f; // KV cache defragmentation threshold - std::string rpc_servers = ""; // comma separated list of RPC servers ggml_backend_sched_eval_callback cb_eval = nullptr; void * cb_eval_user_data = nullptr; ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; + enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings @@ -114,7 +113,9 @@ struct gpt_params { std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding std::string logits_file = ""; // file for saving *all* logits + std::string rpc_servers = ""; // comma separated list of RPC servers + std::vector<std::string> in_files; // all input files std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) std::vector<llama_model_kv_override> kv_overrides; @@ -124,23 +125,24 @@ struct gpt_params { std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale + int32_t verbosity = 0; int32_t control_vector_layer_start = -1; // layer range for control vector int32_t control_vector_layer_end = -1; // layer range for control vector - int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. - int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line - // (which is more convenient to use for plotting) - // - bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt - size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score + int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. + int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line + // (which is more convenient to use for plotting) + // + bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt + size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score - bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt - size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed + bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt + size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed - bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt - size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed + bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt + size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed - bool kl_divergence = false; // compute KL divergence + bool kl_divergence = false; // compute KL divergence bool usage = false; // print usage bool use_color = false; // use color to distinguish generations and inputs @@ -163,7 +165,6 @@ struct gpt_params { bool logits_all = false; // return logits for all tokens in the batch bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory - bool verbose = false; bool verbose_prompt = false; // print prompt tokens before generation bool display_prompt = true; // print prompt before generation bool infill = false; // use infill mode @@ -180,10 +181,10 @@ struct gpt_params { std::vector<std::string> image; // path to image file(s) // server params - int32_t port = 8080; - int32_t timeout_read = 600; - int32_t timeout_write = timeout_read; - int32_t n_threads_http = -1; + int32_t port = 8080; // server listens on this network port + int32_t timeout_read = 600; // http read timeout in seconds + int32_t timeout_write = timeout_read; // http write timeout in seconds + int32_t n_threads_http = -1; // number of threads to use for http server (-1 = use n_threads) std::string hostname = "127.0.0.1"; std::string public_path = ""; @@ -219,6 +220,16 @@ struct gpt_params { // passkey params int32_t n_junk = 250; // number of times to repeat the junk text int32_t i_pos = -1; // position of the passkey in the junk text + + // imatrix params + std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file + + int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations + int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations + int32_t i_chunk = 0; // start processing from this chunk + + bool process_output = false; // collect data for the output tensor + bool compute_ppl = true; // whether to compute perplexity }; void gpt_params_handle_model_default(gpt_params & params); |