Diffstat (limited to 'common/common.cpp')
-rw-r--r--  common/common.cpp | 1190
1 file changed, 479 insertions(+), 711 deletions(-)
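The dominant change in the diff below collapses the repeated `if (++i >= argc) { invalid_param = true; return true; }` bounds check into a single `CHECK_ARG` macro. A minimal, self-contained sketch of the same pattern follows; everything except the `CHECK_ARG` macro itself (the `find_arg` helper, the option set) is illustrative and not taken from the diff:

```cpp
#include <cstdio>
#include <string>

// Same shape as the macro introduced in this diff: advance to the option's
// value and bail out of the matcher if the command line ends early.
#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }

// Illustrative stand-in for gpt_params_find_arg(): returns true when `arg`
// was recognized, and sets `invalid_param` when its value is missing.
static bool find_arg(int argc, char ** argv, const std::string & arg,
                     std::string & model, int & threads,
                     int & i, bool & invalid_param) {
    if (arg == "-m" || arg == "--model") {
        CHECK_ARG
        model = argv[i];
        return true;
    }
    if (arg == "-t" || arg == "--threads") {
        CHECK_ARG
        threads = std::stoi(argv[i]);
        return true;
    }
    return false; // unknown option
}

int main(int argc, char ** argv) {
    std::string model;
    int threads   = 4;
    bool invalid_param = false;
    for (int i = 1; i < argc; i++) {
        if (!find_arg(argc, argv, argv[i], model, threads, i, invalid_param)) {
            fprintf(stderr, "unknown option: %s\n", argv[i]);
            return 1;
        }
        if (invalid_param) {
            fprintf(stderr, "missing value for: %s\n", argv[i - 1]);
            return 1;
        }
    }
    printf("model=%s threads=%d\n", model.c_str(), threads);
    return 0;
}
```

Note that the macro deliberately returns `true` (option recognized) while flagging `invalid_param`, so the caller can distinguish "unknown option" from "known option with a missing value" — the same split the diff's parser relies on.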
diff --git a/common/common.cpp b/common/common.cpp index 8eb23ade..4d1d88c6 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1,3 +1,7 @@ +#if defined(_MSC_VER) +#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING +#endif + #include "common.h" // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT @@ -190,6 +194,12 @@ int32_t cpu_get_num_math() { // CLI argument parsing // +void gpt_params_handle_hf_token(gpt_params & params) { + if (params.hf_token.empty() && std::getenv("HF_TOKEN")) { + params.hf_token = std::getenv("HF_TOKEN"); + } +} + void gpt_params_handle_model_default(gpt_params & params) { if (!params.hf_repo.empty()) { // short-hand to avoid specifying --hf-file -> default it to --model @@ -237,6 +247,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { gpt_params_handle_model_default(params); + gpt_params_handle_hf_token(params); + if (params.escape) { string_process_escapes(params.prompt); string_process_escapes(params.input_prefix); @@ -273,26 +285,22 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return true; } +#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; } + bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { const char split_delim = ','; llama_sampling_params & sparams = params.sparams; if (arg == "-s" || arg == "--seed") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context. params.seed = std::stoul(argv[i]); sparams.seed = std::stoul(argv[i]); return true; } if (arg == "-t" || arg == "--threads") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_threads = std::stoi(argv[i]); if (params.n_threads <= 0) { params.n_threads = std::thread::hardware_concurrency(); @@ -300,10 +308,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-tb" || arg == "--threads-batch") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_threads_batch = std::stoi(argv[i]); if (params.n_threads_batch <= 0) { params.n_threads_batch = std::thread::hardware_concurrency(); @@ -311,10 +316,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-td" || arg == "--threads-draft") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_threads_draft = std::stoi(argv[i]); if (params.n_threads_draft <= 0) { params.n_threads_draft = std::thread::hardware_concurrency(); @@ -322,10 +324,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-tbd" || arg == "--threads-batch-draft") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_threads_batch_draft = std::stoi(argv[i]); if (params.n_threads_batch_draft <= 0) { params.n_threads_batch_draft = std::thread::hardware_concurrency(); @@ -333,10 +332,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-p" || arg == "--prompt") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.prompt = argv[i]; return true; } @@ -349,10 +345,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == 
"--prompt-cache") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.path_prompt_cache = argv[i]; return true; } @@ -365,10 +358,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-bf" || arg == "--binary-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i], std::ios::binary); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -384,10 +374,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-f" || arg == "--file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i]); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -403,10 +390,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--in-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i]); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -417,66 +401,42 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-n" || arg == "--predict" || arg == "--n-predict") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_predict = std::stoi(argv[i]); return true; } if (arg == "--top-k") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.top_k = std::stoi(argv[i]); return true; } if (arg == "-c" || arg == "--ctx-size") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_ctx = std::stoi(argv[i]); return true; } if (arg == "--grp-attn-n" || arg == "-gan") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.grp_attn_n = std::stoi(argv[i]); return true; } if (arg == "--grp-attn-w" || arg == "-gaw") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.grp_attn_w = std::stoi(argv[i]); return true; } if (arg == "--rope-freq-base") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.rope_freq_base = std::stof(argv[i]); return true; } if (arg == "--rope-freq-scale") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.rope_freq_scale = std::stof(argv[i]); return true; } if (arg == "--rope-scaling") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::string value(argv[i]); /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } @@ -485,58 +445,37 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--rope-scale") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.rope_freq_scale = 1.0f / std::stof(argv[i]); return true; } if (arg == "--yarn-orig-ctx") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.yarn_orig_ctx = std::stoi(argv[i]); return true; } if (arg == "--yarn-ext-factor") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.yarn_ext_factor = std::stof(argv[i]); return true; } if (arg == "--yarn-attn-factor") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG 
params.yarn_attn_factor = std::stof(argv[i]); return true; } if (arg == "--yarn-beta-fast") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.yarn_beta_fast = std::stof(argv[i]); return true; } if (arg == "--yarn-beta-slow") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.yarn_beta_slow = std::stof(argv[i]); return true; } if (arg == "--pooling") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::string value(argv[i]); /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } @@ -545,158 +484,109 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa else { invalid_param = true; } return true; } + if (arg == "--attention") { + CHECK_ARG + std::string value(argv[i]); + /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } + else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } + else { invalid_param = true; } + return true; + } if (arg == "--defrag-thold" || arg == "-dt") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.defrag_thold = std::stof(argv[i]); return true; } if (arg == "--samplers") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG const auto sampler_names = string_split(argv[i], ';'); sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true); return true; } if (arg == "--sampling-seq") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]); return true; } if (arg == "--top-p") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.top_p = std::stof(argv[i]); return true; } if (arg == "--min-p") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.min_p = std::stof(argv[i]); return true; } if (arg == "--temp") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.temp = std::stof(argv[i]); sparams.temp = std::max(sparams.temp, 0.0f); return true; } if (arg == "--tfs") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.tfs_z = std::stof(argv[i]); return true; } if (arg == "--typical") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.typical_p = std::stof(argv[i]); return true; } if (arg == "--repeat-last-n") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.penalty_last_n = std::stoi(argv[i]); sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); return true; } if (arg == "--repeat-penalty") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.penalty_repeat = std::stof(argv[i]); return true; } if (arg == "--frequency-penalty") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.penalty_freq = std::stof(argv[i]); return true; } if (arg == "--presence-penalty") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.penalty_present = std::stof(argv[i]); return true; } if (arg == "--dynatemp-range") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.dynatemp_range = std::stof(argv[i]); return true; } if (arg == "--dynatemp-exp") { - if (++i >= argc) { - invalid_param = 
true; - return true; - } + CHECK_ARG sparams.dynatemp_exponent = std::stof(argv[i]); return true; } if (arg == "--mirostat") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.mirostat = std::stoi(argv[i]); return true; } if (arg == "--mirostat-lr") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.mirostat_eta = std::stof(argv[i]); return true; } if (arg == "--mirostat-ent") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.mirostat_tau = std::stof(argv[i]); return true; } if (arg == "--cfg-negative-prompt") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.cfg_negative_prompt = argv[i]; return true; } if (arg == "--cfg-negative-prompt-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i]); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -710,203 +600,126 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--cfg-scale") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.cfg_scale = std::stof(argv[i]); return true; } if (arg == "-b" || arg == "--batch-size") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_batch = std::stoi(argv[i]); return true; } if (arg == "-ub" || arg == "--ubatch-size") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_ubatch = std::stoi(argv[i]); return true; } if (arg == "--keep") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_keep = std::stoi(argv[i]); return true; } if (arg == "--draft") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_draft = std::stoi(argv[i]); return true; } if (arg == "--chunks") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_chunks = std::stoi(argv[i]); return true; } if (arg == "-np" || arg == "--parallel") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_parallel = std::stoi(argv[i]); return true; } if (arg == "-ns" || arg == "--sequences") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_sequences = std::stoi(argv[i]); return true; } if (arg == "--p-split" || arg == "-ps") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.p_split = std::stof(argv[i]); return true; } if (arg == "-m" || arg == "--model") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.model = argv[i]; return true; } if (arg == "-md" || arg == "--model-draft") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.model_draft = argv[i]; return true; } if (arg == "-a" || arg == "--alias") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.model_alias = argv[i]; return true; } if (arg == "-mu" || arg == "--model-url") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.model_url = argv[i]; return true; } - if (arg == "-hfr" || arg == "--hf-repo") { + if (arg == "-hft" || arg == "--hf-token") { if (++i >= argc) { - invalid_param = true; - return true; + invalid_param = true; + return true; } + params.hf_token = argv[i]; + return true; + } + if (arg == "-hfr" || arg == "--hf-repo") { + CHECK_ARG params.hf_repo = argv[i]; return 
true; } if (arg == "-hff" || arg == "--hf-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.hf_file = argv[i]; return true; } if (arg == "--lora") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.lora_adapter.emplace_back(argv[i], 1.0f); - params.use_mmap = false; return true; } if (arg == "--lora-scaled") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG const char* lora_adapter = argv[i]; - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); - params.use_mmap = false; - return true; - } - if (arg == "--lora-base") { - if (++i >= argc) { - invalid_param = true; - return true; - } - params.lora_base = argv[i]; return true; } if (arg == "--control-vector") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.control_vectors.push_back({ 1.0f, argv[i], }); return true; } if (arg == "--control-vector-scaled") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG const char* fname = argv[i]; - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.control_vectors.push_back({ std::stof(argv[i]), fname, }); return true; } if (arg == "--control-vector-layer-range") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.control_vector_layer_start = std::stoi(argv[i]); - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.control_vector_layer_end = std::stoi(argv[i]); return true; } if (arg == "--mmproj") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.mmproj = argv[i]; return true; } if (arg == "--image") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.image.emplace_back(argv[i]); return true; } @@ -922,6 +735,21 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.embedding = true; return true; } + if (arg == "--embd-normalize") { + CHECK_ARG + params.embd_normalize = std::stoi(argv[i]); + return true; + } + if (arg == "--embd-output-format") { + CHECK_ARG + params.embd_out = argv[i]; + return true; + } + if (arg == "--embd-separator") { + CHECK_ARG + params.embd_sep = argv[i]; + return true; + } if (arg == "-if" || arg == "--interactive-first") { params.interactive_first = true; return true; @@ -950,7 +778,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.cache_type_v = argv[++i]; return true; } - if (arg == "--multiline-input") { + if (arg == "-mli" || arg == "--multiline-input") { params.multiline_input = true; return true; } @@ -962,6 +790,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.cont_batching = true; return true; } + if (arg == "-nocb" || arg == "--no-cont-batching") { + params.cont_batching = false; + return true; + } if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; return true; @@ -975,10 +807,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_gpu_layers = std::stoi(argv[i]); if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); @@ -987,10 
+816,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_gpu_layers_draft = std::stoi(argv[i]); if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); @@ -999,10 +825,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--main-gpu" || arg == "-mg") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.main_gpu = std::stoi(argv[i]); #ifndef GGML_USE_CUDA_SYCL_VULKAN fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n"); @@ -1010,10 +833,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--split-mode" || arg == "-sm") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::string arg_next = argv[i]; if (arg_next == "none") { params.split_mode = LLAMA_SPLIT_MODE_NONE; @@ -1038,10 +858,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--tensor-split" || arg == "-ts") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::string arg_next = argv[i]; // split string by , and / @@ -1066,10 +883,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--rpc") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.rpc_servers = argv[i]; return true; } @@ -1078,10 +892,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--numa") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::string value(argv[i]); /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } @@ -1094,10 +905,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--verbosity") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.verbosity = std::stoi(argv[i]); return true; } @@ -1110,18 +918,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-r" || arg == "--reverse-prompt") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.antiprompt.emplace_back(argv[i]); return true; } if (arg == "-ld" || arg == "--logdir") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.logdir = argv[i]; if (params.logdir.back() != DIRECTORY_SEPARATOR) { @@ -1130,26 +932,17 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-lcs" || arg == "--lookup-cache-static") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.lookup_cache_static = argv[i]; return true; } if (arg == "-lcd" || arg == "--lookup-cache-dynamic") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.lookup_cache_dynamic = argv[i]; return true; } if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { - if (++i >= argc) { - 
invalid_param = true; - return true; - } + CHECK_ARG params.logits_file = argv[i]; return true; } @@ -1158,26 +951,17 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--ppl-stride") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.ppl_stride = std::stoi(argv[i]); return true; } if (arg == "--ppl-output-type") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.ppl_output_type = std::stoi(argv[i]); return true; } if (arg == "-ptc" || arg == "--print-token-count") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_print = std::stoi(argv[i]); return true; } @@ -1190,10 +974,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--hellaswag-tasks") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.hellaswag_tasks = std::stoi(argv[i]); return true; } @@ -1202,10 +983,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--winogrande-tasks") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.winogrande_tasks = std::stoi(argv[i]); return true; } @@ -1214,10 +992,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--multiple-choice-tasks") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.multiple_choice_tasks = std::stoi(argv[i]); return true; } @@ -1234,10 +1009,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-l" || arg == "--logit-bias") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::stringstream ss(argv[i]); llama_token key; char sign; @@ -1267,37 +1039,32 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--in-prefix-bos") { params.input_prefix_bos = true; + params.enable_chat_template = false; return true; } if (arg == "--in-prefix") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.input_prefix = argv[i]; + params.enable_chat_template = false; return true; } if (arg == "--in-suffix") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.input_suffix = argv[i]; + params.enable_chat_template = false; + return true; + } + if (arg == "--spm-infill") { + params.spm_infill = true; return true; } if (arg == "--grammar") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.grammar = argv[i]; return true; } if (arg == "--grammar-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i]); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -1312,18 +1079,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-j" || arg == "--json-schema") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.grammar = json_schema_to_grammar(json::parse(argv[i])); return true; } if (arg == "--override-kv") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG if (!string_parse_kv_override(argv[i], params.kv_overrides)) { fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); invalid_param = true; @@ -1332,42 +1093,27 @@ bool 
gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--host") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.hostname = argv[i]; return true; } if (arg == "--port") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.port = std::stoi(argv[i]); return true; } if (arg == "--path") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.public_path = argv[i]; return true; } if (arg == "--api-key") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.api_keys.push_back(argv[i]); return true; } if (arg == "--api-key-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream key_file(argv[i]); if (!key_file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -1384,43 +1130,28 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--ssl-key-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.ssl_file_key = argv[i]; return true; } if (arg == "--ssl-cert-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.ssl_file_cert = argv[i]; return true; } if (arg == "--timeout" || arg == "-to") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.timeout_read = std::stoi(argv[i]); params.timeout_write = std::stoi(argv[i]); return true; } if (arg == "--threads-http") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_threads_http = std::stoi(argv[i]); return true; } if (arg == "-spf" || arg == "--system-prompt-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i]); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -1437,10 +1168,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--log-format") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG if (std::strcmp(argv[i], "json") == 0) { params.log_json = true; } else if (std::strcmp(argv[i], "text") == 0) { @@ -1460,10 +1188,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--slot-save-path") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.slot_save_path = argv[i]; // if doesn't end with DIRECTORY_SEPARATOR, add it if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { @@ -1472,10 +1197,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--chat-template") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG if (!llama_chat_verify_template(argv[i])) { fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]); fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n"); @@ -1486,10 +1208,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--slot-prompt-similarity" || arg == "-sps") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.slot_prompt_similarity = std::stof(argv[i]); return true; } @@ -1498,37 +1217,25 @@ bool gpt_params_find_arg(int argc, 
char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-npp") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG auto p = string_split<int>(argv[i], split_delim); params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); return true; } if (arg == "-ntg") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG auto p = string_split<int>(argv[i], split_delim); params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); return true; } if (arg == "-npl") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG auto p = string_split<int>(argv[i], split_delim); params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); return true; } if (arg == "--context-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i], std::ios::binary); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -1539,59 +1246,39 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--chunk-size") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.chunk_size = std::stoi(argv[i]); return true; } if (arg == "--chunk-separator") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.chunk_separator = argv[i]; return true; } if (arg == "--junk") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_junk = std::stoi(argv[i]); return true; } if (arg == "--pos") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.i_pos = std::stoi(argv[i]); return true; } if (arg == "-o" || arg == "--output" || arg == "--output-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.out_file = argv[i]; params.cvector_outfile = argv[i]; + params.lora_outfile = argv[i]; return true; } if (arg == "-ofreq" || arg == "--output-frequency") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_out_freq = std::stoi(argv[i]); return true; } if (arg == "--save-frequency") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_save_freq = std::stoi(argv[i]); return true; } @@ -1612,62 +1299,39 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--chunk" || arg == "--from-chunk") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.i_chunk = std::stoi(argv[i]); return true; } // cvector params - if (arg == "--completions-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } - params.cvector_completions_file = argv[i]; - return true; - } if (arg == "--positive-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.cvector_positive_file = argv[i]; return true; } if (arg == "--negative-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.cvector_negative_file = argv[i]; return true; } - if (arg == "--completions") { - if (++i >= argc) { - invalid_param = true; - return true; - } - params.n_completions = std::stoi(argv[i]); - return true; - } if (arg == "--pca-batch") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_pca_batch = std::stoi(argv[i]); return true; } if (arg == "--pca-iter") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_pca_iterations = 
std::stoi(argv[i]); return true; } + if (arg == "--method") { + CHECK_ARG + std::string value(argv[i]); + /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } + else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } + else { invalid_param = true; } + return true; + } #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters if (log_param_single_parse(argv[i])) { @@ -1679,10 +1343,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa // We have a matching known parameter requiring an argument, // now we need to check if there is anything after this argv // and flag invalid_param or parse it. - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) { invalid_param = true; return true; @@ -1767,7 +1428,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep }); options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks }); options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" }); - options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() }); + options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n" + "in conversation mode, this will be used as system prompt\n" + "(default: '%s')", params.prompt.c_str() }); options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" }); options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" }); options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" }); @@ -1782,13 +1445,17 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param "halt generation at PROMPT, return control in interactive mode\n" "can be specified more than once for multiple prompts" }); options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" }); - options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" }); + options.push_back({ "main", "-cnv, --conversation", "run in conversation mode, does not print special tokens and suffix/prefix\n" + "if suffix/prefix are not specified, default chat template will be used\n" + "(default: %s)", params.conversation ? "true" : "false" }); options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" }); options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? 
"true" : "false" }); options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" }); options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" }); options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" }); options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" }); + options.push_back({ "server infill", + " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" }); options.push_back({ "sampling" }); options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n" @@ -1822,7 +1489,11 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "main", " --cfg-negative-prompt-file FNAME", "negative prompt file to use for guidance" }); options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale }); - + options.push_back({ "main", " --chat-template JINJA_TEMPLATE", + "set custom jinja chat template (default: template taken from model's metadata)\n" + "if suffix/prefix are specified, template will be disabled\n" + "only commonly used templates are accepted:\n" + "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); options.push_back({ "grammar" }); options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() }); options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" }); @@ -1831,8 +1502,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" }); options.push_back({ "embedding" }); - options.push_back({ "embedding", " --pooling {none,mean,cls}", + options.push_back({ "embedding", " --pooling {none,mean,cls,last}", "pooling type for embeddings, use model default if unspecified" }); + options.push_back({ "embedding", " --attention {causal,non-causal}", + "attention type for embeddings, use model default if unspecified" }); options.push_back({ "context hacking" }); options.push_back({ "*", " --rope-scaling {none,linear,yarn}", @@ -1871,6 +1544,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel }); options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences }); options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" }); + options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" }); options.push_back({ "multi-modality" }); options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. 
see examples/llava/README.md" }); @@ -1913,12 +1587,13 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", " --override-kv KEY=TYPE:VALUE", "advanced option to override model metadata by key. may be specified multiple times.\n" "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" }); - options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" }); - options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" }); - options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" }); - options.push_back({ "*", " --control-vector FNAME", "add a control vector" }); + options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" }); + options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); + options.push_back({ "*", " --control-vector FNAME", "add a control vector\n" + "note: this argument can be repeated to add multiple control vectors" }); options.push_back({ "*", " --control-vector-scaled FNAME SCALE", - "add a control vector with user defined scaling SCALE" }); + "add a control vector with user defined scaling SCALE\n" + "note: this argument can be repeated to add multiple scaled control vectors" }); options.push_back({ "*", " --control-vector-layer-range START END", "layer range to apply the control vector(s) to, start and end inclusive" }); options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n" @@ -1927,6 +1602,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" }); options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" }); options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" }); + options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" }); options.push_back({ "retrieval" }); options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" }); @@ -1952,6 +1628,11 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" }); options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" }); + options.push_back({ "embedding" }); + options.push_back({ "embedding", " --embd-normalize", "normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize }); + options.push_back({ "embedding", " --embd-output-format", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix" }); + options.push_back({ "embedding", " --embd-separator", "separator of embendings (default \\n) for example \"<#sep#>\"" }); + options.push_back({ "server" }); options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() }); options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port }); @@ -1994,11 +1675,16 @@ void 
gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() }); options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() }); options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() }); - options.push_back({ "cvector", " --completions-file FNAME", - "completions file (default: '%s')", params.cvector_completions_file.c_str() }); - options.push_back({ "cvector", " --completions N", "number of lines of completions file to use (default: %d)", params.n_completions }); - options.push_back({ "cvector", " --batch-pca N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch }); - options.push_back({ "cvector", " --iter-pca N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); + options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch }); + options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); + options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" }); + + options.push_back({ "export-lora" }); + options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() }); + options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" }); + options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); + options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads }); + options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() }); printf("usage: %s [options]\n", argv[0]); @@ -2363,9 +2049,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par llama_model * model = nullptr; if (!params.hf_repo.empty() && !params.hf_file.empty()) { - model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams); + model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams); } else if (!params.model_url.empty()) { - model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams); + model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams); } else { model = llama_load_model_from_file(params.model.c_str(), mparams); } @@ -2411,19 +2097,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); - int err = llama_model_apply_lora_from_file(model, - lora_adapter.c_str(), - lora_scale, - ((i > 0) || params.lora_base.empty()) - ? 
NULL - : params.lora_base.c_str(), - params.n_threads); - if (err != 0) { + auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str()); + if (adapter == nullptr) { fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); llama_free(lctx); llama_free_model(model); return std::make_tuple(nullptr, nullptr); } + llama_lora_adapter_set(lctx, adapter, lora_scale); } if (params.ignore_eos) { @@ -2433,7 +2114,24 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par if (params.warmup) { LOG("warming up the model with an empty run\n"); - std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), }; + std::vector<llama_token> tmp; + llama_token bos = llama_token_bos(model); + llama_token eos = llama_token_eos(model); + // some models (e.g. T5) don't have a BOS token + if (bos != -1) { + tmp.push_back(bos); + } + tmp.push_back(eos); + + if (llama_model_has_encoder(model)) { + llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0)); + llama_token decoder_start_token_id = llama_model_decoder_start_token(model); + if (decoder_start_token_id == -1) { + decoder_start_token_id = bos; + } + tmp.clear(); + tmp.push_back(decoder_start_token_id); + } llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); llama_kv_cache_clear(lctx); llama_synchronize(lctx); @@ -2516,6 +2214,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.yarn_orig_ctx = params.yarn_orig_ctx; cparams.pooling_type = params.pooling_type; + cparams.attention_type = params.attention_type; cparams.defrag_thold = params.defrag_thold; cparams.cb_eval = params.cb_eval; cparams.cb_eval_user_data = params.cb_eval_user_data; @@ -2535,7 +2234,7 @@ static bool starts_with(const std::string & str, const std::string & prefix) { return str.rfind(prefix, 0) == 0; } -static bool llama_download_file(const std::string & url, const std::string & path) { +static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) { // Initialize libcurl std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup); @@ -2550,6 +2249,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L); + // Check if hf-token or bearer-token was specified + if (!hf_token.empty()) { + std::string auth_header = "Authorization: Bearer "; + auth_header += hf_token.c_str(); + struct curl_slist *http_headers = NULL; + http_headers = curl_slist_append(http_headers, auth_header.c_str()); + curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers); + } + #if defined(_WIN32) // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of // operating system. Currently implemented under MS-Windows. 
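For the hunk above that threads `hf_token` into `llama_download_file`: a standalone sketch of the same libcurl technique, attaching an `Authorization: Bearer` header only when a token is present. The URL and output path are placeholders, and unlike the diff this sketch frees the header list after the transfer:

```cpp
#include <cstdio>
#include <cstdlib>
#include <string>
#include <curl/curl.h>

static size_t write_cb(char * data, size_t size, size_t nmemb, void * userp) {
    return fwrite(data, size, nmemb, static_cast<FILE *>(userp));
}

// Download `url` to `path`, sending a Bearer token when one is provided.
static bool download(const std::string & url, const std::string & path,
                     const std::string & hf_token) {
    CURL * curl = curl_easy_init();
    if (!curl) return false;

    FILE * out = fopen(path.c_str(), "wb");
    if (!out) { curl_easy_cleanup(curl); return false; }

    struct curl_slist * headers = nullptr;
    if (!hf_token.empty()) {
        // Same header the diff builds: "Authorization: Bearer <token>"
        const std::string auth = "Authorization: Bearer " + hf_token;
        headers = curl_slist_append(headers, auth.c_str());
        curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
    }

    curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, out);

    const CURLcode res = curl_easy_perform(curl);

    curl_slist_free_all(headers); // no-op on nullptr
    curl_easy_cleanup(curl);
    fclose(out);
    return res == CURLE_OK;
}

int main() {
    curl_global_init(CURL_GLOBAL_DEFAULT);
    // Placeholder URL/path; a real token would come from --hf-token or HF_TOKEN.
    const char * tok = std::getenv("HF_TOKEN");
    const bool ok = download("https://example.com/model.gguf", "model.gguf",
                             tok ? tok : "");
    curl_global_cleanup();
    return ok ? 0 : 1;
}
```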
@@ -2745,6 +2453,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat struct llama_model * llama_load_model_from_url( const char * model_url, const char * path_model, + const char * hf_token, const struct llama_model_params & params) { // Basic validation of the model_url if (!model_url || strlen(model_url) == 0) { @@ -2752,7 +2461,7 @@ struct llama_model * llama_load_model_from_url( return NULL; } - if (!llama_download_file(model_url, path_model)) { + if (!llama_download_file(model_url, path_model, hf_token)) { return NULL; } @@ -2800,14 +2509,14 @@ struct llama_model * llama_load_model_from_url( // Prepare download in parallel std::vector<std::future<bool>> futures_download; for (int idx = 1; idx < n_split; idx++) { - futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool { + futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool { char split_path[PATH_MAX] = {0}; llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split); char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0}; llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split); - return llama_download_file(split_url, split_path); + return llama_download_file(split_url, split_path, hf_token); }, idx)); } @@ -2826,6 +2535,7 @@ struct llama_model * llama_load_model_from_hf( const char * repo, const char * model, const char * path_model, + const char * hf_token, const struct llama_model_params & params) { // construct hugging face model url: // @@ -2841,7 +2551,7 @@ struct llama_model * llama_load_model_from_hf( model_url += "/resolve/main/"; model_url += model; - return llama_load_model_from_url(model_url.c_str(), path_model, params); + return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params); } #else @@ -2849,6 +2559,7 @@ struct llama_model * llama_load_model_from_hf( struct llama_model * llama_load_model_from_url( const char * /*model_url*/, const char * /*path_model*/, + const char * /*hf_token*/, const struct llama_model_params & /*params*/) { fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); return nullptr; @@ -2858,6 +2569,7 @@ struct llama_model * llama_load_model_from_hf( const char * /*repo*/, const char * /*model*/, const char * /*path_model*/, + const char * /*hf_token*/, const struct llama_model_params & /*params*/) { fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); return nullptr; @@ -2922,51 +2634,35 @@ std::vector<llama_token> llama_tokenize( } std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { - std::vector<char> result(8, 0); - const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special); - if (n_tokens < 0) { - result.resize(-n_tokens); - int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special); - GGML_ASSERT(check == -n_tokens); - } else { - result.resize(n_tokens); - } - - return std::string(result.data(), result.size()); -} - -std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) { - const llama_token bos_id = llama_token_bos(llama_get_model(ctx)); - std::string piece; - std::string result; - - for (size_t i = 0; i < tokens.size(); ++i) { - piece = 
llama_token_to_piece(ctx, tokens[i]); - - // remove the leading space of the first non-BOS token - if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') { - piece = piece.substr(1); - } - - result += piece; + piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n' + const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special); + if (n_chars < 0) { + piece.resize(-n_chars); + int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special); + GGML_ASSERT(check == -n_chars); + } + else { + piece.resize(n_chars); } - return result; + return piece; } -std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) { - std::string piece; - std::string result; - - for (size_t i = 0; i < tokens.size(); ++i) { - piece = llama_token_to_piece(ctx, tokens[i]); - - result += piece; +std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) { + std::string text; + text.resize(std::max(text.capacity(), tokens.size())); + int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); + if (n_chars < 0) { + text.resize(-n_chars); + n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); + GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization } + text.resize(n_chars); + // NOTE: the original tokenizer decodes bytes after collecting the pieces. - return result; + return text; } bool llama_should_add_bos_token(const llama_model * model) { @@ -2975,12 +2671,91 @@ bool llama_should_add_bos_token(const llama_model * model) { return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM); } +// +// Chat template utils +// + bool llama_chat_verify_template(const std::string & tmpl) { llama_chat_message chat[] = {{"user", "test"}}; int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0); return res >= 0; } +std::string llama_chat_apply_template(const struct llama_model * model, + const std::string & tmpl, + const std::vector<llama_chat_msg> & msgs, + bool add_ass) { + int alloc_size = 0; + bool fallback = false; // indicate if we must fallback to default chatml + std::vector<llama_chat_message> chat; + for (auto & msg : msgs) { + chat.push_back({msg.role.c_str(), msg.content.c_str()}); + alloc_size += (msg.role.size() + msg.content.size()) * 1.25; + } + + const char * ptr_tmpl = tmpl.empty() ? 
nullptr : tmpl.c_str(); + std::vector<char> buf(alloc_size); + + // run the first time to get the total output length + int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + + // error: chat template is not supported + if (res < 0) { + if (ptr_tmpl != nullptr) { + // if the custom "tmpl" is not supported, we throw an error + // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template() + throw std::runtime_error("this custom template is not supported"); + } else { + // If the built-in template is not supported, we default to chatml + res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + fallback = true; + } + } + + // if it turns out that our buffer is too small, we resize it + if ((size_t) res > buf.size()) { + buf.resize(res); + res = llama_chat_apply_template( + fallback ? nullptr : model, + fallback ? "chatml" : ptr_tmpl, + chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + } + + std::string formatted_chat(buf.data(), res); + return formatted_chat; +} + +std::string llama_chat_format_single(const struct llama_model * model, + const std::string & tmpl, + const std::vector<llama_chat_msg> & past_msg, + const llama_chat_msg & new_msg, + bool add_ass) { + std::ostringstream ss; + auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false); + std::vector<llama_chat_msg> chat_new(past_msg); + // if the past_msg ends with a newline, we must preserve it in the formatted version + if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') { + ss << "\n"; + }; + // format chat with new_msg + chat_new.push_back(new_msg); + auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass); + // get the diff part + ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size()); + return ss.str(); +} + +std::string llama_chat_format_example(const struct llama_model * model, + const std::string & tmpl) { + std::vector<llama_chat_msg> msgs = { + {"system", "You are a helpful assistant"}, + {"user", "Hello"}, + {"assistant", "Hi there"}, + {"user", "How are you?"}, + }; + return llama_chat_apply_template(model, tmpl, msgs, true); +} + // // KV cache utils // @@ -3060,14 +2835,34 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz // Embedding utils // -void llama_embd_normalize(const float * inp, float * out, int n) { +void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) { double sum = 0.0; - for (int i = 0; i < n; i++) { - sum += inp[i] * inp[i]; + + switch (embd_norm) { + case -1: // no normalisation + sum = 1.0; + break; + case 0: // max absolute + for (int i = 0; i < n; i++) { + if (sum < std::abs(inp[i])) sum = std::abs(inp[i]); + } + sum /= 32760.0; // make an int16 range + break; + case 2: // euclidean + for (int i = 0; i < n; i++) { + sum += inp[i] * inp[i]; + } + sum = std::sqrt(sum); + break; + default: // p-norm (euclidean is p-norm p=2) + for (int i = 0; i < n; i++) { + sum += std::pow(std::abs(inp[i]), embd_norm); + } + sum = std::pow(sum, 1.0 / embd_norm); + break; } - sum = sqrt(sum); - const float norm = sum > 0.0 ? 1.0f / sum : 0.0f; + const float norm = sum > 0.0 ? 
@@ -3060,14 +2835,34 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size)
 // Embedding utils
 //
 
-void llama_embd_normalize(const float * inp, float * out, int n) {
+void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
     double sum = 0.0;
-    for (int i = 0; i < n; i++) {
-        sum += inp[i] * inp[i];
+
+    switch (embd_norm) {
+        case -1: // no normalisation
+            sum = 1.0;
+            break;
+        case 0: // max absolute
+            for (int i = 0; i < n; i++) {
+                if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
+            }
+            sum /= 32760.0; // make an int16 range
+            break;
+        case 2: // euclidean
+            for (int i = 0; i < n; i++) {
+                sum += inp[i] * inp[i];
+            }
+            sum = std::sqrt(sum);
+            break;
+        default: // p-norm (euclidean is p-norm p=2)
+            for (int i = 0; i < n; i++) {
+                sum += std::pow(std::abs(inp[i]), embd_norm);
+            }
+            sum = std::pow(sum, 1.0 / embd_norm);
+            break;
     }
-    sum = sqrt(sum);
 
-    const float norm = sum > 0.0 ? 1.0f / sum : 0.0f;
+    const float norm = sum > 0.0 ? 1.0 / sum : 0.0f;
 
     for (int i = 0; i < n; i++) {
         out[i] = inp[i] * norm;
@@ -3085,6 +2880,14 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
         sum2 += embd2[i] * embd2[i];
     }
 
+    // Handle the case where one or both vectors are zero vectors
+    if (sum1 == 0.0 || sum2 == 0.0) {
+        if (sum1 == 0.0 && sum2 == 0.0) {
+            return 1.0f; // two zero vectors are similar
+        }
+        return 0.0f;
+    }
+
     return sum / (sqrt(sum1) * sqrt(sum2));
 }
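With the extra embd_norm parameter, llama_embd_normalize(inp, out, n, 2) reproduces the old euclidean behaviour. A worked check of that case, independent of the library: the vector (3, 4) has L2 norm sqrt(9 + 16) = 5, so it normalises to (0.6, 0.8).

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        const std::vector<float> embd = {3.0f, 4.0f};
        std::vector<float> out(embd.size());

        // euclidean case (embd_norm == 2): divide by sqrt(3*3 + 4*4) = 5
        double sum = 0.0;
        for (const float v : embd) sum += v * v;
        const float norm = (float) (1.0 / std::sqrt(sum));
        for (size_t i = 0; i < embd.size(); i++) out[i] = embd[i] * norm;

        printf("%.2f %.2f\n", out[0], out[1]); // 0.60 0.80 -> unit length
    }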
@@ -3093,125 +2896,87 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 //
 
 static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
-    int32_t n_tensors;
-
-    size_t n_bytes = 0;
-
-    uint32_t max_direction_layer = 0;
-
     llama_control_vector_data result = { -1, {} };
 
-    // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
-    {
-        struct ggml_init_params meta_params = {
-            /* .mem_size   = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
-            /* .mem_buffer = */ nullptr,
-            /* .no_alloc   = */ true,
-        };
-        ggml_context * meta_ctx = ggml_init(meta_params);
-        struct gguf_init_params meta_gguf_params = {
-            /* .no_alloc = */ true,
-            /* .ctx      = */ &meta_ctx,
-        };
-        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
-        if (!meta_ctx_gguf) {
-            fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
-            ggml_free(meta_ctx);
-            return result;
-        }
-
-        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
-        for (int i = 0; i < n_tensors; i++) {
-            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
-
-            // split on '.'
-            size_t dotpos = name.find('.');
-            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
-                try {
-                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
-                    if (layer == 0) {
-                        fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                        ggml_free(meta_ctx);
-                        gguf_free(meta_ctx_gguf);
-                        return result;
-                    }
-                    if (layer > max_direction_layer) {
-                        max_direction_layer = layer;
-                    }
-                } catch (...) {
-                    fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                    ggml_free(meta_ctx);
-                    gguf_free(meta_ctx_gguf);
-                    return result;
-                }
-            }
-
-            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
-            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
-                fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                ggml_free(meta_ctx);
-                gguf_free(meta_ctx_gguf);
-                return result;
-            }
-            if (result.n_embd == -1) {
-                result.n_embd = ggml_nelements(tensor_meta);
-            } else if (ggml_nelements(tensor_meta) != result.n_embd) {
-                fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
-                ggml_free(meta_ctx);
-                gguf_free(meta_ctx_gguf);
-                return result;
-            }
-            n_bytes += ggml_nbytes(tensor_meta);
-        }
-        ggml_free(meta_ctx);
-        gguf_free(meta_ctx_gguf);
+    ggml_context * ctx = nullptr;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ false,
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
+    if (!ctx_gguf) {
+        fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
+        return result;
     }
 
+    int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
     if (n_tensors == 0) {
         fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
-        return result;
     }
 
-    // load and scale tensors into final control vector context
-    struct ggml_init_params ggml_params = {
-        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
-        /* .mem_buffer = */ nullptr,
-        /* .no_alloc   = */ false,
-    };
-    struct ggml_context * ctx = ggml_init(ggml_params);
+    for (int i = 0; i < n_tensors; i++) {
+        std::string name = gguf_get_tensor_name(ctx_gguf, i);
 
-    struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ &ctx,
-    };
-    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
-    if (!ctx_gguf) {
-        fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
-        ggml_free(ctx);
-        return result;
-    }
+        int layer_idx = -1;
+
+        // split on '.'
+        size_t dotpos = name.find('.');
+        if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+            try {
+                layer_idx = std::stoi(name.substr(dotpos + 1));
+            } catch (...) {
+                layer_idx = -1;
+            }
+        }
+        if (layer_idx < 0) {
+            fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        } else if (layer_idx == 0) {
+            fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
 
-    // do not store data for layer 0 (it's not used)
-    result.data.resize(result.n_embd * max_direction_layer);
+        struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+        if (tensor->type != GGML_TYPE_F32) {
+            fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+        if (ggml_n_dims(tensor) != 1) {
+            fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
 
-    for (uint32_t il = 1; il <= max_direction_layer; il++) {
-        const std::string name = "direction." + std::to_string(il);
-        const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+        if (result.n_embd == -1) {
+            result.n_embd = ggml_nelements(tensor);
+        } else if (ggml_nelements(tensor) != result.n_embd) {
+            fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
 
-        float * dst = result.data.data() + result.n_embd * (il - 1);
+        // extend if necessary - do not store data for layer 0 (it's not used)
+        result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
 
-        if (tensor) {
-            const float * src = (const float *) tensor->data;
-            for (int j = 0; j < result.n_embd; j++) {
-                dst[j] = src[j] * load_info.strength;
-            }
-        } else {
-            for (int j = 0; j < result.n_embd; j++) {
-                dst[j] = 0.0f;
-            }
+        const float * src = (const float *) tensor->data;
+        float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
+        for (int j = 0; j < result.n_embd; j++) {
+            dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
         }
     }
 
+    if (result.n_embd == -1) {
+        fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
+        result.data.clear();
+    }
+
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
+
     return result;
 }
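Two behavioural changes in the new loader are worth noting: tensors are processed in file order keyed on their direction.<layer> names, and repeated entries for the same layer are summed (scaled by the per-file strength) instead of overwriting each other. A GGUF-free sketch of just that bookkeeping, with made-up tensor data:

    #include <algorithm>
    #include <cstdio>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
        const int n_embd = 4;
        const float strength = 0.5f;

        // pretend these tensors came out of a GGUF file; note the repeated
        // layer 2, which is now summed rather than overwritten
        const std::vector<std::pair<std::string, std::vector<float>>> tensors = {
            {"direction.1", {1, 0, 0, 0}},
            {"direction.2", {0, 2, 0, 0}},
            {"direction.2", {0, 2, 0, 0}},
        };

        std::vector<float> data;
        for (const auto & [name, src] : tensors) {
            int layer_idx = -1;
            const size_t dotpos = name.find('.');
            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
                try { layer_idx = std::stoi(name.substr(dotpos + 1)); } catch (...) {}
            }
            if (layer_idx <= 0) continue; // layer 0 is invalid, as in the loader

            // extend if necessary; layer 1 lives at offset 0
            data.resize(std::max(data.size(), (size_t) n_embd * layer_idx), 0.0f);
            float * dst = data.data() + n_embd * (layer_idx - 1);
            for (int j = 0; j < n_embd; j++) dst[j] += src[j] * strength;
        }

        printf("layer 2, dim 1: %.1f\n", data[n_embd * 1 + 1]); // 2.0 = (2 + 2) * 0.5
    }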
@@ -3222,16 +2987,19 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
         auto cur = llama_control_vector_load_one(info);
 
         if (cur.n_embd == -1) {
-            return result;
+            result.n_embd = -1;
+            break;
         }
-        if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
-            fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
-            return result;
+        if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
+            fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
+            result.n_embd = -1;
+            break;
         }
 
         if (result.n_embd == -1) {
            result = std::move(cur);
         } else {
+            result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f); // extend if necessary
             for (size_t i = 0; i < cur.data.size(); i++) {
                 result.data[i] += cur.data[i];
             }
@@ -3239,7 +3007,8 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
     }
 
     if (result.n_embd == -1) {
-        fprintf(stderr, "%s: no vectors passed\n", __func__);
+        fprintf(stderr, "%s: no valid control vector files passed\n", __func__);
+        result.data.clear();
     }
 
     return result;
@@ -3407,7 +3176,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
         }
         fprintf(stream, "  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
-    fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
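At the llama_control_vector_load() level the same extend-then-add rule applies across files, so files exporting different layer ranges can be combined. A minimal illustration with made-up per-file data (two dimensions per layer):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // mirrors how per-file results are combined: extend the accumulator to the
    // larger layer count, then add element-wise
    int main() {
        std::vector<float> result = {0.1f, 0.2f};                 // file A: 1 layer
        const std::vector<float> cur = {0.3f, 0.3f, 0.5f, 0.5f};  // file B: 2 layers

        result.resize(std::max(result.size(), cur.size()), 0.0f); // extend if necessary
        for (size_t i = 0; i < cur.size(); i++) {
            result[i] += cur[i];
        }

        printf("%.1f %.1f %.1f %.1f\n", result[0], result[1], result[2], result[3]);
        // 0.4 0.5 0.5 0.5
    }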