Diffstat (limited to 'common/common.cpp')
-rw-r--r--  common/common.cpp  1190
1 file changed, 479 insertions, 711 deletions
diff --git a/common/common.cpp b/common/common.cpp
index 8eb23ade..4d1d88c6 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,3 +1,7 @@
+#if defined(_MSC_VER)
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#endif
+
#include "common.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
@@ -190,6 +194,12 @@ int32_t cpu_get_num_math() {
// CLI argument parsing
//
+void gpt_params_handle_hf_token(gpt_params & params) {
+ if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
+ params.hf_token = std::getenv("HF_TOKEN");
+ }
+}
+
void gpt_params_handle_model_default(gpt_params & params) {
if (!params.hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
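The new gpt_params_handle_hf_token helper runs after argument parsing and only falls back to the HF_TOKEN environment variable when --hf-token was not given, so a command-line token always wins. A standalone sketch of that precedence (params_t and handle_hf_token are illustrative stand-ins, not part of the patch):

// Minimal sketch, assuming the same fallback logic as gpt_params_handle_hf_token.
#include <cstdlib>
#include <iostream>
#include <string>

struct params_t { std::string hf_token; };

static void handle_hf_token(params_t & params) {
    // env var is only consulted when no token was set on the command line
    if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
        params.hf_token = std::getenv("HF_TOKEN");
    }
}

int main() {
    params_t from_cli; from_cli.hf_token = "hf_cli_token"; // --hf-token given
    params_t from_env;                                      // --hf-token omitted
    handle_hf_token(from_cli);
    handle_hf_token(from_env);
    std::cout << "cli wins: "     << from_cli.hf_token << "\n"; // stays hf_cli_token
    std::cout << "env fallback: " << from_env.hf_token << "\n"; // HF_TOKEN value or empty
    return 0;
}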
@@ -237,6 +247,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
gpt_params_handle_model_default(params);
+ gpt_params_handle_hf_token(params);
+
if (params.escape) {
string_process_escapes(params.prompt);
string_process_escapes(params.input_prefix);
@@ -273,26 +285,22 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
return true;
}
+#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }
+
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
const char split_delim = ',';
llama_sampling_params & sparams = params.sparams;
if (arg == "-s" || arg == "--seed") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
// TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context.
params.seed = std::stoul(argv[i]);
sparams.seed = std::stoul(argv[i]);
return true;
}
if (arg == "-t" || arg == "--threads") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_threads = std::stoi(argv[i]);
if (params.n_threads <= 0) {
params.n_threads = std::thread::hardware_concurrency();
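CHECK_ARG expands to a bare if statement rather than a do { ... } while (0) block, so it is only safe when used as a standalone statement inside a braced branch, which is how the parser uses it throughout. A toy parser using the same pattern (the names below are illustrative, not from the patch):

// Standalone sketch of the CHECK_ARG pattern for a single "--seed N" option.
#include <cstdio>
#include <string>

#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }

static bool find_arg(int argc, char ** argv, const std::string & arg, int & i, bool & invalid_param, long & seed) {
    if (arg == "--seed") {
        CHECK_ARG                     // flags invalid_param and returns if no value follows
        seed = std::stol(argv[i]);
        return true;
    }
    return false;
}

int main(int argc, char ** argv) {
    long seed = 0;
    bool invalid_param = false;
    for (int i = 1; i < argc; i++) {
        find_arg(argc, argv, argv[i], i, invalid_param, seed);
    }
    if (invalid_param) { fprintf(stderr, "missing value after --seed\n"); return 1; }
    printf("seed = %ld\n", seed);
    return 0;
}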
@@ -300,10 +308,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-tb" || arg == "--threads-batch") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_threads_batch = std::stoi(argv[i]);
if (params.n_threads_batch <= 0) {
params.n_threads_batch = std::thread::hardware_concurrency();
@@ -311,10 +316,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-td" || arg == "--threads-draft") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_threads_draft = std::stoi(argv[i]);
if (params.n_threads_draft <= 0) {
params.n_threads_draft = std::thread::hardware_concurrency();
@@ -322,10 +324,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-tbd" || arg == "--threads-batch-draft") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_threads_batch_draft = std::stoi(argv[i]);
if (params.n_threads_batch_draft <= 0) {
params.n_threads_batch_draft = std::thread::hardware_concurrency();
@@ -333,10 +332,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-p" || arg == "--prompt") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.prompt = argv[i];
return true;
}
@@ -349,10 +345,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--prompt-cache") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.path_prompt_cache = argv[i];
return true;
}
@@ -365,10 +358,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-bf" || arg == "--binary-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::ifstream file(argv[i], std::ios::binary);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
@@ -384,10 +374,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-f" || arg == "--file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
@@ -403,10 +390,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--in-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
@@ -417,66 +401,42 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_predict = std::stoi(argv[i]);
return true;
}
if (arg == "--top-k") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.top_k = std::stoi(argv[i]);
return true;
}
if (arg == "-c" || arg == "--ctx-size") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_ctx = std::stoi(argv[i]);
return true;
}
if (arg == "--grp-attn-n" || arg == "-gan") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.grp_attn_n = std::stoi(argv[i]);
return true;
}
if (arg == "--grp-attn-w" || arg == "-gaw") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.grp_attn_w = std::stoi(argv[i]);
return true;
}
if (arg == "--rope-freq-base") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.rope_freq_base = std::stof(argv[i]);
return true;
}
if (arg == "--rope-freq-scale") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.rope_freq_scale = std::stof(argv[i]);
return true;
}
if (arg == "--rope-scaling") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::string value(argv[i]);
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
@@ -485,58 +445,37 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--rope-scale") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.rope_freq_scale = 1.0f / std::stof(argv[i]);
return true;
}
if (arg == "--yarn-orig-ctx") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.yarn_orig_ctx = std::stoi(argv[i]);
return true;
}
if (arg == "--yarn-ext-factor") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.yarn_ext_factor = std::stof(argv[i]);
return true;
}
if (arg == "--yarn-attn-factor") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.yarn_attn_factor = std::stof(argv[i]);
return true;
}
if (arg == "--yarn-beta-fast") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.yarn_beta_fast = std::stof(argv[i]);
return true;
}
if (arg == "--yarn-beta-slow") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.yarn_beta_slow = std::stof(argv[i]);
return true;
}
if (arg == "--pooling") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::string value(argv[i]);
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
@@ -545,158 +484,109 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
else { invalid_param = true; }
return true;
}
+ if (arg == "--attention") {
+ CHECK_ARG
+ std::string value(argv[i]);
+ /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
+ else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
+ else { invalid_param = true; }
+ return true;
+ }
if (arg == "--defrag-thold" || arg == "-dt") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.defrag_thold = std::stof(argv[i]);
return true;
}
if (arg == "--samplers") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
const auto sampler_names = string_split(argv[i], ';');
sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true);
return true;
}
if (arg == "--sampling-seq") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]);
return true;
}
if (arg == "--top-p") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.top_p = std::stof(argv[i]);
return true;
}
if (arg == "--min-p") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.min_p = std::stof(argv[i]);
return true;
}
if (arg == "--temp") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.temp = std::stof(argv[i]);
sparams.temp = std::max(sparams.temp, 0.0f);
return true;
}
if (arg == "--tfs") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.tfs_z = std::stof(argv[i]);
return true;
}
if (arg == "--typical") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.typical_p = std::stof(argv[i]);
return true;
}
if (arg == "--repeat-last-n") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.penalty_last_n = std::stoi(argv[i]);
sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
return true;
}
if (arg == "--repeat-penalty") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.penalty_repeat = std::stof(argv[i]);
return true;
}
if (arg == "--frequency-penalty") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.penalty_freq = std::stof(argv[i]);
return true;
}
if (arg == "--presence-penalty") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.penalty_present = std::stof(argv[i]);
return true;
}
if (arg == "--dynatemp-range") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.dynatemp_range = std::stof(argv[i]);
return true;
}
if (arg == "--dynatemp-exp") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.dynatemp_exponent = std::stof(argv[i]);
return true;
}
if (arg == "--mirostat") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.mirostat = std::stoi(argv[i]);
return true;
}
if (arg == "--mirostat-lr") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.mirostat_eta = std::stof(argv[i]);
return true;
}
if (arg == "--mirostat-ent") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.mirostat_tau = std::stof(argv[i]);
return true;
}
if (arg == "--cfg-negative-prompt") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.cfg_negative_prompt = argv[i];
return true;
}
if (arg == "--cfg-negative-prompt-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
@@ -710,203 +600,126 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--cfg-scale") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.cfg_scale = std::stof(argv[i]);
return true;
}
if (arg == "-b" || arg == "--batch-size") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_batch = std::stoi(argv[i]);
return true;
}
if (arg == "-ub" || arg == "--ubatch-size") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_ubatch = std::stoi(argv[i]);
return true;
}
if (arg == "--keep") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_keep = std::stoi(argv[i]);
return true;
}
if (arg == "--draft") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_draft = std::stoi(argv[i]);
return true;
}
if (arg == "--chunks") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_chunks = std::stoi(argv[i]);
return true;
}
if (arg == "-np" || arg == "--parallel") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_parallel = std::stoi(argv[i]);
return true;
}
if (arg == "-ns" || arg == "--sequences") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_sequences = std::stoi(argv[i]);
return true;
}
if (arg == "--p-split" || arg == "-ps") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.p_split = std::stof(argv[i]);
return true;
}
if (arg == "-m" || arg == "--model") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.model = argv[i];
return true;
}
if (arg == "-md" || arg == "--model-draft") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.model_draft = argv[i];
return true;
}
if (arg == "-a" || arg == "--alias") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.model_alias = argv[i];
return true;
}
if (arg == "-mu" || arg == "--model-url") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.model_url = argv[i];
return true;
}
- if (arg == "-hfr" || arg == "--hf-repo") {
+ if (arg == "-hft" || arg == "--hf-token") {
if (++i >= argc) {
- invalid_param = true;
- return true;
+ invalid_param = true;
+ return true;
}
+ params.hf_token = argv[i];
+ return true;
+ }
+ if (arg == "-hfr" || arg == "--hf-repo") {
+ CHECK_ARG
params.hf_repo = argv[i];
return true;
}
if (arg == "-hff" || arg == "--hf-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.hf_file = argv[i];
return true;
}
if (arg == "--lora") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.lora_adapter.emplace_back(argv[i], 1.0f);
- params.use_mmap = false;
return true;
}
if (arg == "--lora-scaled") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
const char* lora_adapter = argv[i];
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
- params.use_mmap = false;
- return true;
- }
- if (arg == "--lora-base") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
- params.lora_base = argv[i];
return true;
}
if (arg == "--control-vector") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.control_vectors.push_back({ 1.0f, argv[i], });
return true;
}
if (arg == "--control-vector-scaled") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
const char* fname = argv[i];
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.control_vectors.push_back({ std::stof(argv[i]), fname, });
return true;
}
if (arg == "--control-vector-layer-range") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.control_vector_layer_start = std::stoi(argv[i]);
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.control_vector_layer_end = std::stoi(argv[i]);
return true;
}
if (arg == "--mmproj") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.mmproj = argv[i];
return true;
}
if (arg == "--image") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.image.emplace_back(argv[i]);
return true;
}
@@ -922,6 +735,21 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.embedding = true;
return true;
}
+ if (arg == "--embd-normalize") {
+ CHECK_ARG
+ params.embd_normalize = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--embd-output-format") {
+ CHECK_ARG
+ params.embd_out = argv[i];
+ return true;
+ }
+ if (arg == "--embd-separator") {
+ CHECK_ARG
+ params.embd_sep = argv[i];
+ return true;
+ }
if (arg == "-if" || arg == "--interactive-first") {
params.interactive_first = true;
return true;
@@ -950,7 +778,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.cache_type_v = argv[++i];
return true;
}
- if (arg == "--multiline-input") {
+ if (arg == "-mli" || arg == "--multiline-input") {
params.multiline_input = true;
return true;
}
@@ -962,6 +790,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.cont_batching = true;
return true;
}
+ if (arg == "-nocb" || arg == "--no-cont-batching") {
+ params.cont_batching = false;
+ return true;
+ }
if (arg == "-fa" || arg == "--flash-attn") {
params.flash_attn = true;
return true;
@@ -975,10 +807,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_gpu_layers = std::stoi(argv[i]);
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
@@ -987,10 +816,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_gpu_layers_draft = std::stoi(argv[i]);
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
@@ -999,10 +825,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--main-gpu" || arg == "-mg") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.main_gpu = std::stoi(argv[i]);
#ifndef GGML_USE_CUDA_SYCL_VULKAN
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
@@ -1010,10 +833,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--split-mode" || arg == "-sm") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::string arg_next = argv[i];
if (arg_next == "none") {
params.split_mode = LLAMA_SPLIT_MODE_NONE;
@@ -1038,10 +858,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--tensor-split" || arg == "-ts") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::string arg_next = argv[i];
// split string by , and /
@@ -1066,10 +883,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--rpc") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.rpc_servers = argv[i];
return true;
}
@@ -1078,10 +892,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--numa") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::string value(argv[i]);
/**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
@@ -1094,10 +905,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--verbosity") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.verbosity = std::stoi(argv[i]);
return true;
}
@@ -1110,18 +918,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-r" || arg == "--reverse-prompt") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.antiprompt.emplace_back(argv[i]);
return true;
}
if (arg == "-ld" || arg == "--logdir") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.logdir = argv[i];
if (params.logdir.back() != DIRECTORY_SEPARATOR) {
@@ -1130,26 +932,17 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-lcs" || arg == "--lookup-cache-static") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.lookup_cache_static = argv[i];
return true;
}
if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.lookup_cache_dynamic = argv[i];
return true;
}
if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.logits_file = argv[i];
return true;
}
@@ -1158,26 +951,17 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--ppl-stride") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.ppl_stride = std::stoi(argv[i]);
return true;
}
if (arg == "--ppl-output-type") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.ppl_output_type = std::stoi(argv[i]);
return true;
}
if (arg == "-ptc" || arg == "--print-token-count") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_print = std::stoi(argv[i]);
return true;
}
@@ -1190,10 +974,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--hellaswag-tasks") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.hellaswag_tasks = std::stoi(argv[i]);
return true;
}
@@ -1202,10 +983,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--winogrande-tasks") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.winogrande_tasks = std::stoi(argv[i]);
return true;
}
@@ -1214,10 +992,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--multiple-choice-tasks") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.multiple_choice_tasks = std::stoi(argv[i]);
return true;
}
@@ -1234,10 +1009,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-l" || arg == "--logit-bias") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::stringstream ss(argv[i]);
llama_token key;
char sign;
@@ -1267,37 +1039,32 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
if (arg == "--in-prefix-bos") {
params.input_prefix_bos = true;
+ params.enable_chat_template = false;
return true;
}
if (arg == "--in-prefix") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.input_prefix = argv[i];
+ params.enable_chat_template = false;
return true;
}
if (arg == "--in-suffix") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.input_suffix = argv[i];
+ params.enable_chat_template = false;
+ return true;
+ }
+ if (arg == "--spm-infill") {
+ params.spm_infill = true;
return true;
}
if (arg == "--grammar") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.grammar = argv[i];
return true;
}
if (arg == "--grammar-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
@@ -1312,18 +1079,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-j" || arg == "--json-schema") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
return true;
}
if (arg == "--override-kv") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
invalid_param = true;
@@ -1332,42 +1093,27 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--host") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.hostname = argv[i];
return true;
}
if (arg == "--port") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.port = std::stoi(argv[i]);
return true;
}
if (arg == "--path") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.public_path = argv[i];
return true;
}
if (arg == "--api-key") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.api_keys.push_back(argv[i]);
return true;
}
if (arg == "--api-key-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::ifstream key_file(argv[i]);
if (!key_file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
@@ -1384,43 +1130,28 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--ssl-key-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.ssl_file_key = argv[i];
return true;
}
if (arg == "--ssl-cert-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.ssl_file_cert = argv[i];
return true;
}
if (arg == "--timeout" || arg == "-to") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.timeout_read = std::stoi(argv[i]);
params.timeout_write = std::stoi(argv[i]);
return true;
}
if (arg == "--threads-http") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_threads_http = std::stoi(argv[i]);
return true;
}
if (arg == "-spf" || arg == "--system-prompt-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
@@ -1437,10 +1168,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--log-format") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
if (std::strcmp(argv[i], "json") == 0) {
params.log_json = true;
} else if (std::strcmp(argv[i], "text") == 0) {
@@ -1460,10 +1188,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--slot-save-path") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.slot_save_path = argv[i];
// if doesn't end with DIRECTORY_SEPARATOR, add it
if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
@@ -1472,10 +1197,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--chat-template") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
if (!llama_chat_verify_template(argv[i])) {
fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]);
fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n");
@@ -1486,10 +1208,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--slot-prompt-similarity" || arg == "-sps") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.slot_prompt_similarity = std::stof(argv[i]);
return true;
}
@@ -1498,37 +1217,25 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-npp") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
auto p = string_split<int>(argv[i], split_delim);
params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
return true;
}
if (arg == "-ntg") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
auto p = string_split<int>(argv[i], split_delim);
params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
return true;
}
if (arg == "-npl") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
auto p = string_split<int>(argv[i], split_delim);
params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
return true;
}
if (arg == "--context-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
std::ifstream file(argv[i], std::ios::binary);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
@@ -1539,59 +1246,39 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--chunk-size") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.chunk_size = std::stoi(argv[i]);
return true;
}
if (arg == "--chunk-separator") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.chunk_separator = argv[i];
return true;
}
if (arg == "--junk") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_junk = std::stoi(argv[i]);
return true;
}
if (arg == "--pos") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.i_pos = std::stoi(argv[i]);
return true;
}
if (arg == "-o" || arg == "--output" || arg == "--output-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.out_file = argv[i];
params.cvector_outfile = argv[i];
+ params.lora_outfile = argv[i];
return true;
}
if (arg == "-ofreq" || arg == "--output-frequency") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_out_freq = std::stoi(argv[i]);
return true;
}
if (arg == "--save-frequency") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_save_freq = std::stoi(argv[i]);
return true;
}
@@ -1612,62 +1299,39 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "--chunk" || arg == "--from-chunk") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.i_chunk = std::stoi(argv[i]);
return true;
}
// cvector params
- if (arg == "--completions-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
- params.cvector_completions_file = argv[i];
- return true;
- }
if (arg == "--positive-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.cvector_positive_file = argv[i];
return true;
}
if (arg == "--negative-file") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.cvector_negative_file = argv[i];
return true;
}
- if (arg == "--completions") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
- params.n_completions = std::stoi(argv[i]);
- return true;
- }
if (arg == "--pca-batch") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_pca_batch = std::stoi(argv[i]);
return true;
}
if (arg == "--pca-iter") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
params.n_pca_iterations = std::stoi(argv[i]);
return true;
}
+ if (arg == "--method") {
+ CHECK_ARG
+ std::string value(argv[i]);
+ /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
+ else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
+ else { invalid_param = true; }
+ return true;
+ }
#ifndef LOG_DISABLE_LOGS
// Parse args for logging parameters
if (log_param_single_parse(argv[i])) {
@@ -1679,10 +1343,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
// We have a matching known parameter requiring an argument,
// now we need to check if there is anything after this argv
// and flag invalid_param or parse it.
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
+ CHECK_ARG
if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) {
invalid_param = true;
return true;
@@ -1767,7 +1428,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
- options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() });
+ options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n"
+ "in conversation mode, this will be used as system prompt\n"
+ "(default: '%s')", params.prompt.c_str() });
options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" });
options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
@@ -1782,13 +1445,17 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"halt generation at PROMPT, return control in interactive mode\n"
"can be specified more than once for multiple prompts" });
options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
- options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" });
+ options.push_back({ "main", "-cnv, --conversation", "run in conversation mode, does not print special tokens and suffix/prefix\n"
+ "if suffix/prefix are not specified, default chat template will be used\n"
+ "(default: %s)", params.conversation ? "true" : "false" });
options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
+ options.push_back({ "server infill",
+ " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
options.push_back({ "sampling" });
options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"
@@ -1822,7 +1489,11 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
"negative prompt file to use for guidance" });
options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
-
+ options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
+ "set custom jinja chat template (default: template taken from model's metadata)\n"
+ "if suffix/prefix are specified, template will be disabled\n"
+ "only commonly used templates are accepted:\n"
+ "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
options.push_back({ "grammar" });
options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" });
@@ -1831,8 +1502,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });
options.push_back({ "embedding" });
- options.push_back({ "embedding", " --pooling {none,mean,cls}",
+ options.push_back({ "embedding", " --pooling {none,mean,cls,last}",
"pooling type for embeddings, use model default if unspecified" });
+ options.push_back({ "embedding", " --attention {causal,non-causal}",
+ "attention type for embeddings, use model default if unspecified" });
options.push_back({ "context hacking" });
options.push_back({ "*", " --rope-scaling {none,linear,yarn}",
@@ -1871,6 +1544,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
+ options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" });
options.push_back({ "multi-modality" });
options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
@@ -1913,12 +1587,13 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
"advanced option to override model metadata by key. may be specified multiple times.\n"
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
- options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" });
- options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
- options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" });
- options.push_back({ "*", " --control-vector FNAME", "add a control vector" });
+ options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" });
+ options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
+ options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
+ "note: this argument can be repeated to add multiple control vectors" });
options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
- "add a control vector with user defined scaling SCALE" });
+ "add a control vector with user defined scaling SCALE\n"
+ "note: this argument can be repeated to add multiple scaled control vectors" });
options.push_back({ "*", " --control-vector-layer-range START END",
"layer range to apply the control vector(s) to, start and end inclusive" });
options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
@@ -1927,6 +1602,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" });
+ options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" });
options.push_back({ "retrieval" });
options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" });
@@ -1952,6 +1628,11 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" });
options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" });
+ options.push_back({ "embedding" });
+ options.push_back({ "embedding", " --embd-normalize", "normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize });
+ options.push_back({ "embedding", " --embd-output-format", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix" });
+ options.push_back({ "embedding", " --embd-separator", "separator of embendings (default \\n) for example \"<#sep#>\"" });
+
options.push_back({ "server" });
options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
@@ -1994,11 +1675,16 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
- options.push_back({ "cvector", " --completions-file FNAME",
- "completions file (default: '%s')", params.cvector_completions_file.c_str() });
- options.push_back({ "cvector", " --completions N", "number of lines of completions file to use (default: %d)", params.n_completions });
- options.push_back({ "cvector", " --batch-pca N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
- options.push_back({ "cvector", " --iter-pca N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
+ options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
+ options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
+ options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });
+
+ options.push_back({ "export-lora" });
+ options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
+ options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
+ options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
+ options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
+ options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
printf("usage: %s [options]\n", argv[0]);
@@ -2363,9 +2049,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
llama_model * model = nullptr;
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
- model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
+ model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else if (!params.model_url.empty()) {
- model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
+ model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
}
@@ -2411,19 +2097,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
float lora_scale = std::get<1>(params.lora_adapter[i]);
- int err = llama_model_apply_lora_from_file(model,
- lora_adapter.c_str(),
- lora_scale,
- ((i > 0) || params.lora_base.empty())
- ? NULL
- : params.lora_base.c_str(),
- params.n_threads);
- if (err != 0) {
+ auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
+ if (adapter == nullptr) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
llama_free(lctx);
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
}
+ llama_lora_adapter_set(lctx, adapter, lora_scale);
}
if (params.ignore_eos) {
@@ -2433,7 +2114,24 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
if (params.warmup) {
LOG("warming up the model with an empty run\n");
- std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
+ std::vector<llama_token> tmp;
+ llama_token bos = llama_token_bos(model);
+ llama_token eos = llama_token_eos(model);
+ // some models (e.g. T5) don't have a BOS token
+ if (bos != -1) {
+ tmp.push_back(bos);
+ }
+ tmp.push_back(eos);
+
+ if (llama_model_has_encoder(model)) {
+ llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+ llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+ if (decoder_start_token_id == -1) {
+ decoder_start_token_id = bos;
+ }
+ tmp.clear();
+ tmp.push_back(decoder_start_token_id);
+ }
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
llama_kv_cache_clear(lctx);
llama_synchronize(lctx);
@@ -2516,6 +2214,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
cparams.pooling_type = params.pooling_type;
+ cparams.attention_type = params.attention_type;
cparams.defrag_thold = params.defrag_thold;
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
@@ -2535,7 +2234,7 @@ static bool starts_with(const std::string & str, const std::string & prefix) {
return str.rfind(prefix, 0) == 0;
}
-static bool llama_download_file(const std::string & url, const std::string & path) {
+static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
// Initialize libcurl
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
@@ -2550,6 +2249,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
+ // Check if hf-token or bearer-token was specified
+ if (!hf_token.empty()) {
+ std::string auth_header = "Authorization: Bearer ";
+ auth_header += hf_token.c_str();
+ struct curl_slist *http_headers = NULL;
+ http_headers = curl_slist_append(http_headers, auth_header.c_str());
+ curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
+ }
+
#if defined(_WIN32)
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
// operating system. Currently implemented under MS-Windows.
@@ -2745,6 +2453,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
struct llama_model * llama_load_model_from_url(
const char * model_url,
const char * path_model,
+ const char * hf_token,
const struct llama_model_params & params) {
// Basic validation of the model_url
if (!model_url || strlen(model_url) == 0) {
@@ -2752,7 +2461,7 @@ struct llama_model * llama_load_model_from_url(
return NULL;
}
- if (!llama_download_file(model_url, path_model)) {
+ if (!llama_download_file(model_url, path_model, hf_token)) {
return NULL;
}
@@ -2800,14 +2509,14 @@ struct llama_model * llama_load_model_from_url(
// Prepare download in parallel
std::vector<std::future<bool>> futures_download;
for (int idx = 1; idx < n_split; idx++) {
- futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool {
+ futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
- return llama_download_file(split_url, split_path);
+ return llama_download_file(split_url, split_path, hf_token);
}, idx));
}
@@ -2826,6 +2535,7 @@ struct llama_model * llama_load_model_from_hf(
const char * repo,
const char * model,
const char * path_model,
+ const char * hf_token,
const struct llama_model_params & params) {
// construct hugging face model url:
//
@@ -2841,7 +2551,7 @@ struct llama_model * llama_load_model_from_hf(
model_url += "/resolve/main/";
model_url += model;
- return llama_load_model_from_url(model_url.c_str(), path_model, params);
+ return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
}
#else
@@ -2849,6 +2559,7 @@ struct llama_model * llama_load_model_from_hf(
struct llama_model * llama_load_model_from_url(
const char * /*model_url*/,
const char * /*path_model*/,
+ const char * /*hf_token*/,
const struct llama_model_params & /*params*/) {
fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return nullptr;
@@ -2858,6 +2569,7 @@ struct llama_model * llama_load_model_from_hf(
const char * /*repo*/,
const char * /*model*/,
const char * /*path_model*/,
+ const char * /*hf_token*/,
const struct llama_model_params & /*params*/) {
fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return nullptr;
@@ -2922,51 +2634,35 @@ std::vector<llama_token> llama_tokenize(
}
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
- std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
- if (n_tokens < 0) {
- result.resize(-n_tokens);
- int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
- GGML_ASSERT(check == -n_tokens);
- } else {
- result.resize(n_tokens);
- }
-
- return std::string(result.data(), result.size());
-}
-
-std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
- const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
-
std::string piece;
- std::string result;
-
- for (size_t i = 0; i < tokens.size(); ++i) {
- piece = llama_token_to_piece(ctx, tokens[i]);
-
- // remove the leading space of the first non-BOS token
- if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
- piece = piece.substr(1);
- }
-
- result += piece;
+ piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
+ const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+ if (n_chars < 0) {
+ piece.resize(-n_chars);
+ int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+ GGML_ASSERT(check == -n_chars);
+ }
+ else {
+ piece.resize(n_chars);
}
- return result;
+ return piece;
}
-std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
- std::string piece;
- std::string result;
-
- for (size_t i = 0; i < tokens.size(); ++i) {
- piece = llama_token_to_piece(ctx, tokens[i]);
-
- result += piece;
+std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+ std::string text;
+ text.resize(std::max(text.capacity(), tokens.size()));
+ int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+ if (n_chars < 0) {
+ text.resize(-n_chars);
+ n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+ GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
}
+ text.resize(n_chars);
+
// NOTE: the original tokenizer decodes bytes after collecting the pieces.
- return result;
+ return text;
}
bool llama_should_add_bos_token(const llama_model * model) {
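The SPM- and BPE-specific detokenizers are folded into a single llama_detokenize that round-trips through the C API and leaves whitespace trimming to the per-token detokenization it wraps. A hedged call-site sketch, not a complete program (ctx is assumed to be an already-initialized llama_context, e.g. from llama_init_from_gpt_params, and common.h is included):

// Assumed setup: #include "common.h" and a valid llama_context * ctx.
std::vector<llama_token> tokens = llama_tokenize(ctx, "Hello, world!", /*add_special=*/true);
std::string text = llama_detokenize(ctx, tokens, /*special=*/false);
printf("round-trip: %s\n", text.c_str());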
@@ -2975,12 +2671,91 @@ bool llama_should_add_bos_token(const llama_model * model) {
return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
}
+//
+// Chat template utils
+//
+
bool llama_chat_verify_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}};
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
return res >= 0;
}
+std::string llama_chat_apply_template(const struct llama_model * model,
+ const std::string & tmpl,
+ const std::vector<llama_chat_msg> & msgs,
+ bool add_ass) {
+ int alloc_size = 0;
+ bool fallback = false; // indicate if we must fallback to default chatml
+ std::vector<llama_chat_message> chat;
+ for (auto & msg : msgs) {
+ chat.push_back({msg.role.c_str(), msg.content.c_str()});
+ alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
+ }
+
+ const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
+ std::vector<char> buf(alloc_size);
+
+ // run the first time to get the total output length
+ int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+
+ // error: chat template is not supported
+ if (res < 0) {
+ if (ptr_tmpl != nullptr) {
+ // if the custom "tmpl" is not supported, we throw an error
+ // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
+ throw std::runtime_error("this custom template is not supported");
+ } else {
+ // If the built-in template is not supported, we default to chatml
+ res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+ fallback = true;
+ }
+ }
+
+ // if it turns out that our buffer is too small, we resize it
+ if ((size_t) res > buf.size()) {
+ buf.resize(res);
+ res = llama_chat_apply_template(
+ fallback ? nullptr : model,
+ fallback ? "chatml" : ptr_tmpl,
+ chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+ }
+
+ std::string formatted_chat(buf.data(), res);
+ return formatted_chat;
+}
+
+std::string llama_chat_format_single(const struct llama_model * model,
+ const std::string & tmpl,
+ const std::vector<llama_chat_msg> & past_msg,
+ const llama_chat_msg & new_msg,
+ bool add_ass) {
+ std::ostringstream ss;
+ auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
+ std::vector<llama_chat_msg> chat_new(past_msg);
+ // if the past_msg ends with a newline, we must preserve it in the formatted version
+ if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
+ ss << "\n";
+ };
+ // format chat with new_msg
+ chat_new.push_back(new_msg);
+ auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
+ // get the diff part
+ ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
+ return ss.str();
+}
+
+std::string llama_chat_format_example(const struct llama_model * model,
+ const std::string & tmpl) {
+ std::vector<llama_chat_msg> msgs = {
+ {"system", "You are a helpful assistant"},
+ {"user", "Hello"},
+ {"assistant", "Hi there"},
+ {"user", "How are you?"},
+ };
+ return llama_chat_apply_template(model, tmpl, msgs, true);
+}
+
//
// KV cache utils
//
@@ -3060,14 +2835,34 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
// Embedding utils
//
-void llama_embd_normalize(const float * inp, float * out, int n) {
+void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
double sum = 0.0;
- for (int i = 0; i < n; i++) {
- sum += inp[i] * inp[i];
+
+ switch (embd_norm) {
+ case -1: // no normalisation
+ sum = 1.0;
+ break;
+ case 0: // max absolute
+ for (int i = 0; i < n; i++) {
+ if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
+ }
+            sum /= 32760.0; // scale so components map into the int16 range
+ break;
+ case 2: // euclidean
+ for (int i = 0; i < n; i++) {
+ sum += inp[i] * inp[i];
+ }
+ sum = std::sqrt(sum);
+ break;
+ default: // p-norm (euclidean is p-norm p=2)
+ for (int i = 0; i < n; i++) {
+ sum += std::pow(std::abs(inp[i]), embd_norm);
+ }
+ sum = std::pow(sum, 1.0 / embd_norm);
+ break;
}
- sum = sqrt(sum);
- const float norm = sum > 0.0 ? 1.0f / sum : 0.0f;
+ const float norm = sum > 0.0 ? 1.0 / sum : 0.0f;
for (int i = 0; i < n; i++) {
out[i] = inp[i] * norm;
@@ -3085,6 +2880,14 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
sum2 += embd2[i] * embd2[i];
}
+ // Handle the case where one or both vectors are zero vectors
+ if (sum1 == 0.0 || sum2 == 0.0) {
+ if (sum1 == 0.0 && sum2 == 0.0) {
+ return 1.0f; // two zero vectors are similar
+ }
+ return 0.0f;
+ }
+
return sum / (sqrt(sum1) * sqrt(sum2));
}
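A hedged usage sketch for the two embedding helpers above: embd_norm selects the scaling (-1 none, 0 max-absolute scaled to the int16 range, 2 Euclidean, any other positive value a p-norm), and the cosine similarity treats two zero vectors as identical. Toy data only:

    // Sketch only: normalize two toy embeddings with the Euclidean norm and compare them.
    float a[4] = { 1.0f, 2.0f, 2.0f, 0.0f };
    float b[4] = { 0.0f, 3.0f, 4.0f, 0.0f };
    float an[4], bn[4];
    llama_embd_normalize(a, an, 4, /*embd_norm=*/2);
    llama_embd_normalize(b, bn, 4, /*embd_norm=*/2);
    float sim = llama_embd_similarity_cos(an, bn, 4);    // value in [-1, 1]
    printf("similarity: %f\n", sim);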
@@ -3093,125 +2896,87 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
//
static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
- int32_t n_tensors;
-
- size_t n_bytes = 0;
-
- uint32_t max_direction_layer = 0;
-
llama_control_vector_data result = { -1, {} };
- // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
- {
- struct ggml_init_params meta_params = {
- /* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
- /* .mem_buffer = */ nullptr,
- /* .no_alloc = */ true,
- };
- ggml_context * meta_ctx = ggml_init(meta_params);
- struct gguf_init_params meta_gguf_params = {
- /* .no_alloc = */ true,
- /* .ctx = */ &meta_ctx,
- };
- struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
- if (!meta_ctx_gguf) {
- fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
- ggml_free(meta_ctx);
- return result;
- }
-
- n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
- for (int i = 0; i < n_tensors; i++) {
- std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
-
- // split on '.'
- size_t dotpos = name.find('.');
- if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
- try {
- uint32_t layer = std::stoi(name.substr(dotpos + 1));
- if (layer == 0) {
- fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
- ggml_free(meta_ctx);
- gguf_free(meta_ctx_gguf);
- return result;
- }
- if (layer > max_direction_layer) {
- max_direction_layer = layer;
- }
- } catch (...) {
- fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
- ggml_free(meta_ctx);
- gguf_free(meta_ctx_gguf);
- return result;
- }
- }
-
- struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
- if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
- fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
- ggml_free(meta_ctx);
- gguf_free(meta_ctx_gguf);
- return result;
- }
- if (result.n_embd == -1) {
- result.n_embd = ggml_nelements(tensor_meta);
- } else if (ggml_nelements(tensor_meta) != result.n_embd) {
- fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
- ggml_free(meta_ctx);
- gguf_free(meta_ctx_gguf);
- return result;
- }
- n_bytes += ggml_nbytes(tensor_meta);
- }
- ggml_free(meta_ctx);
- gguf_free(meta_ctx_gguf);
+ ggml_context * ctx = nullptr;
+ struct gguf_init_params meta_gguf_params = {
+ /* .no_alloc = */ false,
+ /* .ctx = */ &ctx,
+ };
+ struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
+ if (!ctx_gguf) {
+ fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
+ return result;
}
+ int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
if (n_tensors == 0) {
fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
- return result;
}
- // load and scale tensors into final control vector context
- struct ggml_init_params ggml_params = {
- /* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes,
- /* .mem_buffer = */ nullptr,
- /* .no_alloc = */ false,
- };
- struct ggml_context * ctx = ggml_init(ggml_params);
+ for (int i = 0; i < n_tensors; i++) {
+ std::string name = gguf_get_tensor_name(ctx_gguf, i);
- struct gguf_init_params params = {
- /*.no_alloc = */ false,
- /*.ctx = */ &ctx,
- };
- struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
- if (!ctx_gguf) {
- fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
- ggml_free(ctx);
- return result;
- }
+ int layer_idx = -1;
+
+ // split on '.'
+ size_t dotpos = name.find('.');
+ if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+ try {
+ layer_idx = std::stoi(name.substr(dotpos + 1));
+ } catch (...) {
+ layer_idx = -1;
+ }
+ }
+ if (layer_idx < 0) {
+ fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+ result.n_embd = -1;
+ break;
+ } else if (layer_idx == 0) {
+ fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+ result.n_embd = -1;
+ break;
+ }
- // do not store data for layer 0 (it's not used)
- result.data.resize(result.n_embd * max_direction_layer);
+ struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+ if (tensor->type != GGML_TYPE_F32) {
+ fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
+ result.n_embd = -1;
+ break;
+ }
+ if (ggml_n_dims(tensor) != 1) {
+ fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
+ result.n_embd = -1;
+ break;
+ }
- for (uint32_t il = 1; il <= max_direction_layer; il++) {
- const std::string name = "direction." + std::to_string(il);
- const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+ if (result.n_embd == -1) {
+ result.n_embd = ggml_nelements(tensor);
+ } else if (ggml_nelements(tensor) != result.n_embd) {
+ fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
+ result.n_embd = -1;
+ break;
+ }
- float * dst = result.data.data() + result.n_embd * (il - 1);
+ // extend if necessary - do not store data for layer 0 (it's not used)
+ result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
- if (tensor) {
- const float * src = (const float *) tensor->data;
- for (int j = 0; j < result.n_embd; j++) {
- dst[j] = src[j] * load_info.strength;
- }
- } else {
- for (int j = 0; j < result.n_embd; j++) {
- dst[j] = 0.0f;
- }
+ const float * src = (const float *) tensor->data;
+ float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
+ for (int j = 0; j < result.n_embd; j++) {
+ dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
}
}
+ if (result.n_embd == -1) {
+ fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
+ result.data.clear();
+ }
+
+ gguf_free(ctx_gguf);
+ ggml_free(ctx);
+
return result;
}
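The loader above stores the direction for layer L at offset n_embd * (L - 1) in the flat data buffer (layer indices start at 1; layer 0 is rejected), and repeated tensors for the same layer accumulate, scaled by load_info.strength. A hedged sketch of reading one layer back through the public llama_control_vector_load entry point (file name and layer index are hypothetical):

    // Sketch only: load a control vector file and inspect one layer's direction.
    llama_control_vector_load_info info;
    info.fname    = "control.gguf";   // hypothetical file
    info.strength = 0.8f;
    llama_control_vector_data cv = llama_control_vector_load({ info });
    if (cv.n_embd != -1) {
        const int layer = 5;          // hypothetical layer of interest (1-based)
        if ((size_t) cv.n_embd * layer <= cv.data.size()) {
            const float * dir = cv.data.data() + cv.n_embd * (layer - 1);
            printf("layer %d, first component: %f\n", layer, dir[0]);
        }
    }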
@@ -3222,16 +2987,19 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
auto cur = llama_control_vector_load_one(info);
if (cur.n_embd == -1) {
- return result;
+ result.n_embd = -1;
+ break;
}
- if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
- fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
- return result;
+ if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
+        fprintf(stderr, "%s: control vectors in %s do not match previous dimensions\n", __func__, info.fname.c_str());
+ result.n_embd = -1;
+ break;
}
if (result.n_embd == -1) {
result = std::move(cur);
} else {
+ result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f); // extend if necessary
for (size_t i = 0; i < cur.data.size(); i++) {
result.data[i] += cur.data[i];
}
@@ -3239,7 +3007,8 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
}
if (result.n_embd == -1) {
- fprintf(stderr, "%s: no vectors passed\n", __func__);
+ fprintf(stderr, "%s: no valid control vector files passed\n", __func__);
+ result.data.clear();
}
return result;
@@ -3407,7 +3176,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
}
fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
}
- fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);