Diffstat (limited to 'common')
-rw-r--r--  common/common.cpp  100
-rw-r--r--  common/common.h     27
2 files changed, 95 insertions(+), 32 deletions(-)
diff --git a/common/common.cpp b/common/common.cpp
index 4d1d88c6..3b45d066 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -684,14 +684,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
if (arg == "--lora") {
CHECK_ARG
- params.lora_adapter.emplace_back(argv[i], 1.0f);
+ params.lora_adapters.push_back({
+ std::string(argv[i]),
+ 1.0,
+ });
return true;
}
if (arg == "--lora-scaled") {
CHECK_ARG
- const char* lora_adapter = argv[i];
+ std::string lora_adapter = argv[i];
CHECK_ARG
- params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
+ params.lora_adapters.push_back({
+ lora_adapter,
+ std::stof(argv[i]),
+ });
+ return true;
+ }
+ if (arg == "--lora-init-without-apply") {
+ params.lora_init_without_apply = true;
return true;
}
if (arg == "--control-vector") {
@@ -1332,6 +1342,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
else { invalid_param = true; }
return true;
}
+ if (arg == "--no-warmup") {
+ params.warmup = false;
+ return true;
+ }
#ifndef LOG_DISABLE_LOGS
// Parse args for logging parameters
if (log_param_single_parse(argv[i])) {
@@ -1454,6 +1468,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
+ options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" });
options.push_back({ "server infill",
" --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
@@ -1637,7 +1652,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
- options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
+ options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
@@ -1657,6 +1672,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
"how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
+ options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"});
#ifndef LOG_DISABLE_LOGS
options.push_back({ "logging" });
@@ -1769,6 +1785,17 @@ std::string string_get_sortable_timestamp() {
return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
}
+void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
+ if (search.empty()) {
+ return; // Avoid infinite loop if 'search' is an empty string
+ }
+ size_t pos = 0;
+ while ((pos = s.find(search, pos)) != std::string::npos) {
+ s.replace(pos, search.length(), replace);
+ pos += replace.length();
+ }
+}
+
void string_process_escapes(std::string & input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;
@@ -2042,8 +2069,8 @@ std::string fs_get_cache_file(const std::string & filename) {
//
// Model utils
//
-
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+ llama_init_result iparams;
auto mparams = llama_model_params_from_gpt_params(params);
llama_model * model = nullptr;
@@ -2058,7 +2085,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
- return std::make_tuple(nullptr, nullptr);
+ return iparams;
}
auto cparams = llama_context_params_from_gpt_params(params);
@@ -2067,7 +2094,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
if (lctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
+ return iparams;
}
if (!params.control_vectors.empty()) {
@@ -2078,7 +2105,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
if (cvec.n_embd == -1) {
llama_free(lctx);
llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
+ return iparams;
}
int err = llama_control_vector_apply(lctx,
@@ -2090,21 +2117,26 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
if (err) {
llama_free(lctx);
llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
+ return iparams;
}
}
- for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
- const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
- float lora_scale = std::get<1>(params.lora_adapter[i]);
- auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
- if (adapter == nullptr) {
- fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+ // load and optionally apply lora adapters
+ for (auto & la : params.lora_adapters) {
+ llama_lora_adapter_container loaded_la;
+ loaded_la.path = la.path;
+ loaded_la.scale = la.scale;
+ loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+ if (loaded_la.adapter == nullptr) {
+ fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
llama_free(lctx);
llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
+ return iparams;
}
- llama_lora_adapter_set(lctx, adapter, lora_scale);
+ iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+ }
+ if (!params.lora_init_without_apply) {
+ llama_lora_adapters_apply(lctx, iparams.lora_adapters);
}
if (params.ignore_eos) {
@@ -2132,13 +2164,26 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
tmp.clear();
tmp.push_back(decoder_start_token_id);
}
- llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+ if (llama_model_has_decoder(model)) {
+ llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+ }
llama_kv_cache_clear(lctx);
llama_synchronize(lctx);
llama_reset_timings(lctx);
}
- return std::make_tuple(model, lctx);
+ iparams.model = model;
+ iparams.context = lctx;
+ return iparams;
+}
+
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+ llama_lora_adapter_clear(ctx);
+ for (auto & la : lora_adapters) {
+ if (la.scale != 0.0f) {
+ llama_lora_adapter_set(ctx, la.adapter, la.scale);
+ }
+ }
}
struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
@@ -3163,19 +3208,18 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
}
fprintf(stream, "lora:\n");
- for (std::tuple<std::string, float> la : params.lora_adapter) {
- if (std::get<1>(la) != 1.0f) {
- continue;
+ for (auto & la : params.lora_adapters) {
+ if (la.scale == 1.0f) {
+ fprintf(stream, " - %s\n", la.path.c_str());
}
- fprintf(stream, " - %s\n", std::get<0>(la).c_str());
}
fprintf(stream, "lora_scaled:\n");
- for (std::tuple<std::string, float> la : params.lora_adapter) {
- if (std::get<1>(la) == 1.0f) {
- continue;
+ for (auto & la : params.lora_adapters) {
+ if (la.scale != 1.0f) {
+ fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
}
- fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
}
+ fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
diff --git a/common/common.h b/common/common.h
index 979762e1..50035897 100644
--- a/common/common.h
+++ b/common/common.h
@@ -33,6 +33,15 @@
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
+struct llama_lora_adapter_info {
+ std::string path;
+ float scale;
+};
+
+struct llama_lora_adapter_container : llama_lora_adapter_info {
+ struct llama_lora_adapter * adapter;
+};
+
// build info
extern int LLAMA_BUILD_NUMBER;
extern char const * LLAMA_COMMIT;
@@ -126,8 +135,8 @@ struct gpt_params {
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
- // TODO: avoid tuple, use struct
- std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
+ bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
+ std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
@@ -278,6 +287,8 @@ std::vector<std::string> string_split(std::string input, char separator);
std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();
+void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
+
template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
std::vector<T> values;
@@ -309,8 +320,13 @@ std::string fs_get_cache_file(const std::string & filename);
// Model utils
//
-// TODO: avoid tuplue, use struct
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
+struct llama_init_result {
+ struct llama_model * model = nullptr;
+ struct llama_context * context = nullptr;
+ std::vector<llama_lora_adapter_container> lora_adapters;
+};
+
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
@@ -318,6 +334,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+// clear LoRA adapters from context, then apply new list of adapters
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+
// Batch utils
void llama_batch_clear(struct llama_batch & batch);
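
With the return type changed from a tuple to llama_init_result, call sites need a small, mechanical update. A sketch of the before/after, assuming a caller that previously unpacked the tuple with std::tie; example_init is a hypothetical wrapper:

#include "common.h"

// before the change (assumed caller pattern):
//     llama_model * model; llama_context * ctx;
//     std::tie(model, ctx) = llama_init_from_gpt_params(params);
// after the change:
static bool example_init(gpt_params & params) {
    llama_init_result llama_init = llama_init_from_gpt_params(params);

    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;
    if (model == nullptr || ctx == nullptr) {
        return false; // a failed init now comes back as null members rather than a null tuple
    }

    // ... run inference with model/ctx; loaded adapters are available in llama_init.lora_adapters ...

    llama_free(ctx);
    llama_free_model(model);
    return true;
}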