author     Kawrakow <48489457+ikawrakow@users.noreply.github.com>    2024-07-27 07:55:01 +0200
committer  GitHub <noreply@github.com>                               2024-07-27 07:55:01 +0200
commit     154e0d75fccf1784fe9ff6fd76a630b66563da3d (patch)
tree       81ce6dbb5b1900c1aa78a879f0593c694cab9d27 /common/common.h
parent     0684c3e9c70d49323b4fc517128cbe222cab7f96 (diff)
Merge mainline llama.cpp (#3)
* Merging mainline - WIP

* Merging mainline - WIP

  AVX2 and CUDA appear to work. CUDA performance seems slightly (~1-2%) lower, as is so often the case with llama.cpp/ggml after some "improvements" have been made.

* Merging mainline - fix Metal

* Remove check

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'common/common.h')
-rw-r--r--  common/common.h | 81
1 file changed, 57 insertions, 24 deletions
diff --git a/common/common.h b/common/common.h
index bb45b3b4..979762e1 100644
--- a/common/common.h
+++ b/common/common.h
@@ -52,6 +52,12 @@ int32_t cpu_get_num_math();
// CLI argument parsing
//
+// dimensionality reduction methods, used by cvector-generator
+enum dimre_method {
+ DIMRE_METHOD_PCA,
+ DIMRE_METHOD_MEAN,
+};
+
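
The enum above only names the two supported reduction methods; the CLI flag that selects between them is not part of this header. A minimal sketch of how an option string might be mapped to the enum; the helper name and the accepted strings are illustrative assumptions, not part of this commit:

    #include <stdexcept>
    #include <string>

    // Hypothetical helper (not in this commit): map an option string to dimre_method.
    static dimre_method dimre_method_from_string(const std::string & s) {
        if (s == "pca")  { return DIMRE_METHOD_PCA;  } // principal component analysis
        if (s == "mean") { return DIMRE_METHOD_MEAN; } // mean-based reduction
        throw std::invalid_argument("unknown dimensionality reduction method: " + s);
    }
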
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
@@ -93,6 +99,7 @@ struct gpt_params {
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+ enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
// sampling parameters
struct llama_sampling_params sparams;
@@ -101,6 +108,7 @@ struct gpt_params {
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download
+ std::string hf_token = ""; // HF token
std::string hf_repo = ""; // HF repo
std::string hf_file = ""; // HF file
std::string prompt = "";
@@ -120,7 +128,6 @@ struct gpt_params {
// TODO: avoid tuple, use struct
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
- std::string lora_base = ""; // base model path for the lora adapter
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
@@ -152,7 +159,6 @@ struct gpt_params {
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
- bool embedding = false; // get only sentence embedding
bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
@@ -179,6 +185,12 @@ struct gpt_params {
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
+ // embedding
+ bool embedding = false; // get only sentence embedding
+ int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+ std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
+ std::string embd_sep = "\n"; // separator of embeddings
+
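
The embd_normalize values map to a family of norms. Below is a minimal sketch of the scaling each value implies, written only against the comment above; it is an illustration, not the implementation behind llama_embd_normalize(), and the int16 scaling constant is an assumption:

    #include <algorithm>
    #include <cmath>

    // Illustration of the embd_normalize convention described above.
    static void embd_normalize_sketch(const float * inp, float * out, int n, int embd_norm) {
        double norm = 0.0;
        if (embd_norm == -1) {
            norm = 1.0;                                  // -1: no normalisation
        } else if (embd_norm == 0) {
            for (int i = 0; i < n; i++) norm = std::max(norm, (double) std::fabs(inp[i]));
            norm /= 32760.0;                             //  0: scale max |x| into int16 range (assumed constant)
        } else {
            for (int i = 0; i < n; i++) norm += std::pow(std::fabs((double) inp[i]), embd_norm);
            norm = std::pow(norm, 1.0 / embd_norm);      //  1: taxicab, 2: euclidean, >2: p-norm
        }
        const double scale = norm > 0.0 ? 1.0 / norm : 0.0;
        for (int i = 0; i < n; i++) out[i] = (float) (inp[i] * scale);
    }
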
// server params
int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds
@@ -189,6 +201,7 @@ struct gpt_params {
std::string public_path = "";
std::string chat_template = "";
std::string system_prompt = "";
+ bool enable_chat_template = true;
std::vector<std::string> api_keys;
@@ -234,15 +247,19 @@ struct gpt_params {
bool compute_ppl = true; // whether to compute perplexity
// cvector-generator params
- int n_completions = 64;
- int n_pca_batch = 20;
+ int n_pca_batch = 100;
int n_pca_iterations = 1000;
- std::string cvector_outfile = "control_vector.gguf";
- std::string cvector_completions_file = "examples/cvector-generator/completions.txt";
- std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
- std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+ dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
+ std::string cvector_outfile = "control_vector.gguf";
+ std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+ std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+
+ bool spm_infill = false; // suffix/prefix/middle pattern for infill
+
+ std::string lora_outfile = "ggml-lora-merged-f16.gguf";
};
+void gpt_params_handle_hf_token(gpt_params & params);
void gpt_params_handle_model_default(gpt_params & params);
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
@@ -298,8 +315,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
-struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
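
Both loaders gain an hf_token argument so gated or private Hugging Face repositories can be fetched. A minimal usage sketch under stated assumptions: the repo, file, and local path are placeholders, and reading the token from an HF_TOKEN environment variable is an illustrative choice, not something this commit mandates:

    #include <cstdlib>
    #include "common.h"
    #include "llama.h"

    // Usage sketch: all string arguments are placeholders, not from this commit.
    static llama_model * load_example_model() {
        llama_model_params mparams = llama_model_default_params();
        const char * token = std::getenv("HF_TOKEN");  // assumption: token supplied via environment
        return llama_load_model_from_hf(
                "some-org/some-model-GGUF",            // hf_repo (placeholder)
                "model-q4_0.gguf",                     // hf_file (placeholder)
                "models/model-q4_0.gguf",              // local path for the download (placeholder)
                token ? token : "",                    // new hf_token argument; empty for public repos
                mparams);
    }
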
// Batch utils
@@ -337,21 +354,13 @@ std::string llama_token_to_piece(
llama_token token,
bool special = true);
-// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
-// that takes into account the tokenizer type and decides how to handle the leading space
-//
-// detokenizes a vector of tokens into a string
-// should work similar to Python's `tokenizer.decode`
-// removes the leading space from the first non-BOS token
-std::string llama_detokenize_spm(
- llama_context * ctx,
- const std::vector<llama_token> & tokens);
-
// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
-std::string llama_detokenize_bpe(
+// optionally renders special/control tokens
+std::string llama_detokenize(
llama_context * ctx,
- const std::vector<llama_token> & tokens);
+ const std::vector<llama_token> & tokens,
+ bool special = true);
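
The tokenizer-specific helpers above are folded into a single llama_detokenize() that takes the tokenizer type into account itself. A minimal round-trip sketch, assuming ctx is an already initialized llama_context and using the llama_tokenize() helper declared earlier in this header:

    #include "common.h"

    // Round-trip sketch: tokenize a prompt, then detokenize it back to text.
    static std::string detokenize_roundtrip(llama_context * ctx) {
        std::vector<llama_token> toks = llama_tokenize(ctx, "Hello world", /*add_special=*/true);
        return llama_detokenize(ctx, toks, /*special=*/false); // false: do not render special/control tokens
    }
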
// Uses the value from the model metadata if possible, otherwise
// defaults to true when model type is SPM, otherwise false.
@@ -361,9 +370,34 @@ bool llama_should_add_bos_token(const llama_model * model);
// Chat template utils
//
+// same with llama_chat_message, but uses std::string
+struct llama_chat_msg {
+ std::string role;
+ std::string content;
+};
+
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool llama_chat_verify_template(const std::string & tmpl);
+// CPP wrapper for llama_chat_apply_template
+// If the built-in template is not supported, we default to chatml
+// If the custom "tmpl" is not supported, we throw an error
+std::string llama_chat_apply_template(const struct llama_model * model,
+ const std::string & tmpl,
+ const std::vector<llama_chat_msg> & chat,
+ bool add_ass);
+
+// Format single message, while taking into account the position of that message in chat history
+std::string llama_chat_format_single(const struct llama_model * model,
+ const std::string & tmpl,
+ const std::vector<llama_chat_msg> & past_msg,
+ const llama_chat_msg & new_msg,
+ bool add_ass);
+
+// Returns an example of formatted chat
+std::string llama_chat_format_example(const struct llama_model * model,
+ const std::string & tmpl);
+
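
These wrappers mirror the C-style llama_chat_apply_template() API but work with std::string and std::vector. A minimal usage sketch with placeholder messages; passing an empty template string selects the model's built-in template, with chatml as the fallback noted above:

    #include "common.h"

    // Sketch: build a prompt for a short chat, then format one follow-up message
    // relative to that history (all message content is placeholder text).
    static void chat_template_demo(const struct llama_model * model) {
        std::vector<llama_chat_msg> chat = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!" },
        };
        std::string prompt = llama_chat_apply_template(model, "", chat, /*add_ass=*/true);

        llama_chat_msg next = { "user", "One more question..." };
        std::string delta  = llama_chat_format_single(model, "", chat, next, /*add_ass=*/true);
        (void) prompt; (void) delta; // a real caller would tokenize and evaluate these
    }
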
//
// KV cache utils
//
@@ -378,7 +412,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
// Embedding utils
//
-void llama_embd_normalize(const float * inp, float * out, int n);
+void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
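
A short sketch tying the two utilities together: normalize two raw embedding vectors with the new embd_norm argument, then compare them. The buffers are assumed to hold n valid floats each:

    #include <vector>
    #include "common.h"

    // Sketch: euclidean-normalize two embeddings, then compute their cosine similarity.
    static float compare_embeddings(const float * a, const float * b, int n) {
        std::vector<float> na(n), nb(n);
        llama_embd_normalize(a, na.data(), n, /*embd_norm=*/2); // 2 = euclidean (the default)
        llama_embd_normalize(b, nb.data(), n, /*embd_norm=*/2);
        return llama_embd_similarity_cos(na.data(), nb.data(), n);
    }
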
@@ -422,4 +456,3 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
void yaml_dump_non_result_info(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
-