author     Kawrakow <48489457+ikawrakow@users.noreply.github.com>   2024-07-27 07:55:01 +0200
committer  GitHub <noreply@github.com>                              2024-07-27 07:55:01 +0200
commit     154e0d75fccf1784fe9ff6fd76a630b66563da3d (patch)
tree       81ce6dbb5b1900c1aa78a879f0593c694cab9d27 /common/common.h
parent     0684c3e9c70d49323b4fc517128cbe222cab7f96 (diff)
Merge mainline llama.cpp (#3)
* Merging mainline - WIP
* Merging mainline - WIP
AVX2 and CUDA appear to work.
CUDA performance seems slightly (~1-2%) lower, as is so often the case
with llama.cpp/ggml after some "improvements" have been made.
* Merging mainline - fix Metal
* Remove check
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'common/common.h')
-rw-r--r--  common/common.h  81
1 file changed, 57 insertions, 24 deletions
diff --git a/common/common.h b/common/common.h
index bb45b3b4..979762e1 100644
--- a/common/common.h
+++ b/common/common.h
@@ -52,6 +52,12 @@ int32_t cpu_get_num_math();
 // CLI argument parsing
 //
 
+// dimensionality reduction methods, used by cvector-generator
+enum dimre_method {
+    DIMRE_METHOD_PCA,
+    DIMRE_METHOD_MEAN,
+};
+
 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
 
@@ -93,6 +99,7 @@ struct gpt_params {
     enum llama_split_mode        split_mode        = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
     // // sampling parameters
     struct llama_sampling_params sparams;
@@ -101,6 +108,7 @@ struct gpt_params {
     std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
     std::string model_url = ""; // model url to download
+    std::string hf_token = ""; // HF token
     std::string hf_repo = ""; // HF repo
     std::string hf_file = ""; // HF file
     std::string prompt = "";
@@ -120,7 +128,6 @@ struct gpt_params {
 
     // TODO: avoid tuple, use struct
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
-    std::string lora_base = ""; // base model path for the lora adapter
 
     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
 
@@ -152,7 +159,6 @@ struct gpt_params {
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro = false;  // open the prompt cache read-only and do not update it
 
-    bool embedding = false; // get only sentence embedding
     bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
@@ -179,6 +185,12 @@ struct gpt_params {
     std::string mmproj = "";        // path to multimodal projector
     std::vector<std::string> image; // path to image file(s)
 
+    // embedding
+    bool embedding = false; // get only sentence embedding
+    int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+    std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
+    std::string embd_sep = "\n"; // separator of embendings
+
     // server params
     int32_t port = 8080; // server listens on this network port
     int32_t timeout_read = 600; // http read timeout in seconds
@@ -189,6 +201,7 @@ struct gpt_params {
     std::string public_path = "";
     std::string chat_template = "";
     std::string system_prompt = "";
+    bool enable_chat_template = true;
 
     std::vector<std::string> api_keys;
 
@@ -234,15 +247,19 @@ struct gpt_params {
     bool compute_ppl = true; // whether to compute perplexity
 
     // cvector-generator params
-    int n_completions = 64;
-    int n_pca_batch = 20;
+    int n_pca_batch = 100;
     int n_pca_iterations = 1000;
-    std::string cvector_outfile = "control_vector.gguf";
-    std::string cvector_completions_file = "examples/cvector-generator/completions.txt";
-    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
-    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
+    std::string cvector_outfile = "control_vector.gguf";
+    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+
+    bool spm_infill = false; // suffix/prefix/middle pattern for infill
+
+    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
 };
 
+void gpt_params_handle_hf_token(gpt_params & params);
 void gpt_params_handle_model_default(gpt_params & params);
 
 bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
@@ -298,8 +315,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
 
-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
-struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 
 // Batch utils
 
@@ -337,21 +354,13 @@ std::string llama_token_to_piece(
         llama_token token,
         bool special = true);
 
-// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
-// that takes into account the tokenizer type and decides how to handle the leading space
-//
-// detokenizes a vector of tokens into a string
-// should work similar to Python's `tokenizer.decode`
-// removes the leading space from the first non-BOS token
-std::string llama_detokenize_spm(
-        llama_context * ctx,
-        const std::vector<llama_token> & tokens);
-
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
-std::string llama_detokenize_bpe(
+// optionally renders special/control tokens
+std::string llama_detokenize(
         llama_context * ctx,
-        const std::vector<llama_token> & tokens);
+        const std::vector<llama_token> & tokens,
+        bool special = true);
 
 // Uses the value from the model metadata if possible, otherwise
 // defaults to true when model type is SPM, otherwise false.
@@ -361,9 +370,34 @@ bool llama_should_add_bos_token(const llama_model * model);
 // Chat template utils
 //
 
+// same with llama_chat_message, but uses std::string
+struct llama_chat_msg {
+    std::string role;
+    std::string content;
+};
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool llama_chat_verify_template(const std::string & tmpl);
 
+// CPP wrapper for llama_chat_apply_template
+// If the built-in template is not supported, we default to chatml
+// If the custom "tmpl" is not supported, we throw an error
+std::string llama_chat_apply_template(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & chat,
+        bool add_ass);
+
+// Format single message, while taking into account the position of that message in chat history
+std::string llama_chat_format_single(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & past_msg,
+        const llama_chat_msg & new_msg,
+        bool add_ass);
+
+// Returns an example of formatted chat
+std::string llama_chat_format_example(const struct llama_model * model,
+        const std::string & tmpl);
+
 //
 // KV cache utils
 //
@@ -378,7 +412,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
 // Embedding utils
 //
 
-void llama_embd_normalize(const float * inp, float * out, int n);
+void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
 
 float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
 
@@ -422,4 +456,3 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
 void yaml_dump_non_result_info(
     FILE * stream, const gpt_params & params, const llama_context * lctx,
     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
-
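To make the API changes above concrete, here is a minimal usage sketch (not part of the commit) against the declarations in the updated common/common.h. The demo() wrapper, the hard-coded chat messages, and the choice of an empty template string (assumed to fall back to the model's built-in chat template) are illustrative assumptions; only the function and struct names come from the diff.

    #include "common.h"
    #include "llama.h"

    #include <cstdio>
    #include <string>
    #include <vector>

    // Illustrative only: ctx/model are assumed to be already initialized, e.g. via
    // llama_init_from_gpt_params(), and `tokens` to hold a generated sequence.
    static void demo(llama_context * ctx, const llama_model * model,
                     const std::vector<llama_token> & tokens) {
        // The unified llama_detokenize() replaces llama_detokenize_spm()/_bpe();
        // special = true also renders special/control tokens.
        std::string text = llama_detokenize(ctx, tokens, /*special =*/ true);
        printf("detokenized: %s\n", text.c_str());

        // std::string-based chat helpers: per the header comments, an unsupported
        // custom template throws, an unsupported built-in template falls back to chatml.
        std::vector<llama_chat_msg> chat = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!" },
        };
        std::string prompt = llama_chat_apply_template(model, /*tmpl =*/ "", chat, /*add_ass =*/ true);
        printf("formatted prompt:\n%s\n", prompt.c_str());
    }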