diff options
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 1766 |
1 files changed, 1343 insertions, 423 deletions
@@ -72,6 +72,7 @@ #include <numeric> #include <queue> #include <random> +#include <regex> #include <sstream> #include <thread> #include <unordered_map> @@ -80,20 +81,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -// tensor names -#define TN_TOKEN_EMBD "token_embd.weight" -#define TN_OUTPUT_NORM "output_norm.weight" -#define TN_OUTPUT "output.weight" -#define TN_ATTN_NORM "blk.%d.attn_norm.weight" -#define TN_ATTN_Q "blk.%d.attn_q.weight" -#define TN_ATTN_K "blk.%d.attn_k.weight" -#define TN_ATTN_V "blk.%d.attn_v.weight" -#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight" -#define TN_FFN_NORM "blk.%d.ffn_norm.weight" -#define TN_FFN_GATE "blk.%d.ffn_gate.weight" -#define TN_FFN_DOWN "blk.%d.ffn_down.weight" -#define TN_FFN_UP "blk.%d.ffn_up.weight" - #ifdef __GNUC__ #ifdef __MINGW32__ #define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) @@ -107,6 +94,7 @@ // // logging // + LLAMA_ATTRIBUTE_FORMAT(2, 3) static void llama_log_internal (llama_log_level level, const char* format, ...); static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data); @@ -119,6 +107,21 @@ static void llama_log_callback_default(llama_log_level level, const char * text, // helpers // +static size_t utf8_len(char src) { + const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; + uint8_t highbits = static_cast<uint8_t>(src) >> 4; + return lookup[highbits]; +} + +void replace_all(std::string & s, const std::string & search, const std::string & replace) { + for (size_t pos = 0; ; pos += replace.length()) { + pos = s.find(search, pos); + if (pos == std::string::npos) break; + s.erase(pos, search.length()); + s.insert(pos, replace); + } +} + static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { @@ -143,6 +146,241 @@ static std::string format(const char * fmt, ...) { } // +// gguf constants (sync with gguf.py) +// + +enum llm_arch { + LLM_ARCH_LLAMA, + LLM_ARCH_FALCON, + LLM_ARCH_GPT2, + LLM_ARCH_GPTJ, + LLM_ARCH_GPTNEOX, + LLM_ARCH_MPT, + LLM_ARCH_UNKNOWN, +}; + +static std::map<llm_arch, std::string> LLM_ARCH_NAMES = { + { LLM_ARCH_LLAMA, "llama" }, + { LLM_ARCH_FALCON, "falcon" }, + { LLM_ARCH_GPT2, "gpt2" }, + { LLM_ARCH_GPTJ, "gptj" }, + { LLM_ARCH_GPTNEOX, "gptneox" }, + { LLM_ARCH_MPT, "mpt" }, +}; + +enum llm_kv { + LLM_KV_GENERAL_ARCHITECTURE, + LLM_KV_GENERAL_QUANTIZATION_VERSION, + LLM_KV_GENERAL_ALIGNMENT, + LLM_KV_GENERAL_NAME, + LLM_KV_GENERAL_AUTHOR, + LLM_KV_GENERAL_URL, + LLM_KV_GENERAL_DESCRIPTION, + LLM_KV_GENERAL_LICENSE, + LLM_KV_GENERAL_SOURCE_URL, + LLM_KV_GENERAL_SOURCE_HF_REPO, + + LLM_KV_CONTEXT_LENGTH, + LLM_KV_EMBEDDING_LENGTH, + LLM_KV_BLOCK_COUNT, + LLM_KV_FEED_FORWARD_LENGTH, + LLM_KV_USE_PARALLEL_RESIDUAL, + LLM_KV_TENSOR_DATA_LAYOUT, + + LLM_KV_ATTENTION_HEAD_COUNT, + LLM_KV_ATTENTION_HEAD_COUNT_KV, + LLM_KV_ATTENTION_MAX_ALIBI_BIAS, + LLM_KV_ATTENTION_CLAMP_KQV, + LLM_KV_ATTENTION_LAYERNORM_EPS, + LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, + + LLM_KV_ROPE_DIMENSION_COUNT, + LLM_KV_ROPE_SCALE_LINEAR, + + LLM_KV_TOKENIZER_MODEL, + LLM_KV_TOKENIZER_LIST, + LLM_KV_TOKENIZER_TOKEN_TYPE, + LLM_KV_TOKENIZER_SCORES, + LLM_KV_TOKENIZER_MERGES, + LLM_KV_TOKENIZER_BOS_ID, + LLM_KV_TOKENIZER_EOS_ID, + LLM_KV_TOKENIZER_UNK_ID, + LLM_KV_TOKENIZER_SEP_ID, + LLM_KV_TOKENIZER_PAD_ID, + LLM_KV_TOKENIZER_HF_JSON, + LLM_KV_TOKENIZER_RWKV, +}; + +static std::map<llm_kv, std::string> LLM_KV_NAMES = { + { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, + { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" }, + { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" }, + { LLM_KV_GENERAL_NAME, "general.name" }, + { LLM_KV_GENERAL_AUTHOR, "general.author" }, + { LLM_KV_GENERAL_URL, "general.url" }, + { LLM_KV_GENERAL_DESCRIPTION, "general.description" }, + { LLM_KV_GENERAL_LICENSE, "general.license" }, + { LLM_KV_GENERAL_SOURCE_URL, "general.source_url" }, + { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source_hf_repo" }, + + { LLM_KV_CONTEXT_LENGTH, "%s.context_length" }, + { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" }, + { LLM_KV_BLOCK_COUNT, "%s.block_count" }, + { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" }, + { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" }, + { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" }, + + { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, + { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, + { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" }, + { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" }, + { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, + { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, + + { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, + { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, + + { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, + { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, + { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, + { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" }, + { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" }, + { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" }, + { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" }, + { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" }, + { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" }, + { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, + { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, + { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, +}; + +struct LLM_KV { + LLM_KV(llm_arch arch) : arch(arch) {} + + llm_arch arch; + + std::string operator()(llm_kv kv) const { + return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str()); + } +}; + +enum llm_tensor { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_POS_EMBD, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_NORM_2, + LLM_TENSOR_ATTN_ROT_EMBD, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_NORM, +}; + +static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = { + { + LLM_ARCH_LLAMA, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, + { + LLM_ARCH_FALCON, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, +}; + +static llm_arch llm_arch_from_string(const std::string & name) { + for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT + if (kv.second == name) { + return kv.first; + } + } + + return LLM_ARCH_UNKNOWN; +} + +// helper to handle gguf constants +// usage: +// +// const auto tn = LLM_TN(LLM_ARCH_LLAMA); +// +// std::string name = tn(LLM_TENSOR_OUTPUT); -> "output" +// std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias"); -> "token_embd.bias" +// std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); -> "blk.3.attn_norm.weight" +// +struct LLM_TN { + LLM_TN(llm_arch arch) : arch(arch) {} + + llm_arch arch; + + std::string operator()(llm_tensor tensor) const { + return LLM_TENSOR_NAMES[arch].at(tensor); + } + + std::string operator()(llm_tensor tensor, const std::string & suffix) const { + return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix; + } + + std::string operator()(llm_tensor tensor, int bid) const { + return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid); + } + + std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const { + return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix; + } +}; + +// +// gguf helpers +// + +#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ +{ \ + const std::string skey(key); \ + const int kid = gguf_find_key(ctx, skey.c_str()); \ + if (kid >= 0) { \ + enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ + if (ktype != (type)) { \ + throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \ + } \ + (dst) = func(ctx, kid); \ + } else if (req) { \ + throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \ + } \ +} + +// // ggml helpers // @@ -589,12 +827,13 @@ enum e_model { MODEL_7B, MODEL_13B, MODEL_30B, + MODEL_40B, MODEL_65B, MODEL_70B, }; static const size_t kB = 1024; -static const size_t MB = 1024*1024; +static const size_t MB = kB*kB; // default hparams (LLaMA 7B) struct llama_hparams { @@ -608,6 +847,7 @@ struct llama_hparams { uint32_t n_rot = 64; uint32_t n_ff = 11008; + float f_norm_eps = 1e-5; float f_norm_rms_eps = 1e-5; float rope_freq_base = 10000.0f; @@ -641,21 +881,25 @@ struct llama_hparams { struct llama_layer { // normalization - struct ggml_tensor * attention_norm; + struct ggml_tensor * attn_norm; + struct ggml_tensor * attn_norm_b; + struct ggml_tensor * attn_norm_2; + struct ggml_tensor * attn_norm_2_b; // attention struct ggml_tensor * wq; struct ggml_tensor * wk; struct ggml_tensor * wv; struct ggml_tensor * wo; + struct ggml_tensor * wqkv; // normalization struct ggml_tensor * ffn_norm; // ff - struct ggml_tensor * w1; - struct ggml_tensor * w2; - struct ggml_tensor * w3; + struct ggml_tensor * w1; // ffn_gate + struct ggml_tensor * w2; // ffn_down + struct ggml_tensor * w3; // ffn_up }; struct llama_kv_cache { @@ -681,10 +925,6 @@ struct llama_kv_cache { }; struct llama_vocab { - // TODO: - // - add a vector of merges - // so that we can pass it to different types of tokenizers with a common interface - using id = int32_t; using token = std::string; using ttype = llama_token_type; @@ -695,11 +935,13 @@ struct llama_vocab { ttype type; }; - llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; + enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; std::unordered_map<token, id> token_to_id; std::vector<token_data> id_to_token; + std::map<std::pair<std::string, std::string>, int> bpe_ranks; + // default LLaMA special tokens id special_bos_id = 1; id special_eos_id = 2; @@ -708,21 +950,40 @@ struct llama_vocab { id special_pad_id = -1; id linefeed_id = 13; + + int find_bpe_rank(std::string token_left, std::string token_right) const { + replace_all(token_left, " ", "Ġ"); + replace_all(token_left, "\n", "Ċ"); + replace_all(token_right, " ", "Ġ"); + replace_all(token_right, "\n", "Ċ"); + + auto it = bpe_ranks.find(std::make_pair(token_left, token_right)); + if (it == bpe_ranks.end()) { + return -1; + } + + return it->second; + } }; struct llama_model { e_model type = MODEL_UNKNOWN; + llm_arch arch = LLM_ARCH_UNKNOWN; llama_ftype ftype = LLAMA_FTYPE_ALL_F32; + std::string name = "n/a"; + llama_hparams hparams; llama_vocab vocab; struct ggml_tensor * tok_embeddings; - struct ggml_tensor * norm; + struct ggml_tensor * output_norm; + struct ggml_tensor * output_norm_b; struct ggml_tensor * output; std::vector<llama_layer> layers; + int n_gpu_layers; // context @@ -800,8 +1061,6 @@ struct llama_context { // key + value cache for the self attention struct llama_kv_cache kv_self; - size_t mem_per_token = 0; - // decode output (2-dimensional array: [n_tokens][n_vocab]) std::vector<float> logits; bool logits_all = false; @@ -880,11 +1139,11 @@ static bool llama_kv_cache_init( // model loading and saving // -enum llama_file_version { +enum llama_fver { GGUF_FILE_VERSION_V1 = 1, }; -static const char * llama_file_version_name(llama_file_version version) { +static const char * llama_file_version_name(llama_fver version) { switch (version) { case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)"; } @@ -892,11 +1151,11 @@ static const char * llama_file_version_name(llama_file_version version) { return "unknown"; } -static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) { +static std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) { char buf[256]; - snprintf(buf, sizeof(buf), "%5u", ne.at(0)); + snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0)); for (size_t i = 1; i < ne.size(); i++) { - snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5u", ne.at(i)); + snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i)); } return buf; } @@ -919,9 +1178,9 @@ struct llama_model_loader { bool use_mmap = false; - llama_file file; + llama_file file; llama_ftype ftype; - llama_file_version fver; + llama_fver fver; std::unique_ptr<llama_mmap> mapping; @@ -942,7 +1201,7 @@ struct llama_model_loader { n_kv = gguf_get_n_kv(ctx_gguf); n_tensors = gguf_get_n_tensors(ctx_gguf); - fver = (enum llama_file_version) gguf_get_version(ctx_gguf); + fver = (enum llama_fver ) gguf_get_version(ctx_gguf); for (int i = 0; i < n_tensors; i++) { const char * name = gguf_get_tensor_name(ctx_gguf, i); @@ -1039,6 +1298,21 @@ struct llama_model_loader { } } + std::string get_arch_name() const { + const auto kv = LLM_KV(LLM_ARCH_UNKNOWN); + + std::string arch_name; + GGUF_GET_KEY(ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE)); + + return arch_name; + } + + enum llm_arch get_arch() const { + const std::string arch_name = get_arch_name(); + + return llm_arch_from_string(arch_name); + } + const char * get_tensor_name(int i) const { return gguf_get_tensor_name(ctx_gguf, i); } @@ -1076,7 +1350,7 @@ struct llama_model_loader { return tensor; } - struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) { + struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend backend) { struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str()); if (cur == NULL) { @@ -1244,228 +1518,279 @@ static const char * llama_model_type_name(e_model type) { case MODEL_7B: return "7B"; case MODEL_13B: return "13B"; case MODEL_30B: return "30B"; + case MODEL_40B: return "40B"; case MODEL_65B: return "65B"; case MODEL_70B: return "70B"; - default: GGML_ASSERT(false); + default: return "?B"; } } -static void llama_model_load_internal( - const std::string & fname, +static void llm_load_arch(llama_model_loader & ml, llama_model & model) { + model.arch = ml.get_arch(); + if (model.arch == LLM_ARCH_UNKNOWN) { + throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'"); + } +} + +static void llm_load_hparams( + llama_model_loader & ml, llama_model & model, - llama_vocab & vocab, int n_ctx, - int n_batch, - int n_gpu_layers, - int main_gpu, - const float * tensor_split, - const bool mul_mat_q, float rope_freq_base, - float rope_freq_scale, - bool low_vram, - ggml_type memory_type, - bool use_mmap, - bool use_mlock, - bool vocab_only, - llama_progress_callback progress_callback, - void * progress_callback_user_data) { - model.t_start_us = ggml_time_us(); + float rope_freq_scale) { + struct gguf_context * ctx = ml.ctx_gguf; - std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap)); - - model.n_gpu_layers = n_gpu_layers; + const auto kv = LLM_KV(model.arch); auto & hparams = model.hparams; - std::string general_name = "n/a"; - std::string general_arch = "n/a"; + // get general kv + GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME)); + + // get hparams kv + GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST)); + GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH)); + GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); + GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); + GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); + GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); - // read hparams + // n_head_kv is optional, default to n_head + hparams.n_head_kv = hparams.n_head; + GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); + + // TODO: manually setting rope scale should override this + // rope_freq_scale (inverse of the kv) is optional { - struct gguf_context * ctx = ml->ctx_gguf; - -#define GGUF_GET(dst, func, type, req, key) \ - { \ - const int kid = gguf_find_key(ctx, key); \ - if (kid >= 0) { \ - enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ - if (ktype != (type)) { \ - throw std::runtime_error(format("key %s has wrong type: %s", key, gguf_type_name(ktype))); \ - } \ - (dst) = func(ctx, kid); \ - } else if (req) { \ - throw std::runtime_error(format("key not found in model: %s", key)); \ - } \ + float ropescale = 1.0f; + GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + if (ropescale != 1.0f) { + rope_freq_scale = 1.0f/ropescale; } + } - std::string tokenizer_name; - GGUF_GET(tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, "tokenizer.ggml.model"); + // sanity check for n_rot (optional) + { + hparams.n_rot = hparams.n_embd / hparams.n_head; - if (tokenizer_name == "llama") { - vocab.type = LLAMA_VOCAB_TYPE_SPM; - } else if (tokenizer_name == "gpt2") { - vocab.type = LLAMA_VOCAB_TYPE_BPE; - } else { - LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str()); - LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__); - vocab.type = LLAMA_VOCAB_TYPE_SPM; + GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT)); + + if (hparams.n_rot != hparams.n_embd / hparams.n_head) { + throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head)); } + } - // get hparams kv - GGUF_GET(hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, "tokenizer.ggml.tokens"); - GGUF_GET(hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.context_length"); - GGUF_GET(hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.embedding_length"); - GGUF_GET(hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.feed_forward_length"); - GGUF_GET(hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.attention.head_count"); - GGUF_GET(hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.block_count"); - GGUF_GET(hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.rope.dimension_count"); - GGUF_GET(hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, "llama.attention.layer_norm_rms_epsilon"); + // arch-specific KVs + switch (model.arch) { + case LLM_ARCH_LLAMA: + { + GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + + switch (hparams.n_layer) { + case 26: model.type = e_model::MODEL_3B; break; + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_13B; break; + case 60: model.type = e_model::MODEL_30B; break; + case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_FALCON: + { + GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); - // n_head_kv is optional, default to n_head - hparams.n_head_kv = hparams.n_head; - GGUF_GET(hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "llama.attention.head_count_kv"); + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 60: model.type = e_model::MODEL_40B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + default: (void)0; + }; - // TODO: manually setting rope scale should override this - // rope_freq_scale (inverse of the kv) is optional - float ropescale = 1.0f; - GGUF_GET(ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, "llama.rope.scale_linear"); - if (ropescale != 1.0f) { - rope_freq_scale = 1.0f/ropescale; - } + model.ftype = ml.ftype; - // get general kv - GGUF_GET(general_name, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.name"); - GGUF_GET(general_arch, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.architecture"); + hparams.n_ctx = n_ctx; + hparams.rope_freq_base = rope_freq_base; + hparams.rope_freq_scale = rope_freq_scale; +} - // special tokens - GGUF_GET(vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.bos_token_id"); - GGUF_GET(vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.eos_token_id"); - GGUF_GET(vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.unknown_token_id"); - GGUF_GET(vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.separator_token_id"); - GGUF_GET(vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.padding_token_id"); +// TODO: This should probably be in llama.h +static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape); -#undef GGUF_GET +static void llm_load_vocab( + llama_model_loader & ml, + llama_model & model) { + auto & vocab = model.vocab; - switch (hparams.n_layer) { - case 26: model.type = e_model::MODEL_3B; break; - case 32: model.type = e_model::MODEL_7B; break; - case 40: model.type = e_model::MODEL_13B; break; - case 60: model.type = e_model::MODEL_30B; break; - case 80: model.type = e_model::MODEL_65B; break; - default: - { - if (hparams.n_layer < 32) { - model.type = e_model::MODEL_7B; - } - } break; - } + struct gguf_context * ctx = ml.ctx_gguf; - model.ftype = ml->ftype; + const auto kv = LLM_KV(model.arch); - hparams.n_ctx = n_ctx; + const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str()); + if (token_idx == -1) { + throw std::runtime_error("cannot find tokenizer vocab in model file\n"); + } - // LLaMAv2 - // TODO: probably not needed - { - const auto n_gqa = hparams.n_gqa(); + const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str()); + if (score_idx == -1) { + throw std::runtime_error("cannot find tokenizer scores in model file\n"); + } - if (model.type == e_model::MODEL_65B && n_gqa == 8) { - LLAMA_LOG_WARN("%s: assuming 70B model based on GQA == %d\n", __func__, n_gqa); - model.type = e_model::MODEL_70B; - } - } + const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx); - hparams.rope_freq_base = rope_freq_base; - hparams.rope_freq_scale = rope_freq_scale; + const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str()); + if (toktype_idx == -1) { + throw std::runtime_error("cannot find token type list in GGUF file\n"); } - // read vocab + const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); + + // determine vocab type { - struct gguf_context * ctx = ml->ctx_gguf; + std::string tokenizer_name; - vocab.id_to_token.resize(hparams.n_vocab); + GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL)); - const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens"); - if (token_idx == -1) { - throw std::runtime_error("cannot find tokenizer vocab in model file\n"); - } + if (tokenizer_name == "llama") { + vocab.type = LLAMA_VOCAB_TYPE_SPM; - const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores"); - if (score_idx == -1) { - throw std::runtime_error("cannot find tokenizer scores in model file\n"); - } + // default special tokens + vocab.special_bos_id = 1; + vocab.special_eos_id = 2; + vocab.special_unk_id = 0; + vocab.special_sep_id = -1; + vocab.special_pad_id = -1; + } else if (tokenizer_name == "gpt2") { + vocab.type = LLAMA_VOCAB_TYPE_BPE; - const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx); + // read bpe merges and populate bpe ranks + const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str()); + if (merges_keyidx == -1) { + throw std::runtime_error("cannot find tokenizer merges in model file\n"); + } - const int toktype_idx = gguf_find_key(ctx, "tokenizer.ggml.token_type"); - if (toktype_idx == -1) { - throw std::runtime_error("cannot find token type list in GGUF file\n"); - } + const int n_merges = gguf_get_arr_n(ctx, merges_keyidx); - const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); + for (int i = 0; i < n_merges; i++) { + const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i); - for (uint32_t i = 0; i < hparams.n_vocab; i++) { - std::string word = gguf_get_arr_str(ctx, token_idx, i); + std::string first; + std::string second; - vocab.token_to_id[word] = i; + const size_t pos = word.find(' ', 1); - auto & token_data = vocab.id_to_token[i]; - token_data.text = std::move(word); - token_data.score = scores[i]; - token_data.type = (llama_token_type) toktypes[i]; + if (pos != std::string::npos) { + first = word.substr(0, pos); + second = word.substr(pos + 1); + } - // determine the newline token: 0x0A == 10 == '\n' - if (token_data.text == "<0x0A>") { - vocab.linefeed_id = i; + vocab.bpe_ranks.emplace(std::make_pair(first, second), i); } + + // default special tokens + vocab.special_bos_id = 11; + vocab.special_eos_id = 11; + vocab.special_unk_id = -1; + vocab.special_sep_id = -1; + vocab.special_pad_id = -1; + } else { + LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str()); + LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__); + + vocab.type = LLAMA_VOCAB_TYPE_SPM; } } - { - // hparams - LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml->fver)); - LLAMA_LOG_INFO("%s: arch = %s\n", __func__, general_arch.c_str()); - LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix - LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab); - LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); - LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx); - LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); - LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head); - LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); - LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); - LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim - LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa()); - LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); - LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); - LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); - LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); - LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); - LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); - LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml->n_elements*1e-9); - - // general kv - LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, general_name.c_str()); - - // special tokens - if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); } - if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); } - if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); } - if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); } - if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); } - if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } - } - - if (vocab_only) { - LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); - return; + const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx); + + vocab.id_to_token.resize(n_vocab); + + for (uint32_t i = 0; i < n_vocab; i++) { + std::string word = gguf_get_arr_str(ctx, token_idx, i); + + vocab.token_to_id[word] = i; + + auto & token_data = vocab.id_to_token[i]; + token_data.text = std::move(word); + token_data.score = scores[i]; + token_data.type = (llama_token_type) toktypes[i]; } - auto & ctx = model.ctx; + // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' + vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false, false)[0]; + + // special tokens + GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID)); + GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID)); + GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID)); + GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID)); + GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID)); +} + +static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { + const auto & hparams = model.hparams; + const auto & vocab = model.vocab; + + // hparams + LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver)); + LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str()); + LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix + LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab); + LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size()); + LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); + LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx); + LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); + LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head); + LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); + LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); + LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim + LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa()); + LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps); + LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); + LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); + LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); + LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); + LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); + LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); + LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9); + + // general kv + LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str()); + + // special tokens + if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); } + if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); } + if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); } + if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); } + if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); } + if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } +} + +static void llm_load_tensors( + llama_model_loader & ml, + llama_model & model, + int n_batch, + int n_gpu_layers, + int main_gpu, + const float * tensor_split, + const bool mul_mat_q, + bool low_vram, + ggml_type memory_type, + bool use_mlock, + llama_progress_callback progress_callback, + void * progress_callback_user_data) { + model.t_start_us = ggml_time_us(); + + auto & ctx = model.ctx; + auto & hparams = model.hparams; + + model.n_gpu_layers = n_gpu_layers; size_t ctx_size; size_t mmapped_size; - ml->calc_sizes(ctx_size, mmapped_size); + ml.calc_sizes(ctx_size, mmapped_size); LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0); @@ -1480,7 +1805,7 @@ static void llama_model_load_internal( struct ggml_init_params params = { /*.mem_size =*/ model.buf.size, /*.mem_buffer =*/ model.buf.data, - /*.no_alloc =*/ ml->use_mmap, + /*.no_alloc =*/ ml.use_mmap, }; model.ctx = ggml_init(params); @@ -1509,75 +1834,146 @@ static void llama_model_load_internal( // prepare memory for the weights size_t vram_weights = 0; { - const uint32_t n_embd = hparams.n_embd; - const uint32_t n_embd_gqa = hparams.n_embd_gqa(); - const uint32_t n_layer = hparams.n_layer; - const uint32_t n_vocab = hparams.n_vocab; + const int64_t n_embd = hparams.n_embd; + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const int64_t n_layer = hparams.n_layer; + const int64_t n_vocab = hparams.n_vocab; - model.tok_embeddings = ml->create_tensor(ctx, TN_TOKEN_EMBD, {n_embd, n_vocab}, GGML_BACKEND_CPU); + const auto tn = LLM_TN(model.arch); - // "output" tensor - { - ggml_backend backend_norm; - ggml_backend backend_output; - if (n_gpu_layers > int(n_layer)) { // NOLINT - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU + switch (model.arch) { + case LLM_ARCH_LLAMA: + { + model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + + // output + { + ggml_backend backend_norm; + ggml_backend backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; #else - backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; #endif // _WIN32 - backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } + backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } - model.norm = ml->create_tensor(ctx, TN_OUTPUT_NORM, {n_embd}, backend_norm); - model.output = ml->create_tensor(ctx, TN_OUTPUT, {n_embd, n_vocab}, backend_output); - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.norm); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } - } + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.output_norm); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + vram_weights += ggml_nbytes(model.output); + } + } - const uint32_t n_ff = hparams.n_ff; + const uint32_t n_ff = hparams.n_ff; - const int i_gpu_start = n_layer - n_gpu_layers; + const int i_gpu_start = n_layer - n_gpu_layers; - model.layers.resize(n_layer); - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + model.layers.resize(n_layer); - auto & layer = model.layers[i]; - layer.attention_norm = ml->create_tensor(ctx, format(TN_ATTN_NORM, i), {n_embd}, backend); + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT - layer.wq = ml->create_tensor(ctx, format(TN_ATTN_Q, i), {n_embd, n_embd}, backend_split); - layer.wk = ml->create_tensor(ctx, format(TN_ATTN_K, i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml->create_tensor(ctx, format(TN_ATTN_V, i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml->create_tensor(ctx, format(TN_ATTN_OUTPUT, i), {n_embd, n_embd}, backend_split); + auto & layer = model.layers[i]; - layer.ffn_norm = ml->create_tensor(ctx, format(TN_FFN_NORM, i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.w1 = ml->create_tensor(ctx, format(TN_FFN_GATE, i), {n_embd, n_ff}, backend_split); - layer.w2 = ml->create_tensor(ctx, format(TN_FFN_DOWN, i), { n_ff, n_embd}, backend_split); - layer.w3 = ml->create_tensor(ctx, format(TN_FFN_UP, i), {n_embd, n_ff}, backend_split); + layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); + layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); - } - } + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + + layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + + if (backend == GGML_BACKEND_GPU) { + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + } + } + } break; + case LLM_ARCH_FALCON: + { + // TODO: CPU-only for now + + model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + + // output + { + ggml_backend backend_norm; + ggml_backend backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#else + backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#endif // _WIN32 + + backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + + if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) { + layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend); + layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend); + } + + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + + layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + } + } break; + default: + throw std::runtime_error("unknown architecture"); + }; } - ml->done_getting_tensors(); + ml.done_getting_tensors(); // print memory requirements { @@ -1589,8 +1985,7 @@ static void llama_model_load_internal( mmapped_size - vram_weights; // weights in VRAM not in memory // this is the memory required by one llama_state - const size_t mem_required_state = - scale*hparams.kv_size(); + const size_t mem_required_state = scale*hparams.kv_size(); LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); @@ -1640,8 +2035,8 @@ static void llama_model_load_internal( } // populate `tensors_by_name` - for (int i = 0; i < ml->n_tensors; ++i) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, ml->get_tensor_name(i)); + for (int i = 0; i < ml.n_tensors; ++i) { + struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i)); model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); } @@ -1652,13 +2047,13 @@ static void llama_model_load_internal( } #endif - ml->load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL); + ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL); if (progress_callback) { progress_callback(1.0f, progress_callback_user_data); } - model.mapping = std::move(ml->mapping); + model.mapping = std::move(ml.mapping); // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration @@ -1668,7 +2063,6 @@ static void llama_model_load_internal( static bool llama_model_load( const std::string & fname, llama_model & model, - llama_vocab & vocab, int n_ctx, int n_batch, int n_gpu_layers, @@ -1685,17 +2079,36 @@ static bool llama_model_load( llama_progress_callback progress_callback, void *progress_callback_user_data) { try { - llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, - main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type, - use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); - return true; + std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap)); + + llm_load_arch (*ml, model); + llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale); + llm_load_vocab (*ml, model); + + llm_load_print_meta(*ml, model); + + if (model.hparams.n_vocab != model.vocab.id_to_token.size()) { + throw std::runtime_error("vocab size mismatch"); + } + + if (vocab_only) { + LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); + return true; + } + + llm_load_tensors( + *ml, model, n_batch, n_gpu_layers, + main_gpu, tensor_split, mul_mat_q, low_vram, memory_type, + use_mlock, progress_callback, progress_callback_user_data); } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); return false; } + + return true; } -static struct ggml_cgraph * llama_build_graph( +static struct ggml_cgraph * llm_build_llama( llama_context & lctx, const llama_token * tokens, const float * embd, @@ -1729,8 +2142,7 @@ static struct ggml_cgraph * llama_build_graph( const int n_gpu_layers = model.n_gpu_layers; - auto & mem_per_token = lctx.mem_per_token; - auto & buf_compute = lctx.buf_compute; + auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { /*.mem_size =*/ buf_compute.size, @@ -1820,8 +2232,8 @@ static struct ggml_cgraph * llama_build_graph( offload_func(cur); ggml_set_name(cur, "rms_norm_0"); - // cur = cur*attention_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm); + // cur = cur*attn_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); offload_func(cur); ggml_set_name(cur, "attention_norm_0"); } @@ -1872,10 +2284,7 @@ static struct ggml_cgraph * llama_build_graph( ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); offload_func_kq(Q); ggml_set_name(Q, "Q"); @@ -2005,14 +2414,16 @@ static struct ggml_cgraph * llama_build_graph( inpL = cur; } + cur = inpL; + // norm { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); + cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); offload_func_nr(cur); ggml_set_name(cur, "rms_norm_2"); // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.norm); + cur = ggml_mul(ctx0, cur, model.output_norm); // offload_func_nr(cur); // TODO CPU + GPU mirrored backend ggml_set_name(cur, "result_norm"); } @@ -2021,20 +2432,349 @@ static struct ggml_cgraph * llama_build_graph( cur = ggml_mul_mat(ctx0, model.output, cur); ggml_set_name(cur, "result_output"); - // logits -> probs - //cur = ggml_soft_max_inplace(ctx0, cur); - ggml_build_forward_expand(gf, cur); - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; + ggml_free(ctx0); + + return gf; +} + +static struct ggml_cgraph * llm_build_falcon( + llama_context & lctx, + const llama_token * tokens, + const float * embd, + int n_tokens, + int n_past) { + + GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT + + const int N = n_tokens; + + const auto & model = lctx.model; + const auto & hparams = model.hparams; + + const auto & kv_self = lctx.kv_self; + + GGML_ASSERT(!!kv_self.ctx); + + const int64_t n_embd = hparams.n_embd; + const int64_t n_layer = hparams.n_layer; + const int64_t n_ctx = hparams.n_ctx; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + GGML_ASSERT(n_embd_head == hparams.n_rot); + + const float freq_base = hparams.rope_freq_base; + const float freq_scale = hparams.rope_freq_scale; + const float norm_eps = hparams.f_norm_eps; + + const int n_gpu_layers = model.n_gpu_layers; + + auto & buf_compute = lctx.buf_compute; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.data, + /*.no_alloc =*/ false, + }; + + params.no_alloc = true; + + struct ggml_context * ctx0 = ggml_init(params); + + ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + if (tokens) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + + ggml_allocr_alloc(lctx.alloc, inp_tokens); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); + } + ggml_set_name(inp_tokens, "inp_tokens"); + + inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + } else { +#ifdef GGML_USE_MPI + GGML_ASSERT(false && "not implemented"); +#endif + + inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + + ggml_allocr_alloc(lctx.alloc, inpL); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); + } } + const int i_gpu_start = n_layer - n_gpu_layers; + (void) i_gpu_start; + + // offload functions set the tensor output backend to GPU + // tensors are GPU-accelerated if any input or the output has been offloaded + // + // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal + // in that case ggml_cuda_assign_buffers has no effect + offload_func_t offload_func_nr = llama_nop; // nr = non-repeating + offload_func_t offload_func_kq = llama_nop; + offload_func_t offload_func_v = llama_nop; + +#ifdef GGML_USE_CUBLAS + if (n_gpu_layers > n_layer) { + offload_func_nr = ggml_cuda_assign_buffers_no_alloc; + } + if (n_gpu_layers > n_layer + 1) { + offload_func_v = ggml_cuda_assign_buffers_no_alloc; + } + if (n_gpu_layers > n_layer + 2) { + offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + } +#endif // GGML_USE_CUBLAS + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(lctx.alloc, KQ_scale); + if (!ggml_allocr_is_measure(lctx.alloc)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } + ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * attn_norm; + + offload_func_t offload_func = llama_nop; + +#ifdef GGML_USE_CUBLAS + if (il >= i_gpu_start) { + offload_func = ggml_cuda_assign_buffers_no_alloc; + } +#endif // GGML_USE_CUBLAS + + // self-attention + // TODO: refactor into common function (shared with LLaMA) + { + attn_norm = ggml_norm(ctx0, inpL, norm_eps); + offload_func(attn_norm); + + attn_norm = ggml_add(ctx0, + ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm), + model.layers[il].attn_norm_b); + offload_func(attn_norm->src[0]); + offload_func(attn_norm); + + if (model.layers[il].attn_norm_2) { // Falcon-40B + cur = ggml_norm(ctx0, inpL, norm_eps); + offload_func(cur); + + cur = ggml_add(ctx0, + ggml_mul(ctx0, cur, model.layers[il].attn_norm_2), + model.layers[il].attn_norm_2_b); + offload_func(cur->src[0]); + offload_func(cur); + } else { // Falcon 7B + cur = attn_norm; + } + + // compute QKV + + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + offload_func_kq(cur); + + // Note that the strides for Kcur, Vcur are set up so that the + // resulting views are misaligned with the tensor's storage + // (by applying the K/V offset we shift the tensor's original + // view to stick out behind the viewed QKV tensor's allocated + // memory, so to say). This is ok because no actual accesses + // happen to that out-of-range memory, but it can require some + // trickery when trying to accurately dump these views for + // debugging. + + const size_t wsize = ggml_type_size(cur->type); + + struct ggml_tensor * tmpq = ggml_view_3d( + ctx0, cur, n_embd_head, n_head, N, + wsize * n_embd_head, + wsize * n_embd_head * (n_head + 2 * n_head_kv), + 0); + offload_func_kq(tmpq); + + struct ggml_tensor * tmpk = ggml_view_3d( + ctx0, cur, n_embd_head, n_head_kv, N, + wsize * n_embd_head, + wsize * n_embd_head * (n_head + 2 * n_head_kv), + wsize * n_embd_head * n_head); + offload_func_kq(tmpk); + + struct ggml_tensor * tmpv = ggml_view_3d( + ctx0, cur, n_embd_head, n_head_kv, N, + wsize * n_embd_head, + wsize * n_embd_head * (n_head + 2 * n_head_kv), + wsize * n_embd_head * (n_head + n_head_kv)); + offload_func_v(tmpv); + + // using mode = 2 for neox mode + struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale); + offload_func_kq(Qcur); + struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale); + offload_func_kq(Kcur); + + { + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N)); + offload_func_v(Vcur); + offload_func_v(Vcur->src[0]->src[0]); + ggml_set_name(Vcur, "Vcur"); + + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); + offload_func_kq(k); + ggml_set_name(k, "k"); + + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); + offload_func_v(v); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + offload_func_kq(Q); + ggml_set_name(Q, "Q"); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_past + N, n_head_kv, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + offload_func_kq(K); + ggml_set_name(K, "K"); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); + ggml_set_name(KQ, "KQ"); + + struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); + offload_func_kq(KQ_scaled); + ggml_set_name(KQ_scaled, "KQ_scaled"); + + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + offload_func_kq(KQ_masked); + ggml_set_name(KQ_masked, "KQ_masked"); + + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + offload_func_v(KQ_soft_max); + ggml_set_name(KQ_soft_max, "KQ_soft_max"); + + struct ggml_tensor * V = + ggml_view_3d(ctx0, kv_self.v, + n_past + N, n_embd_head, n_head_kv, + ggml_element_size(kv_self.v)*n_ctx, + ggml_element_size(kv_self.v)*n_ctx*n_embd_head, + ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + offload_func_v(V); + ggml_set_name(V, "V"); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); + ggml_set_name(KQV, "KQV"); + + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + offload_func_v(KQV_merged); + ggml_set_name(KQV_merged, "KQV_merged"); + + cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + offload_func_v(cur); + ggml_set_name(cur, "KQV_merged_contiguous"); + + cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); + offload_func(cur); + ggml_set_name(cur, "result_wo"); + } + + struct ggml_tensor * attn_out = cur; + + // feed forward + { + struct ggml_tensor * inpFF = attn_norm; + + cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF); + + // TODO: this is temporary needed to introduce artificial dependency between FF and ATTN + // adding this, because there seems to be a bug in the Metal concurrency optimization + // without this line, the results are non-deterministic and wrong + cur->src[2] = attn_out; + offload_func(cur); + + cur = ggml_gelu(ctx0, cur); + offload_func(cur); + cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); + offload_func(cur); + } + + cur = ggml_add(ctx0, cur, attn_out); + offload_func(cur); + cur = ggml_add(ctx0, cur, inpL); + offload_func(cur); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + // norm + { + cur = ggml_norm(ctx0, cur, norm_eps); + offload_func_nr(cur); + + cur = ggml_add(ctx0, + ggml_mul(ctx0, cur, model.output_norm), + model.output_norm_b); + ggml_set_name(cur, "result_norm"); + } + + cur = ggml_mul_mat(ctx0, model.output, cur); + ggml_set_name(cur, "result_output"); + + ggml_build_forward_expand(gf, cur); + ggml_free(ctx0); return gf; } +static struct ggml_cgraph * llama_build_graph( + llama_context & lctx, + const llama_token * tokens, + const float * embd, + int n_tokens, + int n_past) { + const auto & model = lctx.model; + + struct ggml_cgraph * result = NULL; + + switch (model.arch) { + case LLM_ARCH_LLAMA: + { + result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past); + } break; + case LLM_ARCH_FALCON: + { + result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past); + } break; + default: + GGML_ASSERT(false); + }; + + return result; +} + // evaluate the transformer // // - lctx: llama context @@ -2077,8 +2817,8 @@ static bool llama_eval_internal( GGML_ASSERT(!!kv_self.ctx); - const int64_t n_embd = hparams.n_embd; - const int64_t n_vocab = hparams.n_vocab; + const int64_t n_embd = hparams.n_embd; + const int64_t n_vocab = hparams.n_vocab; ggml_allocr_reset(lctx.alloc); @@ -2108,11 +2848,11 @@ static bool llama_eval_internal( // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; - GGML_ASSERT(strcmp(res->name, "result_output") == 0); - GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); + GGML_ASSERT(strcmp(res->name, "result_output") == 0); + GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); #if GGML_USE_MPI const int64_t n_layer = hparams.n_layer; @@ -2271,13 +3011,7 @@ static std::string llama_unescape_whitespace(const std::string& word) { return word; } -static size_t utf8_len(char src) { - const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; - uint8_t highbits = static_cast<uint8_t>(src) >> 4; - return lookup[highbits]; -} - -struct llama_sp_symbol { +struct llm_symbol { using index = int; index prev; index next; @@ -2285,33 +3019,35 @@ struct llama_sp_symbol { size_t n; }; -static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable"); +static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable"); -struct llama_sp_bigram { +// SPM tokenizer +// original implementation: +// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4 + +struct llm_bigram_spm { struct comparator { - bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) { + bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) { return (l.score < r.score) || (l.score == r.score && l.left > r.left); } }; - using queue_storage = std::vector<llama_sp_bigram>; - using queue = std::priority_queue<llama_sp_bigram, queue_storage, comparator>; - llama_sp_symbol::index left; - llama_sp_symbol::index right; + using queue_storage = std::vector<llm_bigram_spm>; + using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>; + llm_symbol::index left; + llm_symbol::index right; float score; size_t size; }; -// original implementation: -// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4 -struct llama_tokenizer { - llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {} +struct llm_tokenizer_spm { + llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {} void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) { // split string into utf8 chars int index = 0; size_t offs = 0; while (offs < text.size()) { - llama_sp_symbol sym; + llm_symbol sym; size_t len = utf8_len(text[offs]); GGML_ASSERT(offs + len <= text.size()); sym.text = text.c_str() + offs; @@ -2320,21 +3056,21 @@ struct llama_tokenizer { sym.prev = index - 1; sym.next = offs == text.size() ? -1 : index + 1; index++; - symbols_.emplace_back(sym); + symbols.emplace_back(sym); } // seed the work queue with all possible 2-character tokens. - for (size_t i = 1; i < symbols_.size(); ++i) { + for (size_t i = 1; i < symbols.size(); ++i) { try_add_bigram(i - 1, i); } // keep substituting the highest frequency pairs for as long as we can. - while (!work_queue_.empty()) { - auto bigram = work_queue_.top(); - work_queue_.pop(); + while (!work_queue.empty()) { + auto bigram = work_queue.top(); + work_queue.pop(); - auto & left_sym = symbols_[bigram.left]; - auto & right_sym = symbols_[bigram.right]; + auto & left_sym = symbols[bigram.left]; + auto & right_sym = symbols[bigram.right]; // if one of the symbols already got merged, skip it. if (left_sym.n == 0 || right_sym.n == 0 || @@ -2351,7 +3087,7 @@ struct llama_tokenizer { // remove the right sym from the chain left_sym.next = right_sym.next; if (right_sym.next >= 0) { - symbols_[right_sym.next].prev = bigram.left; + symbols[right_sym.next].prev = bigram.left; } // find more substitutions @@ -2359,19 +3095,19 @@ struct llama_tokenizer { try_add_bigram(bigram.left, left_sym.next); } - for (int i = 0; i != -1; i = symbols_[i].next) { - auto & symbol = symbols_[i]; + for (int i = 0; i != -1; i = symbols[i].next) { + auto & symbol = symbols[i]; resegment(symbol, output); } } private: - void resegment(llama_sp_symbol &symbol, std::vector<llama_vocab::id> &output) { + void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output) { auto text = std::string(symbol.text, symbol.n); - auto token = vocab_.token_to_id.find(text); + auto token = vocab.token_to_id.find(text); // Do we need to support is_unused? - if (token != vocab_.token_to_id.end()) { + if (token != vocab.token_to_id.end()) { output.push_back((*token).second); return; } @@ -2381,14 +3117,14 @@ private: if (p == rev_merge.end()) { // output any symbols that did not form tokens as bytes. for (int j = 0; j < (int)symbol.n; ++j) { - llama_vocab::id token_id = llama_byte_to_token(vocab_, symbol.text[j]); + llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]); output.push_back(token_id); } return; } - resegment(symbols_[p->second.first], output); - resegment(symbols_[p->second.second], output); + resegment(symbols[p->second.first], output); + resegment(symbols[p->second.second], output); } void try_add_bigram(int left, int right) { @@ -2396,56 +3132,261 @@ private: return; } - const std::string text = std::string(symbols_[left].text, symbols_[left].n + symbols_[right].n); - auto token = vocab_.token_to_id.find(text); + const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n); + auto token = vocab.token_to_id.find(text); - if (token == vocab_.token_to_id.end()) { + if (token == vocab.token_to_id.end()) { return; } - if (static_cast<size_t>((*token).second) >= vocab_.id_to_token.size()) { + if (static_cast<size_t>((*token).second) >= vocab.id_to_token.size()) { return; } - const auto &tok_data = vocab_.id_to_token[(*token).second]; + const auto & tok_data = vocab.id_to_token[(*token).second]; - llama_sp_bigram bigram; - bigram.left = left; + llm_bigram_spm bigram; + bigram.left = left; bigram.right = right; bigram.score = tok_data.score; - bigram.size = text.size(); - work_queue_.push(bigram); + bigram.size = text.size(); + + work_queue.push(bigram); // Do we need to support is_unused? rev_merge[text] = std::make_pair(left, right); } - const llama_vocab & vocab_; - std::vector<llama_sp_symbol> symbols_; - llama_sp_bigram::queue work_queue_; - std::map<std::string, std::pair<int, int> > rev_merge; + const llama_vocab & vocab; + + std::vector<llm_symbol> symbols; + llm_bigram_spm::queue work_queue; + + std::map<std::string, std::pair<int, int>> rev_merge; +}; + +// BPE tokenizer +// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License] +// tried to simplify unicode stuff, so most likely does not work 100% correctly! + +// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused + +struct llm_bigram_bpe { + struct comparator { + bool operator()(llm_bigram_bpe & l, llm_bigram_bpe & r) { + return l.rank > r.rank || (l.rank == r.rank && l.left > r.left); + } + }; + + using queue_storage = std::vector<llm_bigram_bpe>; + using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>; + llm_symbol::index left; + llm_symbol::index right; + std::string text; + int rank; + size_t size; +}; + +struct llm_tokenizer_bpe { + llm_tokenizer_bpe(const llama_vocab & vocab, bool g2ws): vocab(vocab) { flag_g2ws = g2ws; } + + void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) { + int final_prev_index = -1; + auto word_collection = bpe_gpt2_preprocess(text); + + symbols_final.clear(); + + for (auto & word : word_collection) { + work_queue = llm_bigram_bpe::queue(); + symbols.clear(); + + int index = 0; + size_t offset = 0; + + while (offset < word.size()) { + llm_symbol sym; + size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset])); + sym.text = word.c_str() + offset; + sym.n = 1; + sym.n = char_len; + offset += sym.n; + sym.prev = index - 1; + sym.next = offset == word.size() ? -1 : index + 1; + index++; + symbols.emplace_back(sym); + } + for (size_t i = 1; i < symbols.size(); ++i) { + add_new_bigram(i - 1, i); + } + + // build token(s) + while (!work_queue.empty()) { + auto bigram = work_queue.top(); + work_queue.pop(); + + auto & left_symbol = symbols[bigram.left]; + auto & right_symbol = symbols[bigram.right]; + + if (left_symbol.n == 0 || right_symbol.n == 0) { + continue; + } + std::string left_token = std::string(left_symbol.text, left_symbol.n); + std::string right_token = std::string(right_symbol.text, right_symbol.n); + if (left_token + right_token != bigram.text) { + continue; // Skip this bigram if it's outdated + } + + // merge the right sym into the left one + left_symbol.n += right_symbol.n; + right_symbol.n = 0; + + // remove the right sym from the chain + left_symbol.next = right_symbol.next; + if (right_symbol.next >= 0) { + symbols[right_symbol.next].prev = bigram.left; + } + + add_new_bigram(left_symbol.prev, bigram.left); // left side of current symbol + add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol + } + + // add the fnished tokens to the final list keeping correct order for next and prev + for (auto & sym : symbols) { + if (sym.n > 0) { + sym.prev = final_prev_index; + sym.next = -1; + if (final_prev_index != -1) { + symbols_final[final_prev_index].next = symbols_final.size(); + } + symbols_final.emplace_back(sym); + final_prev_index = symbols_final.size() - 1; + } + } + } + + symbols = symbols_final; + + if (!symbols.empty()) { + for (int i = 0; i != -1; i = symbols[i].next) { + auto & symbol = symbols[i]; + if (symbol.n == 0) { + continue; + } + + const std::string str = std::string(symbol.text, symbol.n); + const auto token = vocab.token_to_id.find(str); + + if (token == vocab.token_to_id.end()) { + for (auto j = str.begin(); j != str.end(); ++j) { + std::string byte_str(1, *j); + auto token_multibyte = vocab.token_to_id.find(byte_str); + if (token_multibyte == vocab.token_to_id.end()) { + fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str()); + } + output.push_back((*token_multibyte).second); + } + } else { + output.push_back((*token).second); + } + } + } + } + +private: + void add_new_bigram(int left, int right) { + if (left == -1 || right == -1) { + return; + } + + std::string left_token = std::string(symbols[left].text, symbols[left].n); + std::string right_token = std::string(symbols[right].text, symbols[right].n); + + int rank_found = -1; + + rank_found = vocab.find_bpe_rank(left_token, right_token); + + if (rank_found < 0) { + return; + } + + llm_bigram_bpe bigram; + + bigram.left = left; + bigram.right = right; + bigram.text = left_token + right_token; + bigram.size = left_token.size() + right_token.size(); + bigram.rank = rank_found; + + work_queue.push(bigram); + } + + // probably not 100% correct + // TODO: this is quite slow - how to make it more efficient? + static std::vector<std::string> bpe_gpt2_preprocess(std::string text) { + std::vector<std::string> words; + + // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53 + const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"; + const std::regex re(pattern); + std::smatch m; + + while (std::regex_search(text, m, re)) { + for (auto x : m) { + words.push_back(x); + } + text = m.suffix(); + } + + return words; + } + + bool flag_g2ws = false; + + const llama_vocab & vocab; + + std::vector<llm_symbol> symbols; + std::vector<llm_symbol> symbols_final; + + llm_bigram_bpe::queue work_queue; }; static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) { - llama_tokenizer tokenizer(vocab); std::vector<llama_vocab::id> output; if (raw_text.empty()) { return output; } - if (bos) { - output.push_back(vocab.special_bos_id); - } + switch (vocab.type) { + case LLAMA_VOCAB_TYPE_SPM: + { + llm_tokenizer_spm tokenizer(vocab); - std::string text; - if (escape) { - text = llama_escape_whitespace(raw_text); - } else { - text = raw_text; - } + if (bos) { + output.push_back(vocab.special_bos_id); + } + + std::string text; + if (escape) { + text = llama_escape_whitespace(raw_text); + } else { + text = raw_text; + } + + tokenizer.tokenize(text, output); + } break; + case LLAMA_VOCAB_TYPE_BPE: + { + llm_tokenizer_bpe tokenizer(vocab, escape); + + if (bos && vocab.special_bos_id != -1) { + output.push_back(vocab.special_bos_id); + } + + tokenizer.tokenize(raw_text, output); + } break; + }; - tokenizer.tokenize(text, output); return output; } @@ -3449,13 +4390,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s nthread = std::thread::hardware_concurrency(); } - std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false)); + std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false)); const size_t align = GGUF_DEFAULT_ALIGNMENT; struct gguf_context * ctx_out = gguf_init_empty(); // copy the KV pairs from the input file - gguf_set_kv (ctx_out, model_loader->ctx_gguf); + gguf_set_kv (ctx_out, ml->ctx_gguf); gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); gguf_set_val_u32(ctx_out, "general.file_type", ftype); @@ -3463,8 +4404,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s int n_attention_wv = 0; int n_feed_forward_w2 = 0; - for (int i = 0; i < model_loader->n_tensors; ++i) { - struct ggml_tensor * meta = model_loader->get_tensor_meta(i); + for (int i = 0; i < ml->n_tensors; ++i) { + struct ggml_tensor * meta = ml->get_tensor_meta(i); const std::string name = ggml_get_name(meta); @@ -3498,8 +4439,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s std::vector<uint8_t> work; // populate the original tensors so we get an initial meta data - for (int i = 0; i < model_loader->n_tensors; ++i) { - struct ggml_tensor * meta = model_loader->get_tensor_meta(i); + for (int i = 0; i < ml->n_tensors; ++i) { + struct ggml_tensor * meta = ml->get_tensor_meta(i); gguf_add_tensor(ctx_out, meta); } @@ -3512,17 +4453,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // placeholder for the meta data ::zeros(fout, meta_size); - for (int i = 0; i < model_loader->n_tensors; ++i) { - struct ggml_tensor * tensor = model_loader->get_tensor_meta(i); + for (int i = 0; i < ml->n_tensors; ++i) { + struct ggml_tensor * tensor = ml->get_tensor_meta(i); const std::string name = ggml_get_name(tensor); read_data.resize(ggml_nbytes(tensor)); tensor->data = read_data.data(); - model_loader->load_data_for(tensor); + ml->load_data_for(tensor); LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", - ++idx, model_loader->n_tensors, + ++idx, ml->n_tensors, ggml_get_name(tensor), llama_format_tensor_shape(tensor).c_str(), ggml_type_name(tensor->type)); @@ -3548,7 +4489,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_type = quantized_type; #ifdef GGML_USE_K_QUANTS // TODO: avoid hardcoded tensor names - use the TN_* constants - if (name == TN_OUTPUT) { + const auto tn = LLM_TN(ml->get_arch()); + + if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { int nx = tensor->ne[0]; int ny = tensor->ne[1]; if (nx % QK_K == 0 && ny % QK_K == 0) { @@ -3600,10 +4543,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } if (convert_incompatible_tensor) { - if (name == TN_OUTPUT) { + if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing. LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n"); - } else if (name == TN_TOKEN_EMBD) { + } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) { new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing. LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n"); } else { @@ -3785,28 +4728,28 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const } // load base model - std::unique_ptr<llama_model_loader> model_loader; + std::unique_ptr<llama_model_loader> ml; ggml_context * base_ctx = NULL; std::vector<uint8_t> base_buf; if (path_base_model) { LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); - model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true)); + ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true)); size_t ctx_size; size_t mmapped_size; - model_loader->calc_sizes(ctx_size, mmapped_size); + ml->calc_sizes(ctx_size, mmapped_size); base_buf.resize(ctx_size); ggml_init_params base_params; base_params.mem_size = base_buf.size(); base_params.mem_buffer = base_buf.data(); - base_params.no_alloc = model_loader->use_mmap; + base_params.no_alloc = ml->use_mmap; base_ctx = ggml_init(base_params); // maybe this should in llama_model_loader - if (model_loader->use_mmap) { - model_loader->mapping.reset(new llama_mmap(&model_loader->file, /* prefetch */ 0, ggml_is_numa())); + if (ml->use_mmap) { + ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa())); } } @@ -3910,18 +4853,19 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const #endif // GGML_USE_CUBLAS ggml_tensor * base_t; - if (model_loader) { - struct gguf_context * ctx_gguf = model_loader->ctx_gguf; + if (ml) { + struct gguf_context * ctx_gguf = ml->ctx_gguf; // load from base model if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) { + // TODO: throw LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); return 1; } // TODO: not tested!! maybe not working! - base_t = model_loader->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU); - model_loader->load_data_for(base_t); + base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU); + ml->load_data_for(base_t); } else { base_t = dest_t; } @@ -4096,7 +5040,23 @@ struct llama_model * llama_load_model_from_file( ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers, + unsigned cur_percentage = 0; + if (params.progress_callback == NULL) { + params.progress_callback_user_data = &cur_percentage; + params.progress_callback = [](float progress, void * ctx) { + unsigned * cur_percentage_p = (unsigned *) ctx; + unsigned percentage = (unsigned) (100 * progress); + while (percentage > *cur_percentage_p) { + *cur_percentage_p = percentage; + LLAMA_LOG_INFO("."); + if (percentage >= 100) { + LLAMA_LOG_INFO("\n"); + } + } + }; + } + + if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale, params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { @@ -4126,22 +5086,6 @@ struct llama_context * llama_new_context_with_model( params.seed = time(NULL); } - unsigned cur_percentage = 0; - if (params.progress_callback == NULL) { - params.progress_callback_user_data = &cur_percentage; - params.progress_callback = [](float progress, void * ctx) { - unsigned * cur_percentage_p = (unsigned *) ctx; - unsigned percentage = (unsigned) (100 * progress); - while (percentage > *cur_percentage_p) { - *cur_percentage_p = percentage; - LLAMA_LOG_INFO("."); - if (percentage >= 100) { - LLAMA_LOG_INFO("\n"); - } - } - }; - } - ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; @@ -4279,13 +5223,14 @@ struct llama_context * llama_new_context_with_model( struct llama_context * llama_init_from_file( const char * path_model, struct llama_context_params params) { - struct llama_model * model = llama_load_model_from_file(path_model, params); if (!model) { return nullptr; } + struct llama_context * ctx = llama_new_context_with_model(model, params); ctx->model_owner = true; + return ctx; } @@ -4305,6 +5250,10 @@ int llama_n_embd(const struct llama_context * ctx) { return ctx->model.hparams.n_embd; } +enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) { + return ctx->model.vocab.type; +} + int llama_model_n_vocab(const struct llama_model * model) { return model->vocab.id_to_token.size(); } @@ -4318,7 +5267,10 @@ int llama_model_n_embd(const struct llama_model * model) { } int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) { - return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype).c_str()); + return snprintf(buf, buf_size, "%s %s %s", + model->name.c_str(), + llama_model_type_name(model->type), + llama_model_ftype_name(model->ftype).c_str()); } int llama_model_quantize( @@ -4839,26 +5791,6 @@ int llama_tokenize( return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos); } -int llama_tokenize_bpe( - struct llama_context * ctx, - const char * text, - llama_token * tokens, - int n_max_tokens, - bool add_bos) { - auto res = llama_tokenize_internal(ctx->model.vocab, text, add_bos, false); - - if (n_max_tokens < (int) res.size()) { - LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); - return -((int) res.size()); - } - - for (size_t i = 0; i < res.size(); i++) { - tokens[i] = res[i]; - } - - return res.size(); -} - int llama_tokenize_with_model( const struct llama_model * model, const char * text, @@ -4884,18 +5816,6 @@ int llama_token_to_str(const struct llama_context * ctx, llama_token token, char return llama_token_to_str_with_model(&ctx->model, token, buf, length); } -int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token, char * buf, int length) { - if (0 <= token && token < llama_model_n_vocab(&ctx->model)) { - std::string result = ctx->model.vocab.id_to_token[token].text; - if (length < (int) result.length()) { - return -result.length(); - } - memcpy(buf, result.c_str(), result.length()); - return result.length(); - } - return 0; -} - // does not write null-terminator to str int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) { if (0 <= token && token < llama_model_n_vocab(model)) { |