| author | Kawrakow <48489457+ikawrakow@users.noreply.github.com> | 2024-08-12 15:14:32 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-08-12 15:14:32 +0200 |
| commit | 8f43e551038af2547b5c01d0e9edd641c0e4bd29 | (patch) |
| tree | 07a4373620a9381d0b5c7189a475990a6feb48a5 | /src/llama.cpp |
| parent | f5d1af61d79fb53ccfbac2e665e43208c07b083d | (diff) |
Merge mainline - Aug 12 2024 (#17)
* Merge mainline
* Fix after merge
* Remove CI check
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'src/llama.cpp')
| -rw-r--r-- | src/llama.cpp | 2213 |

1 file changed, 1177 insertions, 1036 deletions
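A recurring change throughout the diff below is the removal of the fixed `LLAMA_MAX_NODES` macro in favour of a per-model helper, `llama_model_max_nodes()`, which every `ggml_new_graph_custom()` call site now queries. The following standalone sketch illustrates that pattern as it appears in the hunks; the stubbed `llama_model` type is an assumption for this sketch, standing in for the real struct defined in src/llama.cpp.

```cpp
#include <cstddef>

// Stub standing in for the real llama_model struct (assumption for this
// sketch; the actual definition lives in src/llama.cpp).
struct llama_model { /* arch, hparams, ... */ };

// Pattern introduced in this diff: the graph-node budget is computed per
// model instead of coming from the old LLAMA_MAX_NODES macro, leaving room
// to raise the limit for very deep models later (see the TODO in the hunk).
static size_t llama_model_max_nodes(const llama_model & /*model*/) {
    return 8192;
}

// Call sites in the graph builders change from
//     ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false)
// to
//     ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false)
```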
diff --git a/src/llama.cpp b/src/llama.cpp index 71be05f9..dce58dfe 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -102,7 +102,6 @@ #endif // bump if necessary -#define LLAMA_MAX_NODES 8192 #define LLAMA_MAX_LAYERS 512 #define LLAMA_MAX_EXPERTS 160 // DeepSeekV2 @@ -123,20 +122,6 @@ static std::string trim(const std::string & str) { return str.substr(start, end - start); } -static void replace_all(std::string & s, const std::string & search, const std::string & replace) { - std::string result; - for (size_t pos = 0; ; pos += search.length()) { - auto new_pos = s.find(search, pos); - if (new_pos == std::string::npos) { - result += s.substr(pos, s.size() - pos); - break; - } - result += s.substr(pos, new_pos - pos) + replace; - pos = new_pos; - } - s = std::move(result); -} - static bool is_float_close(float a, float b, float abs_tol) { // Check for non-negative tolerance if (abs_tol < 0.0) { @@ -224,6 +209,7 @@ enum llm_arch { LLM_ARCH_CHATGLM, LLM_ARCH_BITNET, LLM_ARCH_T5, + LLM_ARCH_T5ENCODER, LLM_ARCH_JAIS, LLM_ARCH_UNKNOWN, }; @@ -268,6 +254,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { { LLM_ARCH_CHATGLM, "chatglm" }, { LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_T5, "t5" }, + { LLM_ARCH_T5ENCODER, "t5encoder" }, { LLM_ARCH_JAIS, "jais" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -364,6 +351,7 @@ enum llm_kv { LLM_KV_TOKENIZER_SUFFIX_ID, LLM_KV_TOKENIZER_MIDDLE_ID, LLM_KV_TOKENIZER_EOT_ID, + LLM_KV_TOKENIZER_EOM_ID, LLM_KV_ADAPTER_TYPE, LLM_KV_ADAPTER_LORA_ALPHA, @@ -461,6 +449,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = { { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, + { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" }, { LLM_KV_ADAPTER_TYPE, "adapter.type" }, { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, @@ -1208,6 +1197,21 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA }, }, { + LLM_ARCH_CHATGLM, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + }, + }, + { LLM_ARCH_BITNET, { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, @@ -1261,6 +1265,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA }, }, { + LLM_ARCH_T5ENCODER, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" }, + { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" }, + { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" }, + { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" }, + { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" }, + { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" }, + { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" }, + { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" }, + { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" }, + { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" }, + { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" }, + }, + }, + { LLM_ARCH_JAIS, { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, @@ -2245,8 +2267,7 @@ struct llama_hparams { return n_head_arr[il]; } - 
GGML_ASSERT(false); - return 0; + GGML_ABORT("fatal error"); } uint32_t n_head_kv(uint32_t il = 0) const { @@ -2254,8 +2275,7 @@ struct llama_hparams { return n_head_kv_arr[il]; } - GGML_ASSERT(false); - return 0; + GGML_ABORT("fatal error"); } uint32_t n_ff(uint32_t il = 0) const { @@ -2263,8 +2283,7 @@ struct llama_hparams { return n_ff_arr[il]; } - GGML_ASSERT(false); - return 0; + GGML_ABORT("fatal error"); } uint32_t n_gqa(uint32_t il = 0) const { @@ -2441,6 +2460,7 @@ struct llama_layer { // long rope factors struct ggml_tensor * rope_long = nullptr; struct ggml_tensor * rope_short = nullptr; + struct ggml_tensor * rope_freqs = nullptr; // bitnet scale struct ggml_tensor * wq_scale; @@ -2922,7 +2942,7 @@ static bool llama_kv_cache_init( // TODO: find a nicer way to add other recurrent model architectures cache.recurrent = model.arch == LLM_ARCH_MAMBA; - cache.v_trans = !cparams.flash_attn; + cache.v_trans = !cache.recurrent && !cparams.flash_attn; cache.head = 0; cache.size = kv_size; @@ -3556,6 +3576,15 @@ namespace GGUFMeta { using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>; +// TODO: update when needed or think of some clever automatic way to do this +static size_t llama_model_max_nodes(const llama_model & /*model*/) { + //if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B + // return 32768; + //} + + return 8192; +} + struct llama_model_loader { int n_kv = 0; int n_tensors = 0; @@ -4467,14 +4496,14 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ4_K: return "IQ4_K - 4.5 bpw"; case LLAMA_FTYPE_MOSTLY_IQ5_K: return "IQ5_K - 5.5 bpw"; case LLAMA_FTYPE_MOSTLY_IQ6_K: return "IQ6_K - 6.6 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ1_BN: return "IQ1_BN - 1.625 bpw Bitnet"; + case LLAMA_FTYPE_MOSTLY_IQ2_BN: return "IQ2_BN - 2.00 bpw Bitnet"; + case LLAMA_FTYPE_MOSTLY_IQ2_TN: return "IQT_BN - 2.06 bpw TriLM"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4"; case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8"; case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8"; - case LLAMA_FTYPE_MOSTLY_IQ1_BN: return "IQ1_BN - 1.625 bpw Bitnet"; - case LLAMA_FTYPE_MOSTLY_IQ2_BN: return "IQ2_BN - 2.00 bpw Bitnet"; - case LLAMA_FTYPE_MOSTLY_IQ2_TN: return "IQT_BN - 2.06 bpw TriLM"; default: return "unknown, may not work"; } @@ -4898,6 +4927,22 @@ static void llm_load_hparams( case 40: model.type = e_model::MODEL_14B; break; default: model.type = e_model::MODEL_UNKNOWN; } + + // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931 + if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) { + // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct + hparams.n_swa = 2047; + } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) { + // default value for Phi-3-mini-128k-instruct + hparams.n_swa = 262144; + } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) { + // default value for Phi-3-medium-128k-instruct + hparams.n_swa = 131072; + } + bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); + if (!found_swa && hparams.n_swa == 0) { + throw std::runtime_error("invalid value for sliding_window"); + } } break; case LLM_ARCH_PLAMO: { @@ -4965,6 +5010,7 @@ static void llm_load_hparams( hparams.attn_soft_cap = true; switch (hparams.n_layer) { 
+ case 26: model.type = e_model::MODEL_2B; break; case 42: model.type = e_model::MODEL_9B; break; case 46: model.type = e_model::MODEL_27B; break; default: model.type = e_model::MODEL_UNKNOWN; @@ -5194,6 +5240,12 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_T5ENCODER: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts); + model.type = e_model::MODEL_UNKNOWN; + } break; case LLM_ARCH_JAIS: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -5588,6 +5640,7 @@ static void llm_load_vocab( { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id }, { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id }, { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id }, + { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id }, }; for (const auto & it : special_token_types) { @@ -5640,6 +5693,17 @@ static void llm_load_vocab( } } } + + // find EOM token: "<|eom_id|>" + // + // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID + // for now, we apply this workaround to find the EOM token based on its text + if (vocab.special_eom_id == -1) { + const auto & t = vocab.token_to_id.find("<|eom_id|>"); + if (t != vocab.token_to_id.end()) { + vocab.special_eom_id = t->second; + } + } } // build special tokens cache @@ -6062,6 +6126,8 @@ static bool llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + if (n_expert == 0) { layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); @@ -7423,6 +7489,42 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; + case LLM_ARCH_T5ENCODER: + { + const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts; + + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm_enc = ml.create_tensor(ctx_output, tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + } + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_rel_b_enc = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED); + + layer.wq_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}); + layer.wk_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}); + layer.wv_enc = 
ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}); + layer.wo_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}); + + layer.ffn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_gate_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_down_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}); + } + } break; case LLM_ARCH_JAIS: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -8104,7 +8206,7 @@ static struct ggml_tensor * llm_build_moe_ffn( cb(gate, "ffn_moe_gelu", il); } break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens] @@ -8431,7 +8533,7 @@ struct llm_build_context { } struct ggml_cgraph * build_k_shift() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); GGML_ASSERT(kv_self.size == n_ctx); @@ -8462,7 +8564,7 @@ struct llm_build_context { } struct ggml_cgraph * build_s_copy() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); GGML_ASSERT(kv_self.recurrent); @@ -8485,7 +8587,7 @@ struct llm_build_context { } struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); for (uint32_t i = 0; i < ids.size(); ++i) { const uint32_t id = ids[i]; @@ -8563,6 +8665,10 @@ struct llm_build_context { // choose long/short freq factors based on the context size const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; + if (model.layers[il].rope_freqs != nullptr) { + return model.layers[il].rope_freqs; + } + if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { return model.layers[il].rope_long; } @@ -8667,8 +8773,8 @@ struct llm_build_context { } break; default: { - GGML_ASSERT(false && "unknown pooling type"); - } break; + GGML_ABORT("unknown pooling type"); + } } cb(cur, "result_embd_pooled", -1); @@ -8726,7 +8832,7 @@ struct llm_build_context { } struct ggml_cgraph * build_llama() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -8757,6 +8863,9 @@ struct llm_build_context { // self-attention { + // rope freq factors for llama3; may return nullptr for llama2 and other models + struct ggml_tensor * rope_factors = build_rope_factors(il); + // compute Q and K and RoPE them struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); @@ -8780,14 +8889,14 @@ struct llm_build_context { } Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, 
n_tokens), inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -8869,7 +8978,7 @@ struct llm_build_context { } struct ggml_cgraph * build_baichuan() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -8923,7 +9032,7 @@ struct llm_build_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -8984,7 +9093,7 @@ struct llm_build_context { } struct ggml_cgraph * build_xverse() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -9087,7 +9196,7 @@ struct llm_build_context { } struct ggml_cgraph * build_falcon() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -9207,7 +9316,7 @@ struct llm_build_context { } struct ggml_cgraph * build_grok() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -9364,7 +9473,7 @@ struct llm_build_context { } struct ggml_cgraph * build_dbrx() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -9490,7 +9599,7 @@ struct llm_build_context { } struct ggml_cgraph * build_starcoder() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -9594,7 +9703,7 @@ struct llm_build_context { } struct ggml_cgraph * build_refact() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -9688,7 +9797,7 @@ struct llm_build_context { } struct ggml_cgraph * build_bert() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, 
llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -9882,7 +9991,7 @@ struct llm_build_context { } struct ggml_cgraph * build_bloom() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -9983,7 +10092,7 @@ struct llm_build_context { } struct ggml_cgraph * build_mpt() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -10273,7 +10382,7 @@ struct llm_build_context { } struct ggml_cgraph * build_qwen() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -10385,7 +10494,7 @@ struct llm_build_context { } struct ggml_cgraph * build_qwen2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -10497,7 +10606,7 @@ struct llm_build_context { } struct ggml_cgraph * build_qwen2moe() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -10643,7 +10752,7 @@ struct llm_build_context { } struct ggml_cgraph * build_phi2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -10764,7 +10873,7 @@ struct llm_build_context { } struct ggml_cgraph * build_phi3() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -10996,7 +11105,7 @@ struct llm_build_context { } struct ggml_cgraph * build_gpt2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -11101,7 +11210,7 @@ struct llm_build_context { } struct ggml_cgraph * build_codeshell() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -11212,7 +11321,7 @@ struct llm_build_context { } struct ggml_cgraph * build_orion() { 
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11330,7 +11439,7 @@ struct llm_build_context { } struct ggml_cgraph * build_internlm2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11451,7 +11560,7 @@ struct llm_build_context { // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738 // based on the original build_llama() function struct ggml_cgraph * build_minicpm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11595,7 +11704,7 @@ struct llm_build_context { } struct ggml_cgraph * build_gemma() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head_k = hparams.n_embd_head_k; @@ -11703,7 +11812,7 @@ struct llm_build_context { } struct ggml_cgraph * build_gemma2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head_k = hparams.n_embd_head_k; @@ -11753,9 +11862,10 @@ struct llm_build_context { // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e switch (model.type) { + case e_model::MODEL_2B: case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); }; cb(Qcur, "Qcur_scaled", il); @@ -11838,7 +11948,7 @@ struct llm_build_context { struct ggml_cgraph * build_starcoder2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11957,7 +12067,7 @@ struct llm_build_context { } struct ggml_cgraph * build_mamba() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t d_model = n_embd; const int64_t d_conv = hparams.ssm_d_conv; @@ -12106,7 +12216,7 @@ struct llm_build_context { struct ggml_cgraph * build_command_r() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -12260,7 +12370,7 @@ struct llm_build_context { // * removed bias // * removed MoE struct ggml_cgraph * build_olmo() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, 
false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -12384,7 +12494,7 @@ struct llm_build_context { } struct ggml_cgraph * build_openelm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -12509,7 +12619,7 @@ struct llm_build_context { } struct ggml_cgraph * build_gptneox() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -12651,7 +12761,7 @@ struct llm_build_context { } struct ggml_cgraph * build_arctic() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -12783,7 +12893,7 @@ struct llm_build_context { } struct ggml_cgraph * build_deepseek2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -13011,7 +13121,7 @@ struct llm_build_context { } struct ggml_cgraph * build_bitnet() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -13233,8 +13343,8 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_t5() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * build_t5_encoder() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -13248,310 +13358,330 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); - if (lctx.is_encoding) { - struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false); - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); + GGML_ASSERT(lctx.is_encoding); + struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm_enc, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; - // self-attention - { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, 
model.layers[il].wq_enc, cur); - cb(Qcur, "Qcur", il); + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm_enc, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk_enc, cur); - cb(Kcur, "Kcur", il); + // self-attention + { + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur); + cb(Qcur, "Qcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv_enc, cur); - cb(Vcur, "Vcur", il); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur); + cb(Kcur, "Kcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur); + cb(Vcur, "Vcur", il); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; - struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b); - struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - cb(kq_b, "kq_b", il); + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + cb(kq, "kq", il); - kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? 
model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; + struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b); + struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); + cb(kq_b, "kq_b", il); - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); - cb(v, "v", il); + kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); + cb(kq, "kq_soft_max_ext", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); + struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); + cb(v, "v", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); + cb(kqv, "kqv", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); - ggml_build_forward_expand(gf, cur); + cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + cb(cur, "kqv_merged_cont", il); - cur = ggml_mul_mat(ctx0, model.layers[il].wo_enc, cur); - cb(cur, "kqv_out", il); - } + ggml_build_forward_expand(gf, cur); - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur); + cb(cur, "kqv_out", il); + } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm_enc, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up_enc, NULL, NULL, - model.layers[il].ffn_gate_enc, NULL, NULL, - model.layers[il].ffn_down_enc, NULL, NULL, - NULL, - model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, - cb, il); - cb(cur, "ffn_out", il); - } + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm_enc, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); - cur = ggml_add(ctx0, cur, ffn_inp); + // T5 uses relu, flan-T5 uses gelu-gated + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up_enc, NULL, NULL, + model.layers[il].ffn_gate_enc, NULL, NULL, + model.layers[il].ffn_down_enc, NULL, NULL, + NULL, + model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, + cb, il); cb(cur, "ffn_out", il); + } - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } - cb(cur, "l_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); - // input for next layer - inpL = cur; + ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = ggml_add(ctx0, cur, layer_dir); } + cb(cur, "l_out", il); - cur = inpL; - cb(cur, "result_embd", -1); + // input for next layer + inpL = cur; + } - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm_enc, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - } else { - GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); + cur = inpL; + cb(cur, "result_embd", -1); - struct ggml_tensor * embd_enc = llm_build_inp_embd_enc(); - struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true); + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm_enc, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); - struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); - struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross(); + ggml_build_forward_expand(gf, cur); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + return gf; + } - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + struct ggml_cgraph * build_t5_decoder() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); - // self-attention - { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); + struct ggml_tensor * cur; + struct ggml_tensor * inpL; - llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); - struct ggml_tensor * k = - ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, n_kv, n_head_kv, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - 0); - cb(k, "k", il); + GGML_ASSERT(!lctx.is_encoding); + GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); - struct ggml_tensor * v = - ggml_view_3d(ctx0, kv_self.v_l[il], - n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv_self.v_l[il])*n_ctx, - ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, - 0); - cb(v, "v", il); + struct ggml_tensor * embd_enc = llm_build_inp_embd_enc(); + struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); + struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross(); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + 
// norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); - struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; - struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b); - struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - cb(kq_b, "kq_b", il); + // self-attention + { + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); - kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - cb(kqv, "kqv", il); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + struct ggml_tensor * k = + ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_kv, n_head_kv, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + 0); + cb(k, "k", il); - ggml_build_forward_expand(gf, cur); + struct ggml_tensor * v = + ggml_view_3d(ctx0, kv_self.v_l[il], + n_kv, n_embd_head_v, n_head_kv, + ggml_element_size(kv_self.v_l[il])*n_ctx, + ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, + 0); + cb(v, "v", il); - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "kqv_out", il); - } + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = ggml_add(ctx0, cur, inpSA); - cb(cur, "cross_inp", il); + struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * inpCA = cur; + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + cb(kq, "kq", il); - // norm - cur = llm_build_norm(ctx0, cur, hparams, - model.layers[il].attn_norm_cross, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm_cross", il); + struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? 
model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; + struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b); + struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); + cb(kq_b, "kq_b", il); - // cross-attention - { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq_cross, cur); - cb(Qcur, "Qcur", il); + kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); + cb(kq, "kq_soft_max_ext", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk_cross, embd_enc); - cb(Kcur, "Kcur", il); + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + cb(kqv, "kqv", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv_cross, embd_enc); - cb(Vcur, "Vcur", il); + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); + cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + cb(cur, "kqv_merged_cont", il); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + ggml_build_forward_expand(gf, cur); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cb(cur, "kqv_out", il); + } - kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + cur = ggml_add(ctx0, cur, inpSA); + cb(cur, "cross_inp", il); - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); - cb(v, "v", il); + struct ggml_tensor * inpCA = cur; - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); + // norm + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].attn_norm_cross, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm_cross", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + // cross-attention + { + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur); + cb(Qcur, "Qcur", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc); + cb(Kcur, "Kcur", il); - ggml_build_forward_expand(gf, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc); + cb(Vcur, "Vcur", il); - cur = ggml_mul_mat(ctx0, model.layers[il].wo_cross, cur); - cb(cur, "kqv_out", il); - } + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); - } + struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); - cb(ffn_inp, "ffn_inp", il); + struct 
ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + cb(kq, "kq", il); - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); + cb(kq, "kq_soft_max_ext", il); - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, - cb, il); - cb(cur, "ffn_out", il); - } + struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); + cb(v, "v", il); - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); + cb(kqv, "kqv", il); - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } - cb(cur, "l_out", il); + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); - // input for next layer - inpL = cur; + cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + cb(cur, "kqv_merged_cont", il); + + ggml_build_forward_expand(gf, cur); + + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur); + cb(cur, "kqv_out", il); } - cur = inpL; - cb(cur, "result_embd", -1); + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); + } - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + // T5 uses relu, flan-T5 uses gelu-gated + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, + cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = ggml_add(ctx0, cur, layer_dir); + } + cb(cur, "l_out", il); - // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output", -1); + // input for next layer + inpL = cur; } + cur = inpL; + cb(cur, "result_embd", -1); + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_output", -1); + ggml_build_forward_expand(gf, cur); return gf; } struct ggml_cgraph * build_jais() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -13643,7 +13773,7 @@ struct llm_build_context { } struct ggml_cgraph * build_chatglm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -13996,14 +14126,22 @@ static struct ggml_cgraph * llama_build_graph( } break; case LLM_ARCH_T5: { - result = llm.build_t5(); + if (lctx.is_encoding) { + result = llm.build_t5_encoder(); + } else { + result = llm.build_t5_decoder(); + } + } break; + case LLM_ARCH_T5ENCODER: + { + result = llm.build_t5_encoder(); } break; case LLM_ARCH_JAIS: { result = llm.build_jais(); } break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } // add on pooling layer @@ -14715,12 +14853,15 @@ static int llama_decode_internal( res = nullptr; embd = nullptr; } else if (cparams.embeddings) { - res = nullptr; // do not extract logits for embedding case - embd = gf->nodes[gf->n_nodes - 1]; - if (strcmp(embd->name, "result_embd_pooled") != 0) { - embd = gf->nodes[gf->n_nodes - 2]; + res = nullptr; // do not extract logits for embedding case + embd = nullptr; + for (int i = gf->n_nodes - 1; i >= 0; --i) { + if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) { + embd = gf->nodes[i]; + break; + } } - GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); + GGML_ASSERT(embd != nullptr && "missing embeddings tensor"); } else { embd = nullptr; // do not extract embeddings when not needed GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); @@ -14802,8 +14943,8 @@ static int llama_decode_internal( } break; case LLAMA_POOLING_TYPE_UNSPECIFIED: { - GGML_ASSERT(false && "unknown pooling type"); - } break; + GGML_ABORT("unknown pooling type"); + } } } n_outputs_prev += lctx.n_outputs; @@ -14927,9 +15068,24 @@ static int llama_encode_internal( ggml_cgraph * gf = llama_build_graph(lctx, batch, false); // the output embeddings after the final encoder normalization - struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * embd = nullptr; - GGML_ASSERT(strcmp(embd->name, "result_norm") == 0); + // there are two cases here + if (llama_model_has_decoder(&lctx.model)) { + // first case is an encoder-decoder T5 model where embeddings are passed to decoder + embd = gf->nodes[gf->n_nodes - 1]; + 
GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor"); + } else { + // second case is an encoder-only T5 model + if (cparams.embeddings) { + // only output embeddings if required + embd = gf->nodes[gf->n_nodes - 1]; + if (strcmp(embd->name, "result_embd_pooled") != 0) { + embd = gf->nodes[gf->n_nodes - 2]; + } + GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); + } + } ggml_backend_sched_alloc_graph(lctx.sched, gf); @@ -14942,20 +15098,54 @@ static int llama_encode_internal( ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd); GGML_ASSERT(backend_embd != nullptr); - // extract token embeddings - GGML_ASSERT(lctx.embd != nullptr); + if (llama_model_has_decoder(&lctx.model)) { + lctx.embd_enc.resize(n_tokens*n_embd); + float * embd_out = lctx.embd_enc.data(); - lctx.embd_enc.resize(n_tokens*n_embd); - float * embd_out = lctx.embd_enc.data(); + ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); - ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + // remember the sequence ids used during the encoding - needed for cross attention later + lctx.seq_ids_enc.resize(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + for (int s = 0; s < batch.n_seq_id[i]; s++) { + llama_seq_id seq_id = batch.seq_id[i][s]; + lctx.seq_ids_enc[i].insert(seq_id); + } + } + } else { + GGML_ASSERT(lctx.embd != nullptr); - // remember the sequence ids used during the encoding - needed for cross attention later - lctx.seq_ids_enc.resize(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - for (int s = 0; s < batch.n_seq_id[i]; s++) { - llama_seq_id seq_id = batch.seq_id[i][s]; - lctx.seq_ids_enc[i].insert(seq_id); + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + GGML_ASSERT(lctx.embd != nullptr); + float * embd_out = lctx.embd; + + GGML_ASSERT(n_tokens*n_embd <= (int64_t) lctx.embd_size); + ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings + auto & embd_seq_out = lctx.embd_seq; + embd_seq_out.clear(); + + for (uint32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = batch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } } } } @@ -14988,9 +15178,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { // each move requires 6*n_layer tensors (see build_defrag) // - source view, destination view, copy operation // - x2 for keys and values - //const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer); + //const uint32_t max_moves = llama_model_max_nodes(model)/(6*n_layer); // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 - const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer); + const uint32_t max_moves = (llama_model_max_nodes(lctx.model) - 2*n_layer)/(6*n_layer); // determine which KV cells to move where // @@ -15194,7 +15384,7 @@ static void llama_kv_cache_update_internal(struct llama_context & 
lctx) { // apply K-shift if needed if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA - GGML_ASSERT(false && "Deepseek2 does not support K-shift"); + GGML_ABORT("Deepseek2 does not support K-shift"); } { @@ -15333,7 +15523,7 @@ static void llama_tensor_dequantize_internal( } else if (ggml_is_quantized(tensor->type)) { qtype.to_float(tensor->data, f32_output, nelements); } else { - GGML_ASSERT(false); // unreachable + GGML_ABORT("fatal error"); // unreachable } return; } @@ -15391,7 +15581,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) { if (n_expert > 1) { - // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly + // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work // for getting the current layer as I initially thought, and we need to resort to parsing the // tensor name. @@ -16706,6 +16896,8 @@ struct llama_context * llama_new_context_with_model( ctx->sampling.rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; + // build worst-case graph for encoder if a model contains encoder + ctx->is_encoding = llama_model_has_encoder(model); uint32_t kv_size = cparams.n_ctx; ggml_type type_k = params.type_k; @@ -16921,8 +17113,10 @@ struct llama_context * llama_new_context_with_model( } } + const size_t max_nodes = llama_model_max_nodes(*model); + // buffer used to store the computation graph and the tensor meta data - ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false)); + ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary bool pipeline_parallel = @@ -16935,7 +17129,7 @@ struct llama_context * llama_new_context_with_model( // currently this is only implemented in the CUDA backend pipeline_parallel = false; #endif - ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel); + ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel); if (pipeline_parallel) { LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched)); @@ -17018,6 +17212,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_MAMBA: case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_T5: + case LLM_ARCH_T5ENCODER: case LLM_ARCH_JAIS: return LLAMA_ROPE_TYPE_NONE; @@ -17060,8 +17255,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { // all model arches should be listed explicitly here case LLM_ARCH_UNKNOWN: - GGML_ASSERT(false && "unknown architecture"); - break; + GGML_ABORT("unknown architecture"); } return LLAMA_ROPE_TYPE_NONE; @@ -17166,8 +17360,16 @@ struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const ch bool llama_model_has_encoder(const struct llama_model * model) { switch (model->arch) { - case LLM_ARCH_T5: return true; 
- default: return false; + case LLM_ARCH_T5: return true; + case LLM_ARCH_T5ENCODER: return true; + default: return false; + } +} + +bool llama_model_has_decoder(const struct llama_model * model) { + switch (model->arch) { + case LLM_ARCH_T5ENCODER: return false; + default: return true; } } @@ -17443,18 +17645,18 @@ void llama_kv_cache_update(struct llama_context * ctx) { } // deprecated -size_t llama_get_state_size(const struct llama_context * ctx) { +size_t llama_get_state_size(struct llama_context * ctx) { return llama_state_get_size(ctx); } // deprecated size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { - return llama_state_get_data(ctx, dst); + return llama_state_get_data(ctx, dst, -1); } // deprecated size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { - return llama_state_set_data(ctx, src); + return llama_state_set_data(ctx, src, -1); } // deprecated @@ -17467,302 +17669,282 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi return llama_state_save_file(ctx, path_session, tokens, n_token_count); } -// Returns the *maximum* size of the state -size_t llama_state_get_size(const struct llama_context * ctx) { - const auto & cparams = ctx->cparams; - const auto & hparams = ctx->model.hparams; - - // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. - // for reference, std::mt19937(1337) serializes to 6701 bytes. - const size_t s_rng_size = sizeof(size_t); - const size_t s_rng = LLAMA_MAX_RNG_STATE; - const size_t s_n_outputs = sizeof(size_t); - // assume worst case for outputs although only currently set ones are serialized - const size_t s_output_pos = ctx->cparams.n_batch * sizeof(int32_t); - const size_t s_logits_size = sizeof(size_t); - const size_t s_logits = ctx->logits_size ? cparams.n_batch * hparams.n_vocab * sizeof(float) : 0; - const size_t s_embedding_size = sizeof(size_t); - const size_t s_embedding = ctx->embd_size ? cparams.n_batch * hparams.n_embd * sizeof(float) : 0; - const size_t s_kv_buf_size = sizeof(size_t); - const size_t s_kv_head = sizeof(uint32_t); - const size_t s_kv_size = sizeof(uint32_t); - const size_t s_kv_used = sizeof(uint32_t); - const size_t s_v_trans = sizeof(uint32_t); - const size_t s_kv = ctx->kv_self.total_size(); - const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id); - const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell; - - const size_t s_total = ( - + s_rng_size - + s_rng - + s_n_outputs - + s_output_pos - + s_logits_size - + s_logits - + s_embedding_size - + s_embedding - + s_kv_buf_size - + s_kv_head - + s_kv_size - + s_kv_used - + s_v_trans - + s_kv - + s_kv_cells - ); - - // on session change it is very likely that the state size has changed - so we need to update this function - static_assert(LLAMA_SESSION_VERSION == 7, "So you just bumped the session version - good. 
But did you remember to update llama_state_get_size?"); - - return s_total; -} - -// llama_context_data -struct llama_data_context { +// TODO: replace all non-fatal assertions with returned errors or exceptions +struct llama_data_write { virtual void write(const void * src, size_t size) = 0; + virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0; virtual size_t get_size_written() = 0; - virtual ~llama_data_context() = default; -}; - -struct llama_data_buffer_context : llama_data_context { - uint8_t * ptr; - size_t size_written = 0; + virtual ~llama_data_write() = default; - llama_data_buffer_context(uint8_t * p) : ptr(p) {} + void write_string(const std::string & str) { + uint32_t str_size = str.size(); - void write(const void * src, size_t size) override { - memcpy(ptr, src, size); - ptr += size; - size_written += size; + write(&str_size, sizeof(str_size)); + write(str.data(), str_size); } - size_t get_size_written() override { - return size_written; + void write_model_info(const struct llama_context * ctx) { + std::string arch_str = LLM_ARCH_NAMES.at(ctx->model.arch); + write_string(arch_str); + // TODO: add more model-specific info which should prevent loading the session file if not identical } -}; -struct llama_data_file_context : llama_data_context { - llama_file * file; - size_t size_written = 0; + void write_rng(const std::mt19937 & rng) { + std::ostringstream rng_ss; + rng_ss << rng; - llama_data_file_context(llama_file * f) : file(f) {} + const std::string & rng_str = rng_ss.str(); - void write(const void * src, size_t size) override { - file->write_raw(src, size); - size_written += size; + write_string(rng_str); } - size_t get_size_written() override { - return size_written; - } -}; + void write_output_ids(const struct llama_context * ctx) { + const uint32_t n_outputs = ctx->n_outputs; -/** copy state data into either a buffer or file depending on the passed in context - * - * file context: - * llama_file file("/path", "wb"); - * llama_data_file_context data_ctx(&file); - * llama_state_get_data(ctx, &data_ctx); - * - * buffer context: - * std::vector<uint8_t> buf(max_size, 0); - * llama_data_buffer_context data_ctx(&buf.data()); - * llama_state_get_data(ctx, &data_ctx); - * -*/ -static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { - llama_synchronize(ctx); + std::vector<int32_t> output_pos; - // copy rng - { - std::ostringstream rng_ss; - rng_ss << ctx->sampling.rng; + const size_t n_batch = ctx->cparams.n_batch; + const auto & output_ids = ctx->output_ids; - const std::string & rng_str = rng_ss.str(); - const size_t rng_size = rng_str.size(); + GGML_ASSERT(n_outputs <= ctx->output_size); - GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE); + output_pos.resize(n_outputs); - data_ctx->write(&rng_size, sizeof(rng_size)); - data_ctx->write(rng_str.data(), rng_size); + // build a more compact representation of the output ids + for (size_t i = 0; i < n_batch; ++i) { + // map an output id to a position in the batch + int32_t pos = output_ids[i]; + if (pos >= 0) { + GGML_ASSERT((uint32_t) pos < n_outputs); + output_pos[pos] = i; + } + } + + write(&n_outputs, sizeof(n_outputs)); + + if (n_outputs) { + write(output_pos.data(), n_outputs * sizeof(int32_t)); + } } - // copy outputs - { - // Can't use ctx->n_outputs because it's not for the - // entire last batch when n_ubatch is smaller than n_batch - size_t n_outputs = 0; + void write_logits(const struct llama_context * ctx) { + const uint64_t 
logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab); - // copy output ids - { - std::vector<int32_t> output_pos; + write(&logits_size, sizeof(logits_size)); + + if (logits_size) { + write(ctx->logits, logits_size * sizeof(float)); + } + } + + void write_embeddings(const struct llama_context * ctx) { + const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_embd); + + write(&embeddings_size, sizeof(embeddings_size)); + + if (embeddings_size) { + write(ctx->embd, embeddings_size * sizeof(float)); + } + } + + void write_kv_cache_meta(const llama_kv_cache & kv_self, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) { - const size_t n_batch = ctx->cparams.n_batch; - const auto & output_ids = ctx->output_ids; + for (const auto & range : cell_ranges) { + for (uint32_t i = range.first; i < range.second; ++i) { + const auto & cell = kv_self.cells[i]; + const llama_pos pos = cell.pos; + const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0; - output_pos.resize(ctx->output_size); + write(&pos, sizeof(pos)); + write(&n_seq_id, sizeof(n_seq_id)); - // build a more compact representation of the output ids - for (size_t i = 0; i < n_batch; ++i) { - // map an output id to a position in the batch - int32_t pos = output_ids[i]; - if (pos >= 0) { - if ((size_t) pos >= n_outputs) { - n_outputs = pos + 1; + if (n_seq_id) { + for (auto seq_id : cell.seq_id) { + write(&seq_id, sizeof(seq_id)); } - GGML_ASSERT((size_t) pos < ctx->output_size); - output_pos[pos] = i; } } + } + } - data_ctx->write(&n_outputs, sizeof(n_outputs)); + void write_kv_cache_data(const struct llama_context * ctx, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) { + const struct llama_kv_cache & kv_self = ctx->kv_self; + const struct llama_hparams & hparams = ctx->model.hparams; - if (n_outputs) { - data_ctx->write(output_pos.data(), n_outputs * sizeof(int32_t)); - } - } + const uint32_t v_trans = kv_self.v_trans ? 
1 : 0; + const uint32_t n_layer = hparams.n_layer; - // copy logits - { - const size_t logits_size = std::min(ctx->logits_size, n_outputs * ctx->model.hparams.n_vocab); + write(&v_trans, sizeof(v_trans)); + write(&n_layer, sizeof(n_layer)); - data_ctx->write(&logits_size, sizeof(logits_size)); + std::vector<uint8_t> tmp_buf; - if (logits_size) { - data_ctx->write(ctx->logits, logits_size * sizeof(float)); - } - } + // Iterate and write all the keys first, each row is a cell + // Get whole range at a time + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - // copy embeddings - { - const size_t embeddings_size = std::min(ctx->embd_size, n_outputs * ctx->model.hparams.n_embd); + // Write key type + const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; + write(&k_type_i, sizeof(k_type_i)); - data_ctx->write(&embeddings_size, sizeof(embeddings_size)); + // Write row size of key + const uint64_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + write(&k_size_row, sizeof(k_size_row)); - if (embeddings_size) { - data_ctx->write(ctx->embd, embeddings_size * sizeof(float)); + // Read each range of cells of k_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * k_size_row; + write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size); } } - } - // copy kv cache - { - const auto & kv_self = ctx->kv_self; - const auto & hparams = ctx->model.hparams; - - const uint32_t n_layer = hparams.n_layer; - - // NOTE: kv_size and kv_buf_size are mostly used for sanity checks - const uint32_t kv_head = llama_kv_cache_cell_max(kv_self); - const uint32_t kv_size = kv_self.size; - const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head; - const uint32_t kv_used = kv_self.used; - const uint32_t v_trans = kv_self.v_trans ? 
1 : 0; - - data_ctx->write(&kv_buf_size, sizeof(kv_buf_size)); - data_ctx->write(&kv_head, sizeof(kv_head)); - data_ctx->write(&kv_size, sizeof(kv_size)); - data_ctx->write(&kv_used, sizeof(kv_used)); - data_ctx->write(&v_trans, sizeof(v_trans)); - - if (kv_buf_size) { - const size_t pre_kv_buf_size = data_ctx->get_size_written(); - - std::vector<uint8_t> tmp_buf; - for (int il = 0; il < (int) n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + if (!kv_self.v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); - - tmp_buf.resize(k_size); - ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size()); - data_ctx->write(tmp_buf.data(), tmp_buf.size()); + // Write value type + const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + write(&v_type_i, sizeof(v_type_i)); - if (kv_self.recurrent || !kv_self.v_trans) { - // v is contiguous for recurrent models - // TODO: use other tensors for state models than k and v - const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head); + // Write row size of value + const uint64_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); + write(&v_size_row, sizeof(v_size_row)); - tmp_buf.resize(v_size); - ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), 0, tmp_buf.size()); - data_ctx->write(tmp_buf.data(), tmp_buf.size()); - continue; + // Read each range of cells of v_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * v_size_row; + write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size); } + } + } else { + // When v is transposed, we also need the element size and get the element ranges from each row + const uint32_t kv_size = kv_self.size; + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + write(&v_type_i, sizeof(v_type_i)); - // v is not contiguous, copy row by row - const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); - const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size); + // Write element size + const uint32_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + write(&v_size_el, sizeof(v_size_el)); - tmp_buf.resize(v_row_size); - for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { - ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size()); - data_ctx->write(tmp_buf.data(), tmp_buf.size()); + // Write GQA embedding size + write(&n_embd_v_gqa, sizeof(n_embd_v_gqa)); + + // For each row, we get the element values of each cell + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + // Read each range of cells of v_size_el length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t src_offset = (range.first + j * kv_size) * v_size_el; + const size_t buf_size = range_size * v_size_el; + write_tensor_data(kv_self.v_l[il], src_offset, buf_size); + } } } - GGML_ASSERT(kv_buf_size == data_ctx->get_size_written() - pre_kv_buf_size); } + } - for (uint32_t i = 0; i < kv_head; ++i) { - const auto & cell = kv_self.cells[i]; - - const 
llama_pos pos = cell.pos; - const size_t seq_id_size = cell.seq_id.size(); - - data_ctx->write(&pos, sizeof(pos)); - data_ctx->write(&seq_id_size, sizeof(seq_id_size)); + void write_kv_cache(const struct llama_context * ctx, llama_seq_id seq_id = -1) { + const struct llama_kv_cache & kv_self = ctx->kv_self; + std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive + uint32_t cell_count = 0; - for (auto seq_id : cell.seq_id) { - data_ctx->write(&seq_id, sizeof(seq_id)); + // Count the number of cells with the specified seq_id + // Find all the ranges of cells with this seq id (or all, when -1) + uint32_t cell_range_begin = kv_self.size; + for (uint32_t i = 0; i < kv_self.size; ++i) { + const auto & cell = kv_self.cells[i]; + if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { + ++cell_count; + if (cell_range_begin == kv_self.size) { + cell_range_begin = i; + } + } else { + if (cell_range_begin != kv_self.size) { + cell_ranges.emplace_back(cell_range_begin, i); + cell_range_begin = kv_self.size; + } } } - } -} + if (cell_range_begin != kv_self.size) { + cell_ranges.emplace_back(cell_range_begin, kv_self.size); + } -size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) { - llama_data_buffer_context data_ctx(dst); - llama_state_get_data_internal(ctx, &data_ctx); + // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count + uint32_t cell_count_check = 0; + for (const auto & range : cell_ranges) { + cell_count_check += range.second - range.first; + } + GGML_ASSERT(cell_count == cell_count_check); - return data_ctx.get_size_written(); -} + write(&cell_count, sizeof(cell_count)); -// Sets the state reading from the specified source address -size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) { - llama_synchronize(ctx); + write_kv_cache_meta(kv_self, cell_ranges, seq_id); + write_kv_cache_data(ctx, cell_ranges); + } +}; - const uint8_t * inp = src; +struct llama_data_read { + virtual const uint8_t * read(size_t size) = 0; + virtual void read_to(void * dst, size_t size) = 0; + virtual size_t get_size_read() = 0; + virtual ~llama_data_read() = default; - // set rng - { - size_t rng_size; - memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size); + void read_string(std::string & str) { + uint32_t str_size; + read_to(&str_size, sizeof(str_size)); + + str.assign((const char *) read(str_size), str_size); + } - GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE); + // validate model information + void read_model_info(const struct llama_context * ctx) { + std::string cur_arch_str = LLM_ARCH_NAMES.at(ctx->model.arch); + std::string arch_str; + read_string(arch_str); + if (cur_arch_str != arch_str) { + throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str())); + } + // TODO: add more info which needs to be identical but which is not verified otherwise + } - std::string rng_str((const char *)inp, rng_size); inp += rng_size; + void read_rng(std::mt19937 & rng) { + std::string rng_str; + read_string(rng_str); std::istringstream rng_ss(rng_str); - rng_ss >> ctx->sampling.rng; + rng_ss >> rng; - GGML_ASSERT(!rng_ss.fail()); + if (rng_ss.fail()) { + throw std::runtime_error("failed to load RNG state"); + } } - // set output ids - { - size_t n_outputs; + void read_output_ids(struct llama_context * ctx) { std::vector<int32_t> output_pos; - memcpy(&n_outputs, inp, sizeof(n_outputs)); inp += sizeof(n_outputs); + uint32_t n_outputs; + 
read_to(&n_outputs, sizeof(n_outputs)); - GGML_ASSERT(n_outputs <= llama_output_reserve(*ctx, n_outputs)); + if (n_outputs > llama_output_reserve(*ctx, n_outputs)) { + throw std::runtime_error("could not reserve outputs"); + } if (n_outputs) { output_pos.resize(n_outputs); - memcpy(output_pos.data(), inp, n_outputs * sizeof(int32_t)); - inp += n_outputs * sizeof(int32_t); + read_to(output_pos.data(), n_outputs * sizeof(int32_t)); for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { int32_t id = output_pos[i]; - GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch); + if ((uint32_t) id >= ctx->cparams.n_batch) { + throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->cparams.n_batch)); + } ctx->output_ids[id] = i; } @@ -17770,611 +17952,571 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) { } } - // set logits - { - size_t logits_size; + void read_logits(struct llama_context * ctx) { + uint64_t logits_size; + read_to(&logits_size, sizeof(logits_size)); - memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size); - - GGML_ASSERT(ctx->logits_size >= logits_size); + if (ctx->logits_size < logits_size) { + throw std::runtime_error("logits buffer too small"); + } if (logits_size) { - memcpy(ctx->logits, inp, logits_size * sizeof(float)); - inp += logits_size * sizeof(float); + read_to(ctx->logits, logits_size * sizeof(float)); } } - // set embeddings - { - size_t embeddings_size; - - memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size); + void read_embeddings(struct llama_context * ctx) { + uint64_t embeddings_size; + read_to(&embeddings_size, sizeof(embeddings_size)); - GGML_ASSERT(ctx->embd_size >= embeddings_size); + if (ctx->embd_size < embeddings_size) { + throw std::runtime_error("embeddings buffer too small"); + } if (embeddings_size) { - memcpy(ctx->embd, inp, embeddings_size * sizeof(float)); - inp += embeddings_size * sizeof(float); + read_to(ctx->embd, embeddings_size * sizeof(float)); } } - // set kv cache - { - const auto & kv_self = ctx->kv_self; - const auto & hparams = ctx->model.hparams; + bool read_kv_cache_meta(struct llama_context * ctx, uint32_t cell_count, llama_seq_id dest_seq_id = -1) { + struct llama_kv_cache & kv_self = ctx->kv_self; - const uint32_t n_layer = hparams.n_layer; + if (dest_seq_id != -1) { + // single sequence - size_t kv_buf_size; - uint32_t kv_head; - uint32_t kv_size; - uint32_t kv_used; - uint32_t v_trans; + llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size); - memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head); - memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size); - memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used); - memcpy(&v_trans, inp, sizeof(v_trans)); inp += sizeof(v_trans); + llama_batch batch = llama_batch_init(cell_count, 0, 1); + batch.n_tokens = cell_count; + for (uint32_t i = 0; i < cell_count; ++i) { + llama_pos pos; + uint32_t n_seq_id; - GGML_ASSERT(kv_self.v_trans == (bool) v_trans); // incompatible V transposition + read_to(&pos, sizeof(pos)); + read_to(&n_seq_id, sizeof(n_seq_id)); - if (kv_self.size != kv_size) { - // the KV cache needs to be big enough to load all the KV cells from the saved state - GGML_ASSERT(kv_self.size >= kv_head); + if (n_seq_id != 0) { + LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__); + return false; + } - LLAMA_LOG_INFO("%s: state contains %d KV 
cells, was saved with kv_size=%d, but is loaded with kv_size=%d (fine, but different)\n", - __func__, kv_head, kv_size, kv_self.size); - } + batch.pos[i] = pos; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = dest_seq_id; + } + if (!llama_kv_cache_find_slot(kv_self, batch)) { + llama_batch_free(batch); + LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); + return false; + } - llama_kv_cache_clear(ctx); + // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values) + // Assume that this is one contiguous block of cells + GGML_ASSERT(kv_self.head + cell_count <= kv_self.size); + GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]); + GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]); + GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id)); + GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id)); - if (kv_buf_size) { - const size_t pre_kv_buf_size = inp - src; + // Cleanup + llama_batch_free(batch); + } else { + // whole KV cache restore - GGML_ASSERT(kv_self.total_size() >= kv_buf_size); + if (cell_count > kv_self.size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); + return false; + } - for (int il = 0; il < (int) n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + llama_kv_cache_clear(kv_self); - const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); + for (uint32_t i = 0; i < cell_count; ++i) { + llama_kv_cell & cell = kv_self.cells[i]; - ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size); - inp += k_size; + llama_pos pos; + uint32_t n_seq_id; - if (kv_self.recurrent || !kv_self.v_trans) { - // v is contiguous for recurrent models - // TODO: use other tensors for state models than k and v - const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head); + read_to(&pos, sizeof(pos)); + read_to(&n_seq_id, sizeof(n_seq_id)); - ggml_backend_tensor_set(kv_self.v_l[il], inp, 0, v_size); - inp += v_size; - continue; - } + cell.pos = pos; + + for (uint32_t j = 0; j < n_seq_id; ++j) { + llama_seq_id seq_id; + read_to(&seq_id, sizeof(seq_id)); - // v is not contiguous, copy row by row - const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); - const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_self.size); + if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { + LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); + return false; + } - for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { - ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size); - inp += v_row_size; + cell.seq_id.insert(seq_id); } } - GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size); + + kv_self.head = 0; + kv_self.used = cell_count; } - ctx->kv_self.head = kv_head; - ctx->kv_self.used = kv_used; + return true; + } - for (uint32_t i = 0; i < kv_head; ++i) { - llama_pos pos; - size_t seq_id_size; + bool read_kv_cache_data(struct llama_context * ctx, uint32_t cell_count) { + const struct llama_hparams & hparams = ctx->model.hparams; + struct llama_kv_cache & kv_self = ctx->kv_self; + uint32_t v_trans; + uint32_t n_layer; + read_to(&v_trans, sizeof(v_trans)); + read_to(&n_layer, sizeof(n_layer)); - memcpy(&pos, inp, 
sizeof(pos)); inp += sizeof(pos); - memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size); + if (n_layer != hparams.n_layer) { + LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); + return false; + } + if (cell_count > kv_self.size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv_self.size); + return false; + } + if (kv_self.v_trans != (bool) v_trans) { + LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); + return false; + } - ctx->kv_self.cells[i].pos = pos; + // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - llama_seq_id seq_id; + // Read type of key + int32_t k_type_i_ref; + read_to(&k_type_i_ref, sizeof(k_type_i_ref)); + const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; + if (k_type_i != k_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); + return false; + } + + // Read row size of key + uint64_t k_size_row_ref; + read_to(&k_size_row_ref, sizeof(k_size_row_ref)); + const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + if (k_size_row != k_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); + return false; + } - for (size_t j = 0; j < seq_id_size; ++j) { - memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id); - ctx->kv_self.cells[i].seq_id.insert(seq_id); + if (cell_count) { + // Read and set the keys for the whole cell range + ggml_backend_tensor_set(kv_self.k_l[il], read(cell_count * k_size_row), kv_self.head * k_size_row, cell_count * k_size_row); } } - } - const size_t nread = inp - src; - const size_t max_size = llama_state_get_size(ctx); + if (!kv_self.v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - GGML_ASSERT(nread <= max_size); + // Read type of value + int32_t v_type_i_ref; + read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } - return nread; -} + // Read row size of value + uint64_t v_size_row_ref; + read_to(&v_size_row_ref, sizeof(v_size_row_ref)); + const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); + if (v_size_row != v_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); + return false; + } -static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - llama_file file(path_session, "rb"); + if (cell_count) { + // Read and set the values for the whole cell range + ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_row), kv_self.head * v_size_row, cell_count * v_size_row); + } + } + } else { + // For each layer, read the values for each cell (transposed) + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - // sanity checks - { - 
const uint32_t magic = file.read_u32(); - const uint32_t version = file.read_u32(); + // Read type of value + int32_t v_type_i_ref; + read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } - if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) { - LLAMA_LOG_ERROR("%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); - return false; - } + // Read element size of value + uint32_t v_size_el_ref; + read_to(&v_size_el_ref, sizeof(v_size_el_ref)); + const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + if (v_size_el != v_size_el_ref) { + LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); + return false; + } - llama_hparams session_hparams; - file.read_raw(&session_hparams, sizeof(llama_hparams)); + // Read GQA embedding size + uint32_t n_embd_v_gqa_ref; + read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); + if (n_embd_v_gqa != n_embd_v_gqa_ref) { + LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il); + return false; + } - if (session_hparams != ctx->model.hparams) { - LLAMA_LOG_INFO("%s : model hparams didn't match from session file!\n", __func__); - return false; + if (cell_count) { + // For each row in the transposed matrix, read the values for the whole cell range + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + const size_t dst_offset = (kv_self.head + j * kv_self.size) * v_size_el; + ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + } + } + } } + return true; } - // load the prompt - { - const uint32_t n_token_count = file.read_u32(); + void read_kv_cache(struct llama_context * ctx, llama_seq_id seq_id = -1) { + uint32_t cell_count; + read_to(&cell_count, sizeof(cell_count)); - if (n_token_count > n_token_capacity) { - LLAMA_LOG_ERROR("%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); - return false; - } + bool res = read_kv_cache_meta(ctx, cell_count, seq_id) && read_kv_cache_data(ctx, cell_count); - file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); - *n_token_count_out = n_token_count; + if (!res) { + if (seq_id == -1) { + llama_kv_cache_clear(ctx); + } else { + llama_kv_cache_seq_rm(ctx, seq_id, -1, -1); + } + throw std::runtime_error("failed to restore kv cache"); + } } +}; - // restore the context state - { - const size_t n_state_size_cur = file.size - file.tell(); - const size_t n_state_size_max = llama_state_get_size(ctx); - - if (n_state_size_cur > n_state_size_max) { - LLAMA_LOG_ERROR("%s : the state size in session file is too big! 
max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur); - return false; - } +struct llama_data_write_dummy : llama_data_write { + size_t size_written = 0; - std::vector<uint8_t> state_data(n_state_size_max); - file.read_raw(state_data.data(), n_state_size_cur); + llama_data_write_dummy() {} - llama_state_set_data(ctx, state_data.data()); + void write(const void * /* src */, size_t size) override { + size_written += size; } - return true; -} - -bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - try { - return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("error loading session file: %s\n", err.what()); - return false; + void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override { + size_written += size; } -} - -static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { - llama_file file(path_session, "wb"); - file.write_u32(LLAMA_SESSION_MAGIC); - file.write_u32(LLAMA_SESSION_VERSION); - - file.write_raw(&ctx->model.hparams, sizeof(llama_hparams)); - - // save the prompt - file.write_u32((uint32_t) n_token_count); - file.write_raw(tokens, sizeof(llama_token) * n_token_count); - - // save the context state using stream saving - llama_data_file_context data_ctx(&file); - llama_state_get_data_internal(ctx, &data_ctx); - - return true; -} - -bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { - try { - return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("error saving session file: %s\n", err.what()); - return false; + size_t get_size_written() override { + return size_written; } -} - -size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id) { - // save the size of size_t as a uint32_t for safety check - const size_t size_t_size_size = sizeof(uint32_t); +}; - // other values - const size_t s_cell_count_size = sizeof(uint32_t); - const size_t s_layer_count_size = sizeof(uint32_t); - const size_t n_embd_v_gqa_size = sizeof(uint32_t); +struct llama_data_write_buffer : llama_data_write { + uint8_t * ptr; + size_t buf_size = 0; + size_t size_written = 0; - size_t s_cell_count = 0; - size_t s_cell_data_size = 0; - const auto & kv_self = ctx->kv_self; - const auto & hparams = ctx->model.hparams; + llama_data_write_buffer(uint8_t * p, size_t len) : ptr(p), buf_size(len) {} - const uint32_t n_layer = hparams.n_layer; + void write(const void * src, size_t size) override { + if (size > buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); + } + memcpy(ptr, src, size); + ptr += size; + size_written += size; + buf_size -= size; + } - for (uint32_t i = 0; i < kv_self.size; ++i) { - const auto & cell = kv_self.cells[i]; - if (cell.seq_id.count(seq_id) > 0) { - ++s_cell_count; - s_cell_data_size += sizeof(llama_pos); + void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override { + if (size > buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); } + ggml_backend_tensor_get(tensor, ptr, offset, size); + ptr += size; + size_written += size; + buf_size -= size; } - for 
(int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + size_t get_size_written() override { + return size_written; + } +}; - // types of keys and values - s_cell_data_size += sizeof(int32_t) * 2; - // k_size_row and v_size_el values of layer - s_cell_data_size += sizeof(size_t) * 2; +struct llama_data_read_buffer : llama_data_read { + const uint8_t * ptr; + size_t buf_size = 0; + size_t size_read = 0; - // keys - const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); - s_cell_data_size += k_size_row * s_cell_count; + llama_data_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {} - // values (transposed) - const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); - s_cell_data_size += v_size_el * s_cell_count * n_embd_v_gqa; + const uint8_t * read(size_t size) override { + const uint8_t * base_ptr = ptr; + if (size > buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); + } + ptr += size; + size_read += size; + buf_size -= size; + return base_ptr; } - const size_t s_total = ( - size_t_size_size + - s_cell_count_size + - s_layer_count_size + - n_embd_v_gqa_size + - s_cell_data_size - ); - - return s_total; -} - -static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) { - llama_synchronize(ctx); + void read_to(void * dst, size_t size) override { + memcpy(dst, read(size), size); + } - const auto & kv_self = ctx->kv_self; - GGML_ASSERT(!kv_self.recurrent); // not implemented + size_t get_size_read() override { + return size_read; + } +}; - // Save the size of size_t as a uint32_t for safety check - const uint32_t size_t_size = sizeof(size_t); - data_ctx.write(&size_t_size, sizeof(size_t_size)); +struct llama_data_write_file : llama_data_write { + llama_file * file; + size_t size_written = 0; + std::vector<uint8_t> temp_buffer; - std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive - uint32_t cell_count = 0; + llama_data_write_file(llama_file * f) : file(f) {} - // Count the number of cells with the specified seq_id - // Find all the ranges of cells with this seq id - { - uint32_t cell_range_begin = kv_self.size; - for (uint32_t i = 0; i < kv_self.size; ++i) { - const auto & cell = kv_self.cells[i]; - if (cell.has_seq_id(seq_id)) { - ++cell_count; - if (cell_range_begin == kv_self.size) { - cell_range_begin = i; - } - } - else { - if (cell_range_begin != kv_self.size) { - cell_ranges.emplace_back(cell_range_begin, i); - cell_range_begin = kv_self.size; - } - } - } - if (cell_range_begin != kv_self.size) { - cell_ranges.emplace_back(cell_range_begin, kv_self.size); - } + void write(const void * src, size_t size) override { + file->write_raw(src, size); + size_written += size; + } - // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count - uint32_t cell_count_check = 0; - for (const auto & range : cell_ranges) { - cell_count_check += range.second - range.first; - } - GGML_ASSERT(cell_count == cell_count_check); + void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override { + temp_buffer.resize(size); + ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size); + write(temp_buffer.data(), temp_buffer.size()); } - // Write the cell count - data_ctx.write(&cell_count, sizeof(cell_count)); + size_t 
get_size_written() override { + return size_written; + } +}; - const auto & hparams = ctx->model.hparams; - const uint32_t n_layer = hparams.n_layer; +struct llama_data_read_file : llama_data_read { + llama_file * file; + size_t size_read = 0; + std::vector<uint8_t> temp_buffer; - // Write the layer count - data_ctx.write(&n_layer, sizeof(n_layer)); + llama_data_read_file(llama_file * f) : file(f) {} - // Write n_embd_v_gqa (reference value) - { - const uint32_t n_embd_v_gqa_ref = hparams.n_embd_v_gqa() + hparams.n_embd_k_s(); - data_ctx.write(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); + void read_to(void * dst, size_t size) override { + file->read_raw(dst, size); + size_read += size; } - // Iterate the ranges and write all the pos (this is the token position in the prompt) - for (const auto & range : cell_ranges) { - for (uint32_t i = range.first; i < range.second; ++i) { - const auto & cell = kv_self.cells[i]; - data_ctx.write(&cell.pos, sizeof(cell.pos)); - } + const uint8_t * read(size_t size) override { + temp_buffer.resize(size); + read_to(temp_buffer.data(), size); + return temp_buffer.data(); } - // Iterate and write all the keys first, each row is a cell - // Get whole range at a time - std::vector<uint8_t> tmp_buf; - for (int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + size_t get_size_read() override { + return size_read; + } +}; - // Write key type - const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; - data_ctx.write(&k_type_i, sizeof(k_type_i)); +/** copy state data into either a buffer or file depending on the passed in context + * + * file context: + * llama_file file("/path", "wb"); + * llama_data_write_file data_ctx(&file); + * llama_state_get_data_internal(ctx, data_ctx); + * + * buffer context: + * std::vector<uint8_t> buf(max_size, 0); + * llama_data_write_buffer data_ctx(buf.data(), max_size); + * llama_state_get_data_internal(ctx, data_ctx); + * +*/ +static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx) { + llama_synchronize(ctx); - // Write row size of key - const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); - data_ctx.write(&k_size_row, sizeof(k_size_row)); + data_ctx.write_model_info(ctx); - // Read each range of cells of k_size length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - tmp_buf.resize(range_size * k_size_row); - ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row); - data_ctx.write(tmp_buf.data(), tmp_buf.size()); - } - } + data_ctx.write_rng(ctx->sampling.rng); - // TODO: simplify, reduce copy-paste - if (!kv_self.v_trans) { - for (int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + // copy outputs + data_ctx.write_output_ids(ctx); + data_ctx.write_logits(ctx); + data_ctx.write_embeddings(ctx); - // Write value type - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; - data_ctx.write(&v_type_i, sizeof(v_type_i)); + data_ctx.write_kv_cache(ctx); - // Write row size of value - const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); - data_ctx.write(&v_size_row, sizeof(v_size_row)); + return data_ctx.get_size_written(); +} - // Read each range of cells of v_size length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t 
range_size = range.second - range.first; - tmp_buf.resize(range_size * v_size_row); - ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row); - data_ctx.write(tmp_buf.data(), tmp_buf.size()); - } - } - } else { - // For the values, they are transposed, so we also need the element size and get the element ranges from each row - const uint32_t kv_size = kv_self.size; - for (int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Write value type - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; - data_ctx.write(&v_type_i, sizeof(v_type_i)); - - // Write element size - const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); - data_ctx.write(&v_size_el, sizeof(v_size_el)); - - // For each row, we get the element values of each cell - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - // Read each range of cells of v_size_el length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t src_offset = (range.first + j * kv_size) * v_size_el; - tmp_buf.resize(range_size * v_size_el); - ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size()); - data_ctx.write(tmp_buf.data(), tmp_buf.size()); - } - } - } +size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst, size_t size) { + llama_data_write_buffer data_ctx(dst, size); + try { + return llama_state_get_data_internal(ctx, data_ctx); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); + return 0; } - - return data_ctx.get_size_written(); } -size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_seq_id seq_id) { - llama_data_buffer_context data_ctx(dst); - return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id); +// Returns the *actual* size of the state. +// Intended to be used when saving to state to a buffer. 
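With the writer abstraction in place, llama_state_get_size runs the same serialization path through the dummy (counting) writer, so it reports the exact number of bytes llama_state_get_data will produce instead of a worst-case upper bound. A minimal sketch of the intended caller pattern, assuming only the sized entry points added in this diff (buffer handling and error policy are illustrative):

    #include <cstdint>
    #include <vector>
    #include "llama.h"

    // Serialize the full context state into a heap buffer.
    static std::vector<uint8_t> save_state(llama_context * ctx) {
        std::vector<uint8_t> buf(llama_state_get_size(ctx));          // exact size from the dummy writer
        const size_t written = llama_state_get_data(ctx, buf.data(), buf.size());
        buf.resize(written);                                           // 0 signals a serialization error
        return buf;
    }

    // Restore it later; returns the number of bytes consumed (0 on failure).
    static size_t load_state(llama_context * ctx, const std::vector<uint8_t> & buf) {
        return llama_state_set_data(ctx, buf.data(), buf.size());
    }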
+size_t llama_state_get_size(struct llama_context * ctx) { + llama_data_write_dummy data_ctx; + try { + return llama_state_get_data_internal(ctx, data_ctx); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); + return 0; + } } -size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) { +static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx) { llama_synchronize(ctx); - auto & kv_self = ctx->kv_self; - GGML_ASSERT(!kv_self.recurrent); // not implemented + data_ctx.read_model_info(ctx); + + // set rng + data_ctx.read_rng(ctx->sampling.rng); - // Wipe the slot - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); + // set outputs + data_ctx.read_output_ids(ctx); + data_ctx.read_logits(ctx); + data_ctx.read_embeddings(ctx); - const uint8_t * inp = src; + data_ctx.read_kv_cache(ctx); - // Read size of size_t - uint32_t size_t_size; - memcpy(&size_t_size, inp, sizeof(size_t_size)); - inp += sizeof(size_t_size); - if (size_t_size != sizeof(size_t)) { - LLAMA_LOG_ERROR("%s: size_t size mismatch\n", __func__); + return data_ctx.get_size_read(); +} + +// Sets the state reading from the specified source address +size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src, size_t size) { + llama_data_read_buffer data_ctx(src, size); + try { + return llama_state_set_data_internal(ctx, data_ctx); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); return 0; } +} - // Read the cell count - uint32_t cell_count; - memcpy(&cell_count, inp, sizeof(cell_count)); - inp += sizeof(cell_count); +static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + llama_file file(path_session, "rb"); - // Read the layer count - uint32_t n_layer_ref; - memcpy(&n_layer_ref, inp, sizeof(n_layer_ref)); - inp += sizeof(n_layer_ref); + // sanity checks + { + const uint32_t magic = file.read_u32(); + const uint32_t version = file.read_u32(); - // Read n_embd_v_gqa - uint32_t n_embd_v_gqa_ref; - memcpy(&n_embd_v_gqa_ref, inp, sizeof(n_embd_v_gqa_ref)); - inp += sizeof(n_embd_v_gqa_ref); + if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) { + LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); + return false; + } + } - // Sanity check model compatibility - const auto & hparams = ctx->model.hparams; - const uint32_t n_layer = hparams.n_layer; + // load the prompt + { + const uint32_t n_token_count = file.read_u32(); - if (n_layer != n_layer_ref) { - LLAMA_LOG_ERROR("%s: mismatched n_layer (%d != %d)\n", __func__, n_layer, n_layer_ref); - return 0; - } + if (n_token_count > n_token_capacity) { + LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! 
%u > %zu\n", __func__, n_token_count, n_token_capacity); + return false; + } - if (hparams.n_embd_v_gqa() != n_embd_v_gqa_ref) { - LLAMA_LOG_ERROR("%s: mismatched n_embd_v_gqa (%d != %d)\n", __func__, hparams.n_embd_v_gqa(), n_embd_v_gqa_ref); - return 0; + file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); + *n_token_count_out = n_token_count; } - // Allocate the new cells for the slot - if (cell_count) { - llama_batch batch = llama_batch_init(cell_count, 0, 1); - batch.n_tokens = cell_count; - for (uint32_t i = 0; i < cell_count; ++i) { - llama_pos pos; - memcpy(&pos, inp, sizeof(pos)); - inp += sizeof(pos); + // restore the context state + { + const size_t n_state_size_cur = file.size - file.tell(); + + llama_data_read_file data_ctx(&file); + const size_t n_read = llama_state_set_data_internal(ctx, data_ctx); - batch.pos[i] = pos; - batch.n_seq_id[i] = 1; - batch.seq_id[i][0] = dest_seq_id; - } - if (!llama_kv_cache_find_slot(kv_self, batch)) { - llama_batch_free(batch); - LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); - return 0; + if (n_read != n_state_size_cur) { + LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read); + return false; } + } + return true; +} - // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values) - // Assume that this is one contiguous block of cells - GGML_ASSERT(kv_self.head + cell_count <= kv_self.size); - GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]); - GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]); - GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id)); - GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id)); - - // Cleanup - llama_batch_free(batch); +bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + try { + return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading session file: %s\n", __func__, err.what()); + return false; } +} - const uint32_t kv_size = kv_self.size; - const uint32_t kv_head = kv_self.head; - - // For each layer, read the keys for each cell, one row is one cell, read as one contiguous blo - for (int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - - // Read type of key - int32_t k_type_i_ref; - memcpy(&k_type_i_ref, inp, sizeof(k_type_i_ref)); - inp += sizeof(k_type_i_ref); - const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; - if (k_type_i != k_type_i_ref) { - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); - return 0; - } +static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { + llama_file file(path_session, "wb"); - // Read row size of key - size_t k_size_row_ref; - memcpy(&k_size_row_ref, inp, sizeof(k_size_row_ref)); - inp += sizeof(k_size_row_ref); - const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); - if (k_size_row != k_size_row_ref) { - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - 
LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, k_size_row_ref, il); - return 0; - } + file.write_u32(LLAMA_SESSION_MAGIC); + file.write_u32(LLAMA_SESSION_VERSION); - if (cell_count) { - // Read and set the keys for the whole cell range - ggml_backend_tensor_set(kv_self.k_l[il], inp, kv_head * k_size_row, cell_count * k_size_row); - inp += cell_count * k_size_row; - } + // save the prompt + file.write_u32((uint32_t) n_token_count); + file.write_raw(tokens, sizeof(llama_token) * n_token_count); + + // save the context state using stream saving + llama_data_write_file data_ctx(&file); + llama_state_get_data_internal(ctx, data_ctx); + + return true; +} + +bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { + try { + return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error saving session file: %s\n", __func__, err.what()); + return false; } +} - // TODO: simplify, reduce copy-paste - if (!kv_self.v_trans) { - for (int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); +static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) { + llama_synchronize(ctx); - // Read type of value - int32_t v_type_i_ref; - memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref)); - inp += sizeof(v_type_i_ref); - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; - if (v_type_i != v_type_i_ref) { - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); - return 0; - } + data_ctx.write_kv_cache(ctx, seq_id); - // Read row size of value - size_t v_size_row_ref; - memcpy(&v_size_row_ref, inp, sizeof(v_size_row_ref)); - inp += sizeof(v_size_row_ref); - const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); - if (v_size_row != v_size_row_ref) { - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, v_size_row_ref, il); - return 0; - } + return data_ctx.get_size_written(); +} - if (cell_count) { - // Read and set the values for the whole cell range - ggml_backend_tensor_set(kv_self.v_l[il], inp, kv_head * v_size_row, cell_count * v_size_row); - inp += cell_count * v_size_row; - } - } - } else { - // For each layer, read the values for each cell (transposed) - for (int il = 0; il < (int)n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Read type of value - int32_t v_type_i_ref; - memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref)); - inp += sizeof(v_type_i_ref); - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; - if (v_type_i != v_type_i_ref) { - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); - return 0; - } - - // Read element size of value - size_t v_size_el_ref; - memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref)); - inp += sizeof(v_size_el_ref); - const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); - if (v_size_el != v_size_el_ref) { - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); - LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer 
%d)\n", __func__, v_size_el, v_size_el_ref, il); - return 0; - } +size_t llama_state_seq_get_size(struct llama_context * ctx, llama_seq_id seq_id) { + llama_data_write_dummy data_ctx; + return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id); +} - if (cell_count) { - // For each row in the transposed matrix, read the values for the whole cell range - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - const size_t dst_offset = (kv_head + j * kv_size) * v_size_el; - ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el); - inp += cell_count * v_size_el; - } - } - } +size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) { + llama_data_write_buffer data_ctx(dst, size); + try { + return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error saving sequence state: %s\n", __func__, err.what()); + return 0; } +} - const size_t nread = inp - src; +static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) { + llama_synchronize(ctx); + + data_ctx.read_kv_cache(ctx, dest_seq_id); - return nread; + return data_ctx.get_size_read(); +} + +size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id dest_seq_id) { + llama_data_read_buffer data_ctx(src, size); + try { + return llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading sequence state: %s\n", __func__, err.what()); + return 0; + } } static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) { @@ -18384,11 +18526,11 @@ static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, con file.write_u32(LLAMA_STATE_SEQ_VERSION); // save the prompt - file.write_u32((uint32_t)n_token_count); + file.write_u32((uint32_t) n_token_count); file.write_raw(tokens, sizeof(llama_token) * n_token_count); // save the context state using stream saving - llama_data_file_context data_ctx(&file); + llama_data_write_file data_ctx(&file); llama_state_seq_get_data_internal(ctx, data_ctx, seq_id); const size_t res = file.tell(); @@ -18426,9 +18568,8 @@ static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, con // restore the context state { const size_t state_size = file.size - file.tell(); - std::vector<uint8_t> state_data(state_size); - file.read_raw(state_data.data(), state_size); - const size_t nread = llama_state_seq_set_data(ctx, state_data.data(), dest_seq_id); + llama_data_read_file data_ctx(&file); + const size_t nread = llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id); if (!nread) { LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__); return 0; @@ -18444,7 +18585,7 @@ size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepa try { return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count); } catch (const std::exception & err) { - LLAMA_LOG_ERROR("error saving sequence state file: %s\n", err.what()); + LLAMA_LOG_ERROR("%s: error saving sequence state file: %s\n", __func__, err.what()); return 0; } } @@ -18453,7 +18594,7 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa try { return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, 
tokens_out, n_token_capacity, n_token_count_out); } catch (const std::exception & err) { - LLAMA_LOG_ERROR("error loading sequence state file: %s\n", err.what()); + LLAMA_LOG_ERROR("%s: error loading sequence state file: %s\n", __func__, err.what()); return 0; } } @@ -18625,7 +18766,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG - GGML_ASSERT(false); + GGML_ABORT("fatal error"); #endif return nullptr; } @@ -18670,7 +18811,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) { } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG - GGML_ASSERT(false); + GGML_ABORT("fatal error"); #endif return nullptr; } |
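The sequence-level API follows the same pattern: llama_state_seq_get_size reuses the dummy writer, and the get/set entry points now take an explicit buffer size so truncated or undersized buffers are reported rather than read past. A short sketch of copying one sequence's KV cells between two contexts created from the same model; the sequence ids and helper name are illustrative assumptions:

    #include <cstdint>
    #include <vector>
    #include "llama.h"

    // Copy the KV cache contents of seq_id 0 in `src` into seq_id 1 of `dst`.
    static bool copy_sequence_state(llama_context * src, llama_context * dst) {
        const size_t need = llama_state_seq_get_size(src, /*seq_id=*/0);
        std::vector<uint8_t> buf(need);

        const size_t written = llama_state_seq_get_data(src, buf.data(), buf.size(), /*seq_id=*/0);
        if (written == 0) {
            return false;                  // serialization failed
        }

        const size_t read = llama_state_seq_set_data(dst, buf.data(), written, /*dest_seq_id=*/1);
        return read == written;            // 0 means the cells could not be restored and were rolled back
    }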