summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/llama-grammar.cpp4
-rw-r--r--src/llama-impl.h15
-rw-r--r--src/llama-vocab.cpp40
-rw-r--r--src/llama-vocab.h2
-rw-r--r--src/llama.cpp2213
5 files changed, 1213 insertions, 1061 deletions
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
index bd9322e2..b123d733 100644
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@@ -221,7 +221,7 @@ static void llama_grammar_advance_stack(
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
// (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
// those
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
@@ -517,7 +517,7 @@ void llama_grammar_accept_token_impl(struct llama_grammar * grammar, const struc
return;
}
}
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
const std::string & piece = vocab->cache_token_to_piece.at(token);
diff --git a/src/llama-impl.h b/src/llama-impl.h
index dcc8c1c1..399b134a 100644
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@@ -24,3 +24,18 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+//
+// helpers
+//
+
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+ if (search.empty()) {
+ return; // Avoid infinite loop if 'search' is an empty string
+ }
+ size_t pos = 0;
+ while ((pos = s.find(search, pos)) != std::string::npos) {
+ s.replace(pos, search.length(), replace);
+ pos += replace.length();
+ }
+}
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index c482b368..749f8571 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -16,20 +16,6 @@
// helpers
//
-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
- std::string result;
- for (size_t pos = 0; ; pos += search.length()) {
- auto new_pos = s.find(search, pos);
- if (new_pos == std::string::npos) {
- result += s.substr(pos, s.size() - pos);
- break;
- }
- result += s.substr(pos, new_pos - pos) + replace;
- pos = new_pos;
- }
- s = std::move(result);
-}
-
LLAMA_ATTRIBUTE_FORMAT(1, 2)
static std::string format(const char * fmt, ...) {
va_list ap;
@@ -152,14 +138,14 @@ static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
return strtol(buf.c_str(), NULL, 16);
}
case LLAMA_VOCAB_TYPE_BPE: {
- GGML_ASSERT(false);
- return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
+ GGML_ABORT("fatal error");
+ //return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
}
case LLAMA_VOCAB_TYPE_WPM: {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
@@ -816,6 +802,9 @@ struct llm_tokenizer_ugm {
* the best tokenization.
*/
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+ // get current size of output (for reversal later)
+ size_t output_size = output.size();
+
// normalize the input first
std::string normalized;
normalize(text, &normalized);
@@ -895,7 +884,7 @@ struct llm_tokenizer_ugm {
}
// reverse the output since we added tokens starting from the end of the input
- std::reverse(output.begin(), output.end());
+ std::reverse(output.begin() + output_size, output.end());
}
private:
@@ -1396,7 +1385,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
}
} break;
case LLAMA_VOCAB_TYPE_NONE:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
return output;
@@ -1422,7 +1411,7 @@ llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch) {
return vocab.token_to_id.at(unicode_byte_to_utf8(ch));
}
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
@@ -1444,7 +1433,8 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
return token != -1 && (
token == llama_token_eos_impl(vocab) ||
- token == llama_token_eot_impl(vocab)
+ token == llama_token_eot_impl(vocab) ||
+ token == llama_token_eom_impl(vocab)
);
}
@@ -1500,6 +1490,10 @@ llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
return vocab.special_eot_id;
}
+llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
+ return vocab.special_eom_id;
+}
+
int32_t llama_tokenize_impl(
const struct llama_vocab & vocab,
const char * text,
@@ -1606,7 +1600,7 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
break;
}
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index 30b565d5..7adfc16d 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -45,6 +45,7 @@ struct llama_vocab {
id special_suffix_id = -1;
id special_middle_id = -1;
id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
+ id special_eom_id = -1;
// tokenizer flags
bool tokenizer_add_space_prefix = false;
@@ -101,6 +102,7 @@ llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
llama_token llama_token_eot_impl (const struct llama_vocab & vocab);
+llama_token llama_token_eom_impl (const struct llama_vocab & vocab);
int32_t llama_tokenize_impl(
const struct llama_vocab & vocab,
diff --git a/src/llama.cpp b/src/llama.cpp
index 71be05f9..dce58dfe 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -102,7 +102,6 @@
#endif
// bump if necessary
-#define LLAMA_MAX_NODES 8192
#define LLAMA_MAX_LAYERS 512
#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
@@ -123,20 +122,6 @@ static std::string trim(const std::string & str) {
return str.substr(start, end - start);
}
-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
- std::string result;
- for (size_t pos = 0; ; pos += search.length()) {
- auto new_pos = s.find(search, pos);
- if (new_pos == std::string::npos) {
- result += s.substr(pos, s.size() - pos);
- break;
- }
- result += s.substr(pos, new_pos - pos) + replace;
- pos = new_pos;
- }
- s = std::move(result);
-}
-
static bool is_float_close(float a, float b, float abs_tol) {
// Check for non-negative tolerance
if (abs_tol < 0.0) {
@@ -224,6 +209,7 @@ enum llm_arch {
LLM_ARCH_CHATGLM,
LLM_ARCH_BITNET,
LLM_ARCH_T5,
+ LLM_ARCH_T5ENCODER,
LLM_ARCH_JAIS,
LLM_ARCH_UNKNOWN,
};
@@ -268,6 +254,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_CHATGLM, "chatglm" },
{ LLM_ARCH_BITNET, "bitnet" },
{ LLM_ARCH_T5, "t5" },
+ { LLM_ARCH_T5ENCODER, "t5encoder" },
{ LLM_ARCH_JAIS, "jais" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -364,6 +351,7 @@ enum llm_kv {
LLM_KV_TOKENIZER_SUFFIX_ID,
LLM_KV_TOKENIZER_MIDDLE_ID,
LLM_KV_TOKENIZER_EOT_ID,
+ LLM_KV_TOKENIZER_EOM_ID,
LLM_KV_ADAPTER_TYPE,
LLM_KV_ADAPTER_LORA_ALPHA,
@@ -461,6 +449,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
{ LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
{ LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
+ { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
{ LLM_KV_ADAPTER_TYPE, "adapter.type" },
{ LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
@@ -1208,6 +1197,21 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
},
},
{
+ LLM_ARCH_CHATGLM,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ },
+ },
+ {
LLM_ARCH_BITNET,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
@@ -1261,6 +1265,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
},
},
{
+ LLM_ARCH_T5ENCODER,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
+ { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" },
+ { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" },
+ { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" },
+ { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" },
+ { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" },
+ { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" },
+ { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" },
+ { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" },
+ { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" },
+ { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" },
+ },
+ },
+ {
LLM_ARCH_JAIS,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
@@ -2245,8 +2267,7 @@ struct llama_hparams {
return n_head_arr[il];
}
- GGML_ASSERT(false);
- return 0;
+ GGML_ABORT("fatal error");
}
uint32_t n_head_kv(uint32_t il = 0) const {
@@ -2254,8 +2275,7 @@ struct llama_hparams {
return n_head_kv_arr[il];
}
- GGML_ASSERT(false);
- return 0;
+ GGML_ABORT("fatal error");
}
uint32_t n_ff(uint32_t il = 0) const {
@@ -2263,8 +2283,7 @@ struct llama_hparams {
return n_ff_arr[il];
}
- GGML_ASSERT(false);
- return 0;
+ GGML_ABORT("fatal error");
}
uint32_t n_gqa(uint32_t il = 0) const {
@@ -2441,6 +2460,7 @@ struct llama_layer {
// long rope factors
struct ggml_tensor * rope_long = nullptr;
struct ggml_tensor * rope_short = nullptr;
+ struct ggml_tensor * rope_freqs = nullptr;
// bitnet scale
struct ggml_tensor * wq_scale;
@@ -2922,7 +2942,7 @@ static bool llama_kv_cache_init(
// TODO: find a nicer way to add other recurrent model architectures
cache.recurrent = model.arch == LLM_ARCH_MAMBA;
- cache.v_trans = !cparams.flash_attn;
+ cache.v_trans = !cache.recurrent && !cparams.flash_attn;
cache.head = 0;
cache.size = kv_size;
@@ -3556,6 +3576,15 @@ namespace GGUFMeta {
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
+// TODO: update when needed or think of some clever automatic way to do this
+static size_t llama_model_max_nodes(const llama_model & /*model*/) {
+ //if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B
+ // return 32768;
+ //}
+
+ return 8192;
+}
+
struct llama_model_loader {
int n_kv = 0;
int n_tensors = 0;
@@ -4467,14 +4496,14 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_IQ4_K: return "IQ4_K - 4.5 bpw";
case LLAMA_FTYPE_MOSTLY_IQ5_K: return "IQ5_K - 5.5 bpw";
case LLAMA_FTYPE_MOSTLY_IQ6_K: return "IQ6_K - 6.6 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ1_BN: return "IQ1_BN - 1.625 bpw Bitnet";
+ case LLAMA_FTYPE_MOSTLY_IQ2_BN: return "IQ2_BN - 2.00 bpw Bitnet";
+ case LLAMA_FTYPE_MOSTLY_IQ2_TN: return "IQT_BN - 2.06 bpw TriLM";
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
- case LLAMA_FTYPE_MOSTLY_IQ1_BN: return "IQ1_BN - 1.625 bpw Bitnet";
- case LLAMA_FTYPE_MOSTLY_IQ2_BN: return "IQ2_BN - 2.00 bpw Bitnet";
- case LLAMA_FTYPE_MOSTLY_IQ2_TN: return "IQT_BN - 2.06 bpw TriLM";
default: return "unknown, may not work";
}
@@ -4898,6 +4927,22 @@ static void llm_load_hparams(
case 40: model.type = e_model::MODEL_14B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
+
+ // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
+ if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
+ // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
+ hparams.n_swa = 2047;
+ } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
+ // default value for Phi-3-mini-128k-instruct
+ hparams.n_swa = 262144;
+ } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
+ // default value for Phi-3-medium-128k-instruct
+ hparams.n_swa = 131072;
+ }
+ bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (!found_swa && hparams.n_swa == 0) {
+ throw std::runtime_error("invalid value for sliding_window");
+ }
} break;
case LLM_ARCH_PLAMO:
{
@@ -4965,6 +5010,7 @@ static void llm_load_hparams(
hparams.attn_soft_cap = true;
switch (hparams.n_layer) {
+ case 26: model.type = e_model::MODEL_2B; break;
case 42: model.type = e_model::MODEL_9B; break;
case 46: model.type = e_model::MODEL_27B; break;
default: model.type = e_model::MODEL_UNKNOWN;
@@ -5194,6 +5240,12 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
+ case LLM_ARCH_T5ENCODER:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
+ model.type = e_model::MODEL_UNKNOWN;
+ } break;
case LLM_ARCH_JAIS:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -5588,6 +5640,7 @@ static void llm_load_vocab(
{ LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
{ LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
{ LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
+ { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id },
};
for (const auto & it : special_token_types) {
@@ -5640,6 +5693,17 @@ static void llm_load_vocab(
}
}
}
+
+ // find EOM token: "<|eom_id|>"
+ //
+ // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
+ // for now, we apply this workaround to find the EOM token based on its text
+ if (vocab.special_eom_id == -1) {
+ const auto & t = vocab.token_to_id.find("<|eom_id|>");
+ if (t != vocab.token_to_id.end()) {
+ vocab.special_eom_id = t->second;
+ }
+ }
}
// build special tokens cache
@@ -6062,6 +6126,8 @@ static bool llm_load_tensors(
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+
if (n_expert == 0) {
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -7423,6 +7489,42 @@ static bool llm_load_tensors(
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff});
}
} break;
+ case LLM_ARCH_T5ENCODER:
+ {
+ const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
+
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm_enc = ml.create_tensor(ctx_output, tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_rel_b_enc = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+ layer.wq_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wk_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});
+
+ layer.ffn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_gate_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.ffn_down_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
case LLM_ARCH_JAIS:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -8104,7 +8206,7 @@ static struct ggml_tensor * llm_build_moe_ffn(
cb(gate, "ffn_moe_gelu", il);
} break;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
@@ -8431,7 +8533,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_k_shift() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
GGML_ASSERT(kv_self.size == n_ctx);
@@ -8462,7 +8564,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_s_copy() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
GGML_ASSERT(kv_self.recurrent);
@@ -8485,7 +8587,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
for (uint32_t i = 0; i < ids.size(); ++i) {
const uint32_t id = ids[i];
@@ -8563,6 +8665,10 @@ struct llm_build_context {
// choose long/short freq factors based on the context size
const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
+ if (model.layers[il].rope_freqs != nullptr) {
+ return model.layers[il].rope_freqs;
+ }
+
if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
return model.layers[il].rope_long;
}
@@ -8667,8 +8773,8 @@ struct llm_build_context {
} break;
default:
{
- GGML_ASSERT(false && "unknown pooling type");
- } break;
+ GGML_ABORT("unknown pooling type");
+ }
}
cb(cur, "result_embd_pooled", -1);
@@ -8726,7 +8832,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_llama() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -8757,6 +8863,9 @@ struct llm_build_context {
// self-attention
{
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
+
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
@@ -8780,14 +8889,14 @@ struct llm_build_context {
}
Qcur = ggml_rope_ext(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_ext(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
@@ -8869,7 +8978,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_baichuan() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -8923,7 +9032,7 @@ struct llm_build_context {
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens);
break;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
@@ -8984,7 +9093,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_xverse() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9087,7 +9196,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_falcon() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -9207,7 +9316,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_grok() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -9364,7 +9473,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_dbrx() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -9490,7 +9599,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_starcoder() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -9594,7 +9703,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_refact() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9688,7 +9797,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_bert() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -9882,7 +9991,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_bloom() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -9983,7 +10092,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_mpt() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -10273,7 +10382,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_qwen() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -10385,7 +10494,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_qwen2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -10497,7 +10606,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_qwen2moe() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -10643,7 +10752,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_phi2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -10764,7 +10873,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_phi3() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -10996,7 +11105,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_gpt2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -11101,7 +11210,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_codeshell() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -11212,7 +11321,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_orion() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11330,7 +11439,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_internlm2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11451,7 +11560,7 @@ struct llm_build_context {
// https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
// based on the original build_llama() function
struct ggml_cgraph * build_minicpm() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11595,7 +11704,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_gemma() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -11703,7 +11812,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_gemma2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -11753,9 +11862,10 @@ struct llm_build_context {
// ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
switch (model.type) {
+ case e_model::MODEL_2B:
case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
- default: GGML_ASSERT(false);
+ default: GGML_ABORT("fatal error");
};
cb(Qcur, "Qcur_scaled", il);
@@ -11838,7 +11948,7 @@ struct llm_build_context {
struct ggml_cgraph * build_starcoder2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11957,7 +12067,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_mamba() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t d_model = n_embd;
const int64_t d_conv = hparams.ssm_d_conv;
@@ -12106,7 +12216,7 @@ struct llm_build_context {
struct ggml_cgraph * build_command_r() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -12260,7 +12370,7 @@ struct llm_build_context {
// * removed bias
// * removed MoE
struct ggml_cgraph * build_olmo() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -12384,7 +12494,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_openelm() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -12509,7 +12619,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_gptneox() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -12651,7 +12761,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_arctic() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -12783,7 +12893,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_deepseek2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -13011,7 +13121,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_bitnet() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -13233,8 +13343,8 @@ struct llm_build_context {
return gf;
}
- struct ggml_cgraph * build_t5() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * build_t5_encoder() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -13248,310 +13358,330 @@ struct llm_build_context {
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
- if (lctx.is_encoding) {
- struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false);
-
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false);
+ GGML_ASSERT(lctx.is_encoding);
+ struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false);
- for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * inpSA = inpL;
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false);
- // norm
- cur = llm_build_norm(ctx0, inpL, hparams,
- model.layers[il].attn_norm_enc, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "attn_norm", il);
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
- // self-attention
- {
- struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq_enc, cur);
- cb(Qcur, "Qcur", il);
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm_enc, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
- struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk_enc, cur);
- cb(Kcur, "Kcur", il);
+ // self-attention
+ {
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur);
+ cb(Qcur, "Qcur", il);
- struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv_enc, cur);
- cb(Vcur, "Vcur", il);
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur);
+ cb(Kcur, "Kcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur);
+ cb(Vcur, "Vcur", il);
- struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
- struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
- cb(kq, "kq", il);
+ struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+ struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
- struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
- struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b);
- struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias);
- cb(kq_b, "kq_b", il);
+ struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+ cb(kq, "kq", il);
- kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias);
- cb(kq, "kq_soft_max_ext", il);
+ struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
+ struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b);
+ struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias);
+ cb(kq_b, "kq_b", il);
- struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
- cb(v, "v", il);
+ kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias);
+ cb(kq, "kq_soft_max_ext", il);
- struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
- cb(kqv, "kqv", il);
+ struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
+ cb(v, "v", il);
- struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
- cb(kqv_merged, "kqv_merged", il);
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
+ cb(kqv, "kqv", il);
- cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
- cb(cur, "kqv_merged_cont", il);
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+ cb(kqv_merged, "kqv_merged", il);
- ggml_build_forward_expand(gf, cur);
+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+ cb(cur, "kqv_merged_cont", il);
- cur = ggml_mul_mat(ctx0, model.layers[il].wo_enc, cur);
- cb(cur, "kqv_out", il);
- }
+ ggml_build_forward_expand(gf, cur);
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
- n_tokens = n_outputs;
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur);
+ cb(cur, "kqv_out", il);
+ }
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
- // feed-forward network
- {
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
- model.layers[il].ffn_norm_enc, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "ffn_norm", il);
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
- // T5 uses relu, flan-T5 uses gelu-gated
- cur = llm_build_ffn(ctx0, lctx, cur,
- model.layers[il].ffn_up_enc, NULL, NULL,
- model.layers[il].ffn_gate_enc, NULL, NULL,
- model.layers[il].ffn_down_enc, NULL, NULL,
- NULL,
- model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
- model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
- cb, il);
- cb(cur, "ffn_out", il);
- }
+ // feed-forward network
+ {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm_enc, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
- cur = ggml_add(ctx0, cur, ffn_inp);
+ // T5 uses relu, flan-T5 uses gelu-gated
+ cur = llm_build_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_up_enc, NULL, NULL,
+ model.layers[il].ffn_gate_enc, NULL, NULL,
+ model.layers[il].ffn_down_enc, NULL, NULL,
+ NULL,
+ model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
+ model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
+ cb, il);
cb(cur, "ffn_out", il);
+ }
- ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
- if (layer_dir != nullptr) {
- cur = ggml_add(ctx0, cur, layer_dir);
- }
- cb(cur, "l_out", il);
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
- // input for next layer
- inpL = cur;
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+ if (layer_dir != nullptr) {
+ cur = ggml_add(ctx0, cur, layer_dir);
}
+ cb(cur, "l_out", il);
- cur = inpL;
- cb(cur, "result_embd", -1);
+ // input for next layer
+ inpL = cur;
+ }
- cur = llm_build_norm(ctx0, cur, hparams,
- model.output_norm_enc, NULL,
- LLM_NORM_RMS, cb, -1);
- cb(cur, "result_norm", -1);
- } else {
- GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
+ cur = inpL;
+ cb(cur, "result_embd", -1);
- struct ggml_tensor * embd_enc = llm_build_inp_embd_enc();
- struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm_enc, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
- struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask();
- struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross();
+ ggml_build_forward_expand(gf, cur);
- for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * inpSA = inpL;
+ return gf;
+ }
- // norm
- cur = llm_build_norm(ctx0, inpL, hparams,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "attn_norm", il);
+ struct ggml_cgraph * build_t5_decoder() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
- // self-attention
- {
- struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
- struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
- llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
- struct ggml_tensor * k =
- ggml_view_3d(ctx0, kv_self.k_l[il],
- n_embd_head_k, n_kv, n_head_kv,
- ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
- 0);
- cb(k, "k", il);
+ GGML_ASSERT(!lctx.is_encoding);
+ GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
- struct ggml_tensor * v =
- ggml_view_3d(ctx0, kv_self.v_l[il],
- n_kv, n_embd_head_v, n_head_kv,
- ggml_element_size(kv_self.v_l[il])*n_ctx,
- ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v,
- 0);
- cb(v, "v", il);
+ struct ggml_tensor * embd_enc = llm_build_inp_embd_enc();
+ struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask();
+ struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross();
- struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
- struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
- cb(kq, "kq", il);
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
- struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
- struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b);
- struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias);
- cb(kq_b, "kq_b", il);
+ // self-attention
+ {
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
- kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias);
- cb(kq, "kq_soft_max_ext", il);
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
- struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
- cb(kqv, "kqv", il);
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
- struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
- cb(kqv_merged, "kqv_merged", il);
+ llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
- cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
- cb(cur, "kqv_merged_cont", il);
+ struct ggml_tensor * k =
+ ggml_view_3d(ctx0, kv_self.k_l[il],
+ n_embd_head_k, n_kv, n_head_kv,
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
+ 0);
+ cb(k, "k", il);
- ggml_build_forward_expand(gf, cur);
+ struct ggml_tensor * v =
+ ggml_view_3d(ctx0, kv_self.v_l[il],
+ n_kv, n_embd_head_v, n_head_kv,
+ ggml_element_size(kv_self.v_l[il])*n_ctx,
+ ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v,
+ 0);
+ cb(v, "v", il);
- cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
- cb(cur, "kqv_out", il);
- }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- cur = ggml_add(ctx0, cur, inpSA);
- cb(cur, "cross_inp", il);
+ struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
- struct ggml_tensor * inpCA = cur;
+ struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+ cb(kq, "kq", il);
- // norm
- cur = llm_build_norm(ctx0, cur, hparams,
- model.layers[il].attn_norm_cross, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "attn_norm_cross", il);
+ struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
+ struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b);
+ struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias);
+ cb(kq_b, "kq_b", il);
- // cross-attention
- {
- struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq_cross, cur);
- cb(Qcur, "Qcur", il);
+ kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias);
+ cb(kq, "kq_soft_max_ext", il);
- struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk_cross, embd_enc);
- cb(Kcur, "Kcur", il);
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
+ cb(kqv, "kqv", il);
- struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv_cross, embd_enc);
- cb(Vcur, "Vcur", il);
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+ cb(kqv_merged, "kqv_merged", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+ cb(cur, "kqv_merged_cont", il);
- struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
- struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+ ggml_build_forward_expand(gf, cur);
- struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
- cb(kq, "kq", il);
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
+ cb(cur, "kqv_out", il);
+ }
- kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
- cb(kq, "kq_soft_max_ext", il);
+ cur = ggml_add(ctx0, cur, inpSA);
+ cb(cur, "cross_inp", il);
- struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
- cb(v, "v", il);
+ struct ggml_tensor * inpCA = cur;
- struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
- cb(kqv, "kqv", il);
+ // norm
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].attn_norm_cross, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm_cross", il);
- struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
- cb(kqv_merged, "kqv_merged", il);
+ // cross-attention
+ {
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur);
+ cb(Qcur, "Qcur", il);
- cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
- cb(cur, "kqv_merged_cont", il);
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc);
+ cb(Kcur, "Kcur", il);
- ggml_build_forward_expand(gf, cur);
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc);
+ cb(Vcur, "Vcur", il);
- cur = ggml_mul_mat(ctx0, model.layers[il].wo_cross, cur);
- cb(cur, "kqv_out", il);
- }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
- n_tokens = n_outputs;
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
- }
+ struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+ struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA);
- cb(ffn_inp, "ffn_inp", il);
+ struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+ cb(kq, "kq", il);
- // feed-forward network
- {
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "ffn_norm", il);
+ kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
+ cb(kq, "kq_soft_max_ext", il);
- // T5 uses relu, flan-T5 uses gelu-gated
- cur = llm_build_ffn(ctx0, lctx, cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
- model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
- cb, il);
- cb(cur, "ffn_out", il);
- }
+ struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
+ cb(v, "v", il);
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "ffn_out", il);
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
+ cb(kqv, "kqv", il);
- ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
- if (layer_dir != nullptr) {
- cur = ggml_add(ctx0, cur, layer_dir);
- }
- cb(cur, "l_out", il);
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+ cb(kqv_merged, "kqv_merged", il);
- // input for next layer
- inpL = cur;
+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+ cb(cur, "kqv_merged_cont", il);
+
+ ggml_build_forward_expand(gf, cur);
+
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur);
+ cb(cur, "kqv_out", il);
}
- cur = inpL;
- cb(cur, "result_embd", -1);
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
+ }
- cur = llm_build_norm(ctx0, cur, hparams,
- model.output_norm, NULL,
- LLM_NORM_RMS, cb, -1);
- cb(cur, "result_norm", -1);
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ // T5 uses relu, flan-T5 uses gelu-gated
+ cur = llm_build_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
+ model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
+ cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+ if (layer_dir != nullptr) {
+ cur = ggml_add(ctx0, cur, layer_dir);
+ }
+ cb(cur, "l_out", il);
- // lm_head
- cur = ggml_mul_mat(ctx0, model.output, cur);
- cb(cur, "result_output", -1);
+ // input for next layer
+ inpL = cur;
}
+ cur = inpL;
+ cb(cur, "result_embd", -1);
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
ggml_build_forward_expand(gf, cur);
return gf;
}
struct ggml_cgraph * build_jais() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -13643,7 +13773,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_chatglm() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -13996,14 +14126,22 @@ static struct ggml_cgraph * llama_build_graph(
} break;
case LLM_ARCH_T5:
{
- result = llm.build_t5();
+ if (lctx.is_encoding) {
+ result = llm.build_t5_encoder();
+ } else {
+ result = llm.build_t5_decoder();
+ }
+ } break;
+ case LLM_ARCH_T5ENCODER:
+ {
+ result = llm.build_t5_encoder();
} break;
case LLM_ARCH_JAIS:
{
result = llm.build_jais();
} break;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
// add on pooling layer
@@ -14715,12 +14853,15 @@ static int llama_decode_internal(
res = nullptr;
embd = nullptr;
} else if (cparams.embeddings) {
- res = nullptr; // do not extract logits for embedding case
- embd = gf->nodes[gf->n_nodes - 1];
- if (strcmp(embd->name, "result_embd_pooled") != 0) {
- embd = gf->nodes[gf->n_nodes - 2];
+ res = nullptr; // do not extract logits for embedding case
+ embd = nullptr;
+ for (int i = gf->n_nodes - 1; i >= 0; --i) {
+ if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
+ embd = gf->nodes[i];
+ break;
+ }
}
- GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
+ GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
} else {
embd = nullptr; // do not extract embeddings when not needed
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
@@ -14802,8 +14943,8 @@ static int llama_decode_internal(
} break;
case LLAMA_POOLING_TYPE_UNSPECIFIED:
{
- GGML_ASSERT(false && "unknown pooling type");
- } break;
+ GGML_ABORT("unknown pooling type");
+ }
}
}
n_outputs_prev += lctx.n_outputs;
@@ -14927,9 +15068,24 @@ static int llama_encode_internal(
ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
// the output embeddings after the final encoder normalization
- struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 1];
+ struct ggml_tensor * embd = nullptr;
- GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
+ // there are two cases here
+ if (llama_model_has_decoder(&lctx.model)) {
+ // first case is an encoder-decoder T5 model where embeddings are passed to decoder
+ embd = gf->nodes[gf->n_nodes - 1];
+ GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
+ } else {
+ // second case is an encoder-only T5 model
+ if (cparams.embeddings) {
+ // only output embeddings if required
+ embd = gf->nodes[gf->n_nodes - 1];
+ if (strcmp(embd->name, "result_embd_pooled") != 0) {
+ embd = gf->nodes[gf->n_nodes - 2];
+ }
+ GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
+ }
+ }
ggml_backend_sched_alloc_graph(lctx.sched, gf);
@@ -14942,20 +15098,54 @@ static int llama_encode_internal(
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
GGML_ASSERT(backend_embd != nullptr);
- // extract token embeddings
- GGML_ASSERT(lctx.embd != nullptr);
+ if (llama_model_has_decoder(&lctx.model)) {
+ lctx.embd_enc.resize(n_tokens*n_embd);
+ float * embd_out = lctx.embd_enc.data();
- lctx.embd_enc.resize(n_tokens*n_embd);
- float * embd_out = lctx.embd_enc.data();
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
- ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
+ // remember the sequence ids used during the encoding - needed for cross attention later
+ lctx.seq_ids_enc.resize(n_tokens);
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ for (int s = 0; s < batch.n_seq_id[i]; s++) {
+ llama_seq_id seq_id = batch.seq_id[i][s];
+ lctx.seq_ids_enc[i].insert(seq_id);
+ }
+ }
+ } else {
+ GGML_ASSERT(lctx.embd != nullptr);
- // remember the sequence ids used during the encoding - needed for cross attention later
- lctx.seq_ids_enc.resize(n_tokens);
- for (uint32_t i = 0; i < n_tokens; i++) {
- for (int s = 0; s < batch.n_seq_id[i]; s++) {
- llama_seq_id seq_id = batch.seq_id[i][s];
- lctx.seq_ids_enc[i].insert(seq_id);
+ switch (cparams.pooling_type) {
+ case LLAMA_POOLING_TYPE_NONE:
+ {
+ // extract token embeddings
+ GGML_ASSERT(lctx.embd != nullptr);
+ float * embd_out = lctx.embd;
+
+ GGML_ASSERT(n_tokens*n_embd <= (int64_t) lctx.embd_size);
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
+ } break;
+ case LLAMA_POOLING_TYPE_MEAN:
+ case LLAMA_POOLING_TYPE_CLS:
+ case LLAMA_POOLING_TYPE_LAST:
+ {
+ // extract sequence embeddings
+ auto & embd_seq_out = lctx.embd_seq;
+ embd_seq_out.clear();
+
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ const llama_seq_id seq_id = batch.seq_id[i][0];
+ if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+ continue;
+ }
+ embd_seq_out[seq_id].resize(n_embd);
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
+ }
+ } break;
+ case LLAMA_POOLING_TYPE_UNSPECIFIED:
+ {
+ GGML_ABORT("unknown pooling type");
+ }
}
}
}
@@ -14988,9 +15178,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
// each move requires 6*n_layer tensors (see build_defrag)
// - source view, destination view, copy operation
// - x2 for keys and values
- //const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+ //const uint32_t max_moves = llama_model_max_nodes(model)/(6*n_layer);
// TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
- const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);
+ const uint32_t max_moves = (llama_model_max_nodes(lctx.model) - 2*n_layer)/(6*n_layer);
// determine which KV cells to move where
//
@@ -15194,7 +15384,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
// apply K-shift if needed
if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA
- GGML_ASSERT(false && "Deepseek2 does not support K-shift");
+ GGML_ABORT("Deepseek2 does not support K-shift");
}
{
@@ -15333,7 +15523,7 @@ static void llama_tensor_dequantize_internal(
} else if (ggml_is_quantized(tensor->type)) {
qtype.to_float(tensor->data, f32_output, nelements);
} else {
- GGML_ASSERT(false); // unreachable
+ GGML_ABORT("fatal error"); // unreachable
}
return;
}
@@ -15391,7 +15581,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
if (n_expert > 1) {
- // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
+ // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
// for getting the current layer as I initially thought, and we need to resort to parsing the
// tensor name.
@@ -16706,6 +16896,8 @@ struct llama_context * llama_new_context_with_model(
ctx->sampling.rng = std::mt19937(params.seed);
ctx->logits_all = params.logits_all;
+ // build worst-case graph for encoder if a model contains encoder
+ ctx->is_encoding = llama_model_has_encoder(model);
uint32_t kv_size = cparams.n_ctx;
ggml_type type_k = params.type_k;
@@ -16921,8 +17113,10 @@ struct llama_context * llama_new_context_with_model(
}
}
+ const size_t max_nodes = llama_model_max_nodes(*model);
+
// buffer used to store the computation graph and the tensor meta data
- ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
+ ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel =
@@ -16935,7 +17129,7 @@ struct llama_context * llama_new_context_with_model(
// currently this is only implemented in the CUDA backend
pipeline_parallel = false;
#endif
- ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel);
+ ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);
if (pipeline_parallel) {
LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
@@ -17018,6 +17212,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_MAMBA:
case LLM_ARCH_JINA_BERT_V2:
case LLM_ARCH_T5:
+ case LLM_ARCH_T5ENCODER:
case LLM_ARCH_JAIS:
return LLAMA_ROPE_TYPE_NONE;
@@ -17060,8 +17255,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
// all model arches should be listed explicitly here
case LLM_ARCH_UNKNOWN:
- GGML_ASSERT(false && "unknown architecture");
- break;
+ GGML_ABORT("unknown architecture");
}
return LLAMA_ROPE_TYPE_NONE;
@@ -17166,8 +17360,16 @@ struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const ch
bool llama_model_has_encoder(const struct llama_model * model) {
switch (model->arch) {
- case LLM_ARCH_T5: return true;
- default: return false;
+ case LLM_ARCH_T5: return true;
+ case LLM_ARCH_T5ENCODER: return true;
+ default: return false;
+ }
+}
+
+bool llama_model_has_decoder(const struct llama_model * model) {
+ switch (model->arch) {
+ case LLM_ARCH_T5ENCODER: return false;
+ default: return true;
}
}
@@ -17443,18 +17645,18 @@ void llama_kv_cache_update(struct llama_context * ctx) {
}
// deprecated
-size_t llama_get_state_size(const struct llama_context * ctx) {
+size_t llama_get_state_size(struct llama_context * ctx) {
return llama_state_get_size(ctx);
}
// deprecated
size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
- return llama_state_get_data(ctx, dst);
+ return llama_state_get_data(ctx, dst, -1);
}
// deprecated
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
- return llama_state_set_data(ctx, src);
+ return llama_state_set_data(ctx, src, -1);
}
// deprecated
@@ -17467,302 +17669,282 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
return llama_state_save_file(ctx, path_session, tokens, n_token_count);
}
-// Returns the *maximum* size of the state
-size_t llama_state_get_size(const struct llama_context * ctx) {
- const auto & cparams = ctx->cparams;
- const auto & hparams = ctx->model.hparams;
-
- // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
- // for reference, std::mt19937(1337) serializes to 6701 bytes.
- const size_t s_rng_size = sizeof(size_t);
- const size_t s_rng = LLAMA_MAX_RNG_STATE;
- const size_t s_n_outputs = sizeof(size_t);
- // assume worst case for outputs although only currently set ones are serialized
- const size_t s_output_pos = ctx->cparams.n_batch * sizeof(int32_t);
- const size_t s_logits_size = sizeof(size_t);
- const size_t s_logits = ctx->logits_size ? cparams.n_batch * hparams.n_vocab * sizeof(float) : 0;
- const size_t s_embedding_size = sizeof(size_t);
- const size_t s_embedding = ctx->embd_size ? cparams.n_batch * hparams.n_embd * sizeof(float) : 0;
- const size_t s_kv_buf_size = sizeof(size_t);
- const size_t s_kv_head = sizeof(uint32_t);
- const size_t s_kv_size = sizeof(uint32_t);
- const size_t s_kv_used = sizeof(uint32_t);
- const size_t s_v_trans = sizeof(uint32_t);
- const size_t s_kv = ctx->kv_self.total_size();
- const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
- const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
-
- const size_t s_total = (
- + s_rng_size
- + s_rng
- + s_n_outputs
- + s_output_pos
- + s_logits_size
- + s_logits
- + s_embedding_size
- + s_embedding
- + s_kv_buf_size
- + s_kv_head
- + s_kv_size
- + s_kv_used
- + s_v_trans
- + s_kv
- + s_kv_cells
- );
-
- // on session change it is very likely that the state size has changed - so we need to update this function
- static_assert(LLAMA_SESSION_VERSION == 7, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
-
- return s_total;
-}
-
-// llama_context_data
-struct llama_data_context {
+// TODO: replace all non-fatal assertions with returned errors or exceptions
+struct llama_data_write {
virtual void write(const void * src, size_t size) = 0;
+ virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0;
virtual size_t get_size_written() = 0;
- virtual ~llama_data_context() = default;
-};
-
-struct llama_data_buffer_context : llama_data_context {
- uint8_t * ptr;
- size_t size_written = 0;
+ virtual ~llama_data_write() = default;
- llama_data_buffer_context(uint8_t * p) : ptr(p) {}
+ void write_string(const std::string & str) {
+ uint32_t str_size = str.size();
- void write(const void * src, size_t size) override {
- memcpy(ptr, src, size);
- ptr += size;
- size_written += size;
+ write(&str_size, sizeof(str_size));
+ write(str.data(), str_size);
}
- size_t get_size_written() override {
- return size_written;
+ void write_model_info(const struct llama_context * ctx) {
+ std::string arch_str = LLM_ARCH_NAMES.at(ctx->model.arch);
+ write_string(arch_str);
+ // TODO: add more model-specific info which should prevent loading the session file if not identical
}
-};
-struct llama_data_file_context : llama_data_context {
- llama_file * file;
- size_t size_written = 0;
+ void write_rng(const std::mt19937 & rng) {
+ std::ostringstream rng_ss;
+ rng_ss << rng;
- llama_data_file_context(llama_file * f) : file(f) {}
+ const std::string & rng_str = rng_ss.str();
- void write(const void * src, size_t size) override {
- file->write_raw(src, size);
- size_written += size;
+ write_string(rng_str);
}
- size_t get_size_written() override {
- return size_written;
- }
-};
+ void write_output_ids(const struct llama_context * ctx) {
+ const uint32_t n_outputs = ctx->n_outputs;
-/** copy state data into either a buffer or file depending on the passed in context
- *
- * file context:
- * llama_file file("/path", "wb");
- * llama_data_file_context data_ctx(&file);
- * llama_state_get_data(ctx, &data_ctx);
- *
- * buffer context:
- * std::vector<uint8_t> buf(max_size, 0);
- * llama_data_buffer_context data_ctx(&buf.data());
- * llama_state_get_data(ctx, &data_ctx);
- *
-*/
-static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
- llama_synchronize(ctx);
+ std::vector<int32_t> output_pos;
- // copy rng
- {
- std::ostringstream rng_ss;
- rng_ss << ctx->sampling.rng;
+ const size_t n_batch = ctx->cparams.n_batch;
+ const auto & output_ids = ctx->output_ids;
- const std::string & rng_str = rng_ss.str();
- const size_t rng_size = rng_str.size();
+ GGML_ASSERT(n_outputs <= ctx->output_size);
- GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
+ output_pos.resize(n_outputs);
- data_ctx->write(&rng_size, sizeof(rng_size));
- data_ctx->write(rng_str.data(), rng_size);
+ // build a more compact representation of the output ids
+ for (size_t i = 0; i < n_batch; ++i) {
+ // map an output id to a position in the batch
+ int32_t pos = output_ids[i];
+ if (pos >= 0) {
+ GGML_ASSERT((uint32_t) pos < n_outputs);
+ output_pos[pos] = i;
+ }
+ }
+
+ write(&n_outputs, sizeof(n_outputs));
+
+ if (n_outputs) {
+ write(output_pos.data(), n_outputs * sizeof(int32_t));
+ }
}
- // copy outputs
- {
- // Can't use ctx->n_outputs because it's not for the
- // entire last batch when n_ubatch is smaller than n_batch
- size_t n_outputs = 0;
+ void write_logits(const struct llama_context * ctx) {
+ const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab);
- // copy output ids
- {
- std::vector<int32_t> output_pos;
+ write(&logits_size, sizeof(logits_size));
+
+ if (logits_size) {
+ write(ctx->logits, logits_size * sizeof(float));
+ }
+ }
+
+ void write_embeddings(const struct llama_context * ctx) {
+ const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_embd);
+
+ write(&embeddings_size, sizeof(embeddings_size));
+
+ if (embeddings_size) {
+ write(ctx->embd, embeddings_size * sizeof(float));
+ }
+ }
+
+ void write_kv_cache_meta(const llama_kv_cache & kv_self, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) {
- const size_t n_batch = ctx->cparams.n_batch;
- const auto & output_ids = ctx->output_ids;
+ for (const auto & range : cell_ranges) {
+ for (uint32_t i = range.first; i < range.second; ++i) {
+ const auto & cell = kv_self.cells[i];
+ const llama_pos pos = cell.pos;
+ const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
- output_pos.resize(ctx->output_size);
+ write(&pos, sizeof(pos));
+ write(&n_seq_id, sizeof(n_seq_id));
- // build a more compact representation of the output ids
- for (size_t i = 0; i < n_batch; ++i) {
- // map an output id to a position in the batch
- int32_t pos = output_ids[i];
- if (pos >= 0) {
- if ((size_t) pos >= n_outputs) {
- n_outputs = pos + 1;
+ if (n_seq_id) {
+ for (auto seq_id : cell.seq_id) {
+ write(&seq_id, sizeof(seq_id));
}
- GGML_ASSERT((size_t) pos < ctx->output_size);
- output_pos[pos] = i;
}
}
+ }
+ }
- data_ctx->write(&n_outputs, sizeof(n_outputs));
+ void write_kv_cache_data(const struct llama_context * ctx, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) {
+ const struct llama_kv_cache & kv_self = ctx->kv_self;
+ const struct llama_hparams & hparams = ctx->model.hparams;
- if (n_outputs) {
- data_ctx->write(output_pos.data(), n_outputs * sizeof(int32_t));
- }
- }
+ const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
+ const uint32_t n_layer = hparams.n_layer;
- // copy logits
- {
- const size_t logits_size = std::min(ctx->logits_size, n_outputs * ctx->model.hparams.n_vocab);
+ write(&v_trans, sizeof(v_trans));
+ write(&n_layer, sizeof(n_layer));
- data_ctx->write(&logits_size, sizeof(logits_size));
+ std::vector<uint8_t> tmp_buf;
- if (logits_size) {
- data_ctx->write(ctx->logits, logits_size * sizeof(float));
- }
- }
+ // Iterate and write all the keys first, each row is a cell
+ // Get whole range at a time
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
- // copy embeddings
- {
- const size_t embeddings_size = std::min(ctx->embd_size, n_outputs * ctx->model.hparams.n_embd);
+ // Write key type
+ const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
+ write(&k_type_i, sizeof(k_type_i));
- data_ctx->write(&embeddings_size, sizeof(embeddings_size));
+ // Write row size of key
+ const uint64_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+ write(&k_size_row, sizeof(k_size_row));
- if (embeddings_size) {
- data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
+ // Read each range of cells of k_size length each into tmp_buf and write out
+ for (const auto & range : cell_ranges) {
+ const size_t range_size = range.second - range.first;
+ const size_t buf_size = range_size * k_size_row;
+ write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
}
}
- }
- // copy kv cache
- {
- const auto & kv_self = ctx->kv_self;
- const auto & hparams = ctx->model.hparams;
-
- const uint32_t n_layer = hparams.n_layer;
-
- // NOTE: kv_size and kv_buf_size are mostly used for sanity checks
- const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
- const uint32_t kv_size = kv_self.size;
- const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
- const uint32_t kv_used = kv_self.used;
- const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
-
- data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
- data_ctx->write(&kv_head, sizeof(kv_head));
- data_ctx->write(&kv_size, sizeof(kv_size));
- data_ctx->write(&kv_used, sizeof(kv_used));
- data_ctx->write(&v_trans, sizeof(v_trans));
-
- if (kv_buf_size) {
- const size_t pre_kv_buf_size = data_ctx->get_size_written();
-
- std::vector<uint8_t> tmp_buf;
- for (int il = 0; il < (int) n_layer; ++il) {
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+ if (!kv_self.v_trans) {
+ for (uint32_t il = 0; il < n_layer; ++il) {
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
- const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
-
- tmp_buf.resize(k_size);
- ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
- data_ctx->write(tmp_buf.data(), tmp_buf.size());
+ // Write value type
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+ write(&v_type_i, sizeof(v_type_i));
- if (kv_self.recurrent || !kv_self.v_trans) {
- // v is contiguous for recurrent models
- // TODO: use other tensors for state models than k and v
- const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
+ // Write row size of value
+ const uint64_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
+ write(&v_size_row, sizeof(v_size_row));
- tmp_buf.resize(v_size);
- ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), 0, tmp_buf.size());
- data_ctx->write(tmp_buf.data(), tmp_buf.size());
- continue;
+ // Read each range of cells of v_size length each into tmp_buf and write out
+ for (const auto & range : cell_ranges) {
+ const size_t range_size = range.second - range.first;
+ const size_t buf_size = range_size * v_size_row;
+ write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
}
+ }
+ } else {
+ // When v is transposed, we also need the element size and get the element ranges from each row
+ const uint32_t kv_size = kv_self.size;
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+
+ // Write value type
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+ write(&v_type_i, sizeof(v_type_i));
- // v is not contiguous, copy row by row
- const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
- const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
+ // Write element size
+ const uint32_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+ write(&v_size_el, sizeof(v_size_el));
- tmp_buf.resize(v_row_size);
- for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
- ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
- data_ctx->write(tmp_buf.data(), tmp_buf.size());
+ // Write GQA embedding size
+ write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
+
+ // For each row, we get the element values of each cell
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+ // Read each range of cells of v_size_el length each into tmp_buf and write out
+ for (const auto & range : cell_ranges) {
+ const size_t range_size = range.second - range.first;
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
+ const size_t buf_size = range_size * v_size_el;
+ write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
+ }
}
}
- GGML_ASSERT(kv_buf_size == data_ctx->get_size_written() - pre_kv_buf_size);
}
+ }
- for (uint32_t i = 0; i < kv_head; ++i) {
- const auto & cell = kv_self.cells[i];
-
- const llama_pos pos = cell.pos;
- const size_t seq_id_size = cell.seq_id.size();
-
- data_ctx->write(&pos, sizeof(pos));
- data_ctx->write(&seq_id_size, sizeof(seq_id_size));
+ void write_kv_cache(const struct llama_context * ctx, llama_seq_id seq_id = -1) {
+ const struct llama_kv_cache & kv_self = ctx->kv_self;
+ std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
+ uint32_t cell_count = 0;
- for (auto seq_id : cell.seq_id) {
- data_ctx->write(&seq_id, sizeof(seq_id));
+ // Count the number of cells with the specified seq_id
+ // Find all the ranges of cells with this seq id (or all, when -1)
+ uint32_t cell_range_begin = kv_self.size;
+ for (uint32_t i = 0; i < kv_self.size; ++i) {
+ const auto & cell = kv_self.cells[i];
+ if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
+ ++cell_count;
+ if (cell_range_begin == kv_self.size) {
+ cell_range_begin = i;
+ }
+ } else {
+ if (cell_range_begin != kv_self.size) {
+ cell_ranges.emplace_back(cell_range_begin, i);
+ cell_range_begin = kv_self.size;
+ }
}
}
- }
-}
+ if (cell_range_begin != kv_self.size) {
+ cell_ranges.emplace_back(cell_range_begin, kv_self.size);
+ }
-size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
- llama_data_buffer_context data_ctx(dst);
- llama_state_get_data_internal(ctx, &data_ctx);
+ // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
+ uint32_t cell_count_check = 0;
+ for (const auto & range : cell_ranges) {
+ cell_count_check += range.second - range.first;
+ }
+ GGML_ASSERT(cell_count == cell_count_check);
- return data_ctx.get_size_written();
-}
+ write(&cell_count, sizeof(cell_count));
-// Sets the state reading from the specified source address
-size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
- llama_synchronize(ctx);
+ write_kv_cache_meta(kv_self, cell_ranges, seq_id);
+ write_kv_cache_data(ctx, cell_ranges);
+ }
+};
- const uint8_t * inp = src;
+struct llama_data_read {
+ virtual const uint8_t * read(size_t size) = 0;
+ virtual void read_to(void * dst, size_t size) = 0;
+ virtual size_t get_size_read() = 0;
+ virtual ~llama_data_read() = default;
- // set rng
- {
- size_t rng_size;
- memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
+ void read_string(std::string & str) {
+ uint32_t str_size;
+ read_to(&str_size, sizeof(str_size));
+
+ str.assign((const char *) read(str_size), str_size);
+ }
- GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
+ // validate model information
+ void read_model_info(const struct llama_context * ctx) {
+ std::string cur_arch_str = LLM_ARCH_NAMES.at(ctx->model.arch);
+ std::string arch_str;
+ read_string(arch_str);
+ if (cur_arch_str != arch_str) {
+ throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str()));
+ }
+ // TODO: add more info which needs to be identical but which is not verified otherwise
+ }
- std::string rng_str((const char *)inp, rng_size); inp += rng_size;
+ void read_rng(std::mt19937 & rng) {
+ std::string rng_str;
+ read_string(rng_str);
std::istringstream rng_ss(rng_str);
- rng_ss >> ctx->sampling.rng;
+ rng_ss >> rng;
- GGML_ASSERT(!rng_ss.fail());
+ if (rng_ss.fail()) {
+ throw std::runtime_error("failed to load RNG state");
+ }
}
- // set output ids
- {
- size_t n_outputs;
+ void read_output_ids(struct llama_context * ctx) {
std::vector<int32_t> output_pos;
- memcpy(&n_outputs, inp, sizeof(n_outputs)); inp += sizeof(n_outputs);
+ uint32_t n_outputs;
+ read_to(&n_outputs, sizeof(n_outputs));
- GGML_ASSERT(n_outputs <= llama_output_reserve(*ctx, n_outputs));
+ if (n_outputs > llama_output_reserve(*ctx, n_outputs)) {
+ throw std::runtime_error("could not reserve outputs");
+ }
if (n_outputs) {
output_pos.resize(n_outputs);
- memcpy(output_pos.data(), inp, n_outputs * sizeof(int32_t));
- inp += n_outputs * sizeof(int32_t);
+ read_to(output_pos.data(), n_outputs * sizeof(int32_t));
for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
int32_t id = output_pos[i];
- GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
+ if ((uint32_t) id >= ctx->cparams.n_batch) {
+ throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->cparams.n_batch));
+ }
ctx->output_ids[id] = i;
}
@@ -17770,611 +17952,571 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
}
}
- // set logits
- {
- size_t logits_size;
+ void read_logits(struct llama_context * ctx) {
+ uint64_t logits_size;
+ read_to(&logits_size, sizeof(logits_size));
- memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
-
- GGML_ASSERT(ctx->logits_size >= logits_size);
+ if (ctx->logits_size < logits_size) {
+ throw std::runtime_error("logits buffer too small");
+ }
if (logits_size) {
- memcpy(ctx->logits, inp, logits_size * sizeof(float));
- inp += logits_size * sizeof(float);
+ read_to(ctx->logits, logits_size * sizeof(float));
}
}
- // set embeddings
- {
- size_t embeddings_size;
-
- memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
+ void read_embeddings(struct llama_context * ctx) {
+ uint64_t embeddings_size;
+ read_to(&embeddings_size, sizeof(embeddings_size));
- GGML_ASSERT(ctx->embd_size >= embeddings_size);
+ if (ctx->embd_size < embeddings_size) {
+ throw std::runtime_error("embeddings buffer too small");
+ }
if (embeddings_size) {
- memcpy(ctx->embd, inp, embeddings_size * sizeof(float));
- inp += embeddings_size * sizeof(float);
+ read_to(ctx->embd, embeddings_size * sizeof(float));
}
}
- // set kv cache
- {
- const auto & kv_self = ctx->kv_self;
- const auto & hparams = ctx->model.hparams;
+ bool read_kv_cache_meta(struct llama_context * ctx, uint32_t cell_count, llama_seq_id dest_seq_id = -1) {
+ struct llama_kv_cache & kv_self = ctx->kv_self;
- const uint32_t n_layer = hparams.n_layer;
+ if (dest_seq_id != -1) {
+ // single sequence
- size_t kv_buf_size;
- uint32_t kv_head;
- uint32_t kv_size;
- uint32_t kv_used;
- uint32_t v_trans;
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
- memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
- memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
- memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
- memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
- memcpy(&v_trans, inp, sizeof(v_trans)); inp += sizeof(v_trans);
+ llama_batch batch = llama_batch_init(cell_count, 0, 1);
+ batch.n_tokens = cell_count;
+ for (uint32_t i = 0; i < cell_count; ++i) {
+ llama_pos pos;
+ uint32_t n_seq_id;
- GGML_ASSERT(kv_self.v_trans == (bool) v_trans); // incompatible V transposition
+ read_to(&pos, sizeof(pos));
+ read_to(&n_seq_id, sizeof(n_seq_id));
- if (kv_self.size != kv_size) {
- // the KV cache needs to be big enough to load all the KV cells from the saved state
- GGML_ASSERT(kv_self.size >= kv_head);
+ if (n_seq_id != 0) {
+ LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
+ return false;
+ }
- LLAMA_LOG_INFO("%s: state contains %d KV cells, was saved with kv_size=%d, but is loaded with kv_size=%d (fine, but different)\n",
- __func__, kv_head, kv_size, kv_self.size);
- }
+ batch.pos[i] = pos;
+ batch.n_seq_id[i] = 1;
+ batch.seq_id[i][0] = dest_seq_id;
+ }
+ if (!llama_kv_cache_find_slot(kv_self, batch)) {
+ llama_batch_free(batch);
+ LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
+ return false;
+ }
- llama_kv_cache_clear(ctx);
+ // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
+ // Assume that this is one contiguous block of cells
+ GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
+ GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
+ GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
+ GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
+ GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
- if (kv_buf_size) {
- const size_t pre_kv_buf_size = inp - src;
+ // Cleanup
+ llama_batch_free(batch);
+ } else {
+ // whole KV cache restore
- GGML_ASSERT(kv_self.total_size() >= kv_buf_size);
+ if (cell_count > kv_self.size) {
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
+ return false;
+ }
- for (int il = 0; il < (int) n_layer; ++il) {
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+ llama_kv_cache_clear(kv_self);
- const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
+ for (uint32_t i = 0; i < cell_count; ++i) {
+ llama_kv_cell & cell = kv_self.cells[i];
- ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
- inp += k_size;
+ llama_pos pos;
+ uint32_t n_seq_id;
- if (kv_self.recurrent || !kv_self.v_trans) {
- // v is contiguous for recurrent models
- // TODO: use other tensors for state models than k and v
- const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
+ read_to(&pos, sizeof(pos));
+ read_to(&n_seq_id, sizeof(n_seq_id));
- ggml_backend_tensor_set(kv_self.v_l[il], inp, 0, v_size);
- inp += v_size;
- continue;
- }
+ cell.pos = pos;
+
+ for (uint32_t j = 0; j < n_seq_id; ++j) {
+ llama_seq_id seq_id;
+ read_to(&seq_id, sizeof(seq_id));
- // v is not contiguous, copy row by row
- const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
- const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_self.size);
+ if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
+ LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
+ return false;
+ }
- for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
- ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
- inp += v_row_size;
+ cell.seq_id.insert(seq_id);
}
}
- GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
+
+ kv_self.head = 0;
+ kv_self.used = cell_count;
}
- ctx->kv_self.head = kv_head;
- ctx->kv_self.used = kv_used;
+ return true;
+ }
- for (uint32_t i = 0; i < kv_head; ++i) {
- llama_pos pos;
- size_t seq_id_size;
+ bool read_kv_cache_data(struct llama_context * ctx, uint32_t cell_count) {
+ const struct llama_hparams & hparams = ctx->model.hparams;
+ struct llama_kv_cache & kv_self = ctx->kv_self;
+ uint32_t v_trans;
+ uint32_t n_layer;
+ read_to(&v_trans, sizeof(v_trans));
+ read_to(&n_layer, sizeof(n_layer));
- memcpy(&pos, inp, sizeof(pos)); inp += sizeof(pos);
- memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size);
+ if (n_layer != hparams.n_layer) {
+ LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
+ return false;
+ }
+ if (cell_count > kv_self.size) {
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv_self.size);
+ return false;
+ }
+ if (kv_self.v_trans != (bool) v_trans) {
+ LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
+ return false;
+ }
- ctx->kv_self.cells[i].pos = pos;
+ // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
- llama_seq_id seq_id;
+ // Read type of key
+ int32_t k_type_i_ref;
+ read_to(&k_type_i_ref, sizeof(k_type_i_ref));
+ const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
+ if (k_type_i != k_type_i_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
+ return false;
+ }
+
+ // Read row size of key
+ uint64_t k_size_row_ref;
+ read_to(&k_size_row_ref, sizeof(k_size_row_ref));
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+ if (k_size_row != k_size_row_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
+ return false;
+ }
- for (size_t j = 0; j < seq_id_size; ++j) {
- memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id);
- ctx->kv_self.cells[i].seq_id.insert(seq_id);
+ if (cell_count) {
+ // Read and set the keys for the whole cell range
+ ggml_backend_tensor_set(kv_self.k_l[il], read(cell_count * k_size_row), kv_self.head * k_size_row, cell_count * k_size_row);
}
}
- }
- const size_t nread = inp - src;
- const size_t max_size = llama_state_get_size(ctx);
+ if (!kv_self.v_trans) {
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
- GGML_ASSERT(nread <= max_size);
+ // Read type of value
+ int32_t v_type_i_ref;
+ read_to(&v_type_i_ref, sizeof(v_type_i_ref));
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+ if (v_type_i != v_type_i_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+ return false;
+ }
- return nread;
-}
+ // Read row size of value
+ uint64_t v_size_row_ref;
+ read_to(&v_size_row_ref, sizeof(v_size_row_ref));
+ const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
+ if (v_size_row != v_size_row_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
+ return false;
+ }
-static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
- llama_file file(path_session, "rb");
+ if (cell_count) {
+ // Read and set the values for the whole cell range
+ ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_row), kv_self.head * v_size_row, cell_count * v_size_row);
+ }
+ }
+ } else {
+ // For each layer, read the values for each cell (transposed)
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
- // sanity checks
- {
- const uint32_t magic = file.read_u32();
- const uint32_t version = file.read_u32();
+ // Read type of value
+ int32_t v_type_i_ref;
+ read_to(&v_type_i_ref, sizeof(v_type_i_ref));
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+ if (v_type_i != v_type_i_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+ return false;
+ }
- if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
- LLAMA_LOG_ERROR("%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
- return false;
- }
+ // Read element size of value
+ uint32_t v_size_el_ref;
+ read_to(&v_size_el_ref, sizeof(v_size_el_ref));
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+ if (v_size_el != v_size_el_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
+ return false;
+ }
- llama_hparams session_hparams;
- file.read_raw(&session_hparams, sizeof(llama_hparams));
+ // Read GQA embedding size
+ uint32_t n_embd_v_gqa_ref;
+ read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
+ if (n_embd_v_gqa != n_embd_v_gqa_ref) {
+ LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il);
+ return false;
+ }
- if (session_hparams != ctx->model.hparams) {
- LLAMA_LOG_INFO("%s : model hparams didn't match from session file!\n", __func__);
- return false;
+ if (cell_count) {
+ // For each row in the transposed matrix, read the values for the whole cell range
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+ const size_t dst_offset = (kv_self.head + j * kv_self.size) * v_size_el;
+ ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
+ }
+ }
+ }
}
+ return true;
}
- // load the prompt
- {
- const uint32_t n_token_count = file.read_u32();
+ void read_kv_cache(struct llama_context * ctx, llama_seq_id seq_id = -1) {
+ uint32_t cell_count;
+ read_to(&cell_count, sizeof(cell_count));
- if (n_token_count > n_token_capacity) {
- LLAMA_LOG_ERROR("%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
- return false;
- }
+ bool res = read_kv_cache_meta(ctx, cell_count, seq_id) && read_kv_cache_data(ctx, cell_count);
- file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
- *n_token_count_out = n_token_count;
+ if (!res) {
+ if (seq_id == -1) {
+ llama_kv_cache_clear(ctx);
+ } else {
+ llama_kv_cache_seq_rm(ctx, seq_id, -1, -1);
+ }
+ throw std::runtime_error("failed to restore kv cache");
+ }
}
+};
- // restore the context state
- {
- const size_t n_state_size_cur = file.size - file.tell();
- const size_t n_state_size_max = llama_state_get_size(ctx);
-
- if (n_state_size_cur > n_state_size_max) {
- LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
- return false;
- }
+struct llama_data_write_dummy : llama_data_write {
+ size_t size_written = 0;
- std::vector<uint8_t> state_data(n_state_size_max);
- file.read_raw(state_data.data(), n_state_size_cur);
+ llama_data_write_dummy() {}
- llama_state_set_data(ctx, state_data.data());
+ void write(const void * /* src */, size_t size) override {
+ size_written += size;
}
- return true;
-}
-
-bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
- try {
- return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
- } catch (const std::exception & err) {
- LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
- return false;
+ void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
+ size_written += size;
}
-}
-
-static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
- llama_file file(path_session, "wb");
- file.write_u32(LLAMA_SESSION_MAGIC);
- file.write_u32(LLAMA_SESSION_VERSION);
-
- file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
-
- // save the prompt
- file.write_u32((uint32_t) n_token_count);
- file.write_raw(tokens, sizeof(llama_token) * n_token_count);
-
- // save the context state using stream saving
- llama_data_file_context data_ctx(&file);
- llama_state_get_data_internal(ctx, &data_ctx);
-
- return true;
-}
-
-bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
- try {
- return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
- } catch (const std::exception & err) {
- LLAMA_LOG_ERROR("error saving session file: %s\n", err.what());
- return false;
+ size_t get_size_written() override {
+ return size_written;
}
-}
-
-size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id) {
- // save the size of size_t as a uint32_t for safety check
- const size_t size_t_size_size = sizeof(uint32_t);
+};
- // other values
- const size_t s_cell_count_size = sizeof(uint32_t);
- const size_t s_layer_count_size = sizeof(uint32_t);
- const size_t n_embd_v_gqa_size = sizeof(uint32_t);
+struct llama_data_write_buffer : llama_data_write {
+ uint8_t * ptr;
+ size_t buf_size = 0;
+ size_t size_written = 0;
- size_t s_cell_count = 0;
- size_t s_cell_data_size = 0;
- const auto & kv_self = ctx->kv_self;
- const auto & hparams = ctx->model.hparams;
+ llama_data_write_buffer(uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
- const uint32_t n_layer = hparams.n_layer;
+ void write(const void * src, size_t size) override {
+ if (size > buf_size) {
+ throw std::runtime_error("unexpectedly reached end of buffer");
+ }
+ memcpy(ptr, src, size);
+ ptr += size;
+ size_written += size;
+ buf_size -= size;
+ }
- for (uint32_t i = 0; i < kv_self.size; ++i) {
- const auto & cell = kv_self.cells[i];
- if (cell.seq_id.count(seq_id) > 0) {
- ++s_cell_count;
- s_cell_data_size += sizeof(llama_pos);
+ void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
+ if (size > buf_size) {
+ throw std::runtime_error("unexpectedly reached end of buffer");
}
+ ggml_backend_tensor_get(tensor, ptr, offset, size);
+ ptr += size;
+ size_written += size;
+ buf_size -= size;
}
- for (int il = 0; il < (int)n_layer; ++il) {
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+ size_t get_size_written() override {
+ return size_written;
+ }
+};
- // types of keys and values
- s_cell_data_size += sizeof(int32_t) * 2;
- // k_size_row and v_size_el values of layer
- s_cell_data_size += sizeof(size_t) * 2;
+struct llama_data_read_buffer : llama_data_read {
+ const uint8_t * ptr;
+ size_t buf_size = 0;
+ size_t size_read = 0;
- // keys
- const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
- s_cell_data_size += k_size_row * s_cell_count;
+ llama_data_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
- // values (transposed)
- const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
- s_cell_data_size += v_size_el * s_cell_count * n_embd_v_gqa;
+ const uint8_t * read(size_t size) override {
+ const uint8_t * base_ptr = ptr;
+ if (size > buf_size) {
+ throw std::runtime_error("unexpectedly reached end of buffer");
+ }
+ ptr += size;
+ size_read += size;
+ buf_size -= size;
+ return base_ptr;
}
- const size_t s_total = (
- size_t_size_size +
- s_cell_count_size +
- s_layer_count_size +
- n_embd_v_gqa_size +
- s_cell_data_size
- );
-
- return s_total;
-}
-
-static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
- llama_synchronize(ctx);
+ void read_to(void * dst, size_t size) override {
+ memcpy(dst, read(size), size);
+ }
- const auto & kv_self = ctx->kv_self;
- GGML_ASSERT(!kv_self.recurrent); // not implemented
+ size_t get_size_read() override {
+ return size_read;
+ }
+};
- // Save the size of size_t as a uint32_t for safety check
- const uint32_t size_t_size = sizeof(size_t);
- data_ctx.write(&size_t_size, sizeof(size_t_size));
+struct llama_data_write_file : llama_data_write {
+ llama_file * file;
+ size_t size_written = 0;
+ std::vector<uint8_t> temp_buffer;
- std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
- uint32_t cell_count = 0;
+ llama_data_write_file(llama_file * f) : file(f) {}
- // Count the number of cells with the specified seq_id
- // Find all the ranges of cells with this seq id
- {
- uint32_t cell_range_begin = kv_self.size;
- for (uint32_t i = 0; i < kv_self.size; ++i) {
- const auto & cell = kv_self.cells[i];
- if (cell.has_seq_id(seq_id)) {
- ++cell_count;
- if (cell_range_begin == kv_self.size) {
- cell_range_begin = i;
- }
- }
- else {
- if (cell_range_begin != kv_self.size) {
- cell_ranges.emplace_back(cell_range_begin, i);
- cell_range_begin = kv_self.size;
- }
- }
- }
- if (cell_range_begin != kv_self.size) {
- cell_ranges.emplace_back(cell_range_begin, kv_self.size);
- }
+ void write(const void * src, size_t size) override {
+ file->write_raw(src, size);
+ size_written += size;
+ }
- // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
- uint32_t cell_count_check = 0;
- for (const auto & range : cell_ranges) {
- cell_count_check += range.second - range.first;
- }
- GGML_ASSERT(cell_count == cell_count_check);
+ void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
+ temp_buffer.resize(size);
+ ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
+ write(temp_buffer.data(), temp_buffer.size());
}
- // Write the cell count
- data_ctx.write(&cell_count, sizeof(cell_count));
+ size_t get_size_written() override {
+ return size_written;
+ }
+};
- const auto & hparams = ctx->model.hparams;
- const uint32_t n_layer = hparams.n_layer;
+struct llama_data_read_file : llama_data_read {
+ llama_file * file;
+ size_t size_read = 0;
+ std::vector<uint8_t> temp_buffer;
- // Write the layer count
- data_ctx.write(&n_layer, sizeof(n_layer));
+ llama_data_read_file(llama_file * f) : file(f) {}
- // Write n_embd_v_gqa (reference value)
- {
- const uint32_t n_embd_v_gqa_ref = hparams.n_embd_v_gqa() + hparams.n_embd_k_s();
- data_ctx.write(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
+ void read_to(void * dst, size_t size) override {
+ file->read_raw(dst, size);
+ size_read += size;
}
- // Iterate the ranges and write all the pos (this is the token position in the prompt)
- for (const auto & range : cell_ranges) {
- for (uint32_t i = range.first; i < range.second; ++i) {
- const auto & cell = kv_self.cells[i];
- data_ctx.write(&cell.pos, sizeof(cell.pos));
- }
+ const uint8_t * read(size_t size) override {
+ temp_buffer.resize(size);
+ read_to(temp_buffer.data(), size);
+ return temp_buffer.data();
}
- // Iterate and write all the keys first, each row is a cell
- // Get whole range at a time
- std::vector<uint8_t> tmp_buf;
- for (int il = 0; il < (int)n_layer; ++il) {
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+ size_t get_size_read() override {
+ return size_read;
+ }
+};
- // Write key type
- const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
- data_ctx.write(&k_type_i, sizeof(k_type_i));
+/** copy state data into either a buffer or file depending on the passed in context
+ *
+ * file context:
+ * llama_file file("/path", "wb");
+ * llama_data_write_file data_ctx(&file);
+ * llama_state_get_data_internal(ctx, data_ctx);
+ *
+ * buffer context:
+ * std::vector<uint8_t> buf(max_size, 0);
+ * llama_data_write_buffer data_ctx(buf.data(), max_size);
+ * llama_state_get_data_internal(ctx, data_ctx);
+ *
+*/
+static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx) {
+ llama_synchronize(ctx);
- // Write row size of key
- const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
- data_ctx.write(&k_size_row, sizeof(k_size_row));
+ data_ctx.write_model_info(ctx);
- // Read each range of cells of k_size length each into tmp_buf and write out
- for (const auto & range : cell_ranges) {
- const size_t range_size = range.second - range.first;
- tmp_buf.resize(range_size * k_size_row);
- ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
- data_ctx.write(tmp_buf.data(), tmp_buf.size());
- }
- }
+ data_ctx.write_rng(ctx->sampling.rng);
- // TODO: simplify, reduce copy-paste
- if (!kv_self.v_trans) {
- for (int il = 0; il < (int)n_layer; ++il) {
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+ // copy outputs
+ data_ctx.write_output_ids(ctx);
+ data_ctx.write_logits(ctx);
+ data_ctx.write_embeddings(ctx);
- // Write value type
- const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
- data_ctx.write(&v_type_i, sizeof(v_type_i));
+ data_ctx.write_kv_cache(ctx);
- // Write row size of value
- const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
- data_ctx.write(&v_size_row, sizeof(v_size_row));
+ return data_ctx.get_size_written();
+}
- // Read each range of cells of v_size length each into tmp_buf and write out
- for (const auto & range : cell_ranges) {
- const size_t range_size = range.second - range.first;
- tmp_buf.resize(range_size * v_size_row);
- ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
- data_ctx.write(tmp_buf.data(), tmp_buf.size());
- }
- }
- } else {
- // For the values, they are transposed, so we also need the element size and get the element ranges from each row
- const uint32_t kv_size = kv_self.size;
- for (int il = 0; il < (int)n_layer; ++il) {
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
-
- // Write value type
- const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
- data_ctx.write(&v_type_i, sizeof(v_type_i));
-
- // Write element size
- const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
- data_ctx.write(&v_size_el, sizeof(v_size_el));
-
- // For each row, we get the element values of each cell
- for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
- // Read each range of cells of v_size_el length each into tmp_buf and write out
- for (const auto & range : cell_ranges) {
- const size_t range_size = range.second - range.first;
- const size_t src_offset = (range.first + j * kv_size) * v_size_el;
- tmp_buf.resize(range_size * v_size_el);
- ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
- data_ctx.write(tmp_buf.data(), tmp_buf.size());
- }
- }
- }
+size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst, size_t size) {
+ llama_data_write_buffer data_ctx(dst, size);
+ try {
+ return llama_state_get_data_internal(ctx, data_ctx);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
+ return 0;
}
-
- return data_ctx.get_size_written();
}
-size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_seq_id seq_id) {
- llama_data_buffer_context data_ctx(dst);
- return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
+// Returns the *actual* size of the state.
+// Intended to be used when saving to state to a buffer.
+size_t llama_state_get_size(struct llama_context * ctx) {
+ llama_data_write_dummy data_ctx;
+ try {
+ return llama_state_get_data_internal(ctx, data_ctx);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
+ return 0;
+ }
}
-size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
+static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx) {
llama_synchronize(ctx);
- auto & kv_self = ctx->kv_self;
- GGML_ASSERT(!kv_self.recurrent); // not implemented
+ data_ctx.read_model_info(ctx);
+
+ // set rng
+ data_ctx.read_rng(ctx->sampling.rng);
- // Wipe the slot
- llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+ // set outputs
+ data_ctx.read_output_ids(ctx);
+ data_ctx.read_logits(ctx);
+ data_ctx.read_embeddings(ctx);
- const uint8_t * inp = src;
+ data_ctx.read_kv_cache(ctx);
- // Read size of size_t
- uint32_t size_t_size;
- memcpy(&size_t_size, inp, sizeof(size_t_size));
- inp += sizeof(size_t_size);
- if (size_t_size != sizeof(size_t)) {
- LLAMA_LOG_ERROR("%s: size_t size mismatch\n", __func__);
+ return data_ctx.get_size_read();
+}
+
+// Sets the state reading from the specified source address
+size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src, size_t size) {
+ llama_data_read_buffer data_ctx(src, size);
+ try {
+ return llama_state_set_data_internal(ctx, data_ctx);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
return 0;
}
+}
- // Read the cell count
- uint32_t cell_count;
- memcpy(&cell_count, inp, sizeof(cell_count));
- inp += sizeof(cell_count);
+static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ llama_file file(path_session, "rb");
- // Read the layer count
- uint32_t n_layer_ref;
- memcpy(&n_layer_ref, inp, sizeof(n_layer_ref));
- inp += sizeof(n_layer_ref);
+ // sanity checks
+ {
+ const uint32_t magic = file.read_u32();
+ const uint32_t version = file.read_u32();
- // Read n_embd_v_gqa
- uint32_t n_embd_v_gqa_ref;
- memcpy(&n_embd_v_gqa_ref, inp, sizeof(n_embd_v_gqa_ref));
- inp += sizeof(n_embd_v_gqa_ref);
+ if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
+ LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+ return false;
+ }
+ }
- // Sanity check model compatibility
- const auto & hparams = ctx->model.hparams;
- const uint32_t n_layer = hparams.n_layer;
+ // load the prompt
+ {
+ const uint32_t n_token_count = file.read_u32();
- if (n_layer != n_layer_ref) {
- LLAMA_LOG_ERROR("%s: mismatched n_layer (%d != %d)\n", __func__, n_layer, n_layer_ref);
- return 0;
- }
+ if (n_token_count > n_token_capacity) {
+ LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+ return false;
+ }
- if (hparams.n_embd_v_gqa() != n_embd_v_gqa_ref) {
- LLAMA_LOG_ERROR("%s: mismatched n_embd_v_gqa (%d != %d)\n", __func__, hparams.n_embd_v_gqa(), n_embd_v_gqa_ref);
- return 0;
+ file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+ *n_token_count_out = n_token_count;
}
- // Allocate the new cells for the slot
- if (cell_count) {
- llama_batch batch = llama_batch_init(cell_count, 0, 1);
- batch.n_tokens = cell_count;
- for (uint32_t i = 0; i < cell_count; ++i) {
- llama_pos pos;
- memcpy(&pos, inp, sizeof(pos));
- inp += sizeof(pos);
+ // restore the context state
+ {
+ const size_t n_state_size_cur = file.size - file.tell();
+
+ llama_data_read_file data_ctx(&file);
+ const size_t n_read = llama_state_set_data_internal(ctx, data_ctx);
- batch.pos[i] = pos;
- batch.n_seq_id[i] = 1;
- batch.seq_id[i][0] = dest_seq_id;
- }
- if (!llama_kv_cache_find_slot(kv_self, batch)) {
- llama_batch_free(batch);
- LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
- return 0;
+ if (n_read != n_state_size_cur) {
+ LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read);
+ return false;
}
+ }
+ return true;
+}
- // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
- // Assume that this is one contiguous block of cells
- GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
- GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
- GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
- GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
- GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
-
- // Cleanup
- llama_batch_free(batch);
+bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ try {
+ return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: error loading session file: %s\n", __func__, err.what());
+ return false;
}
+}
- const uint32_t kv_size = kv_self.size;
- const uint32_t kv_head = kv_self.head;
-
- // For each layer, read the keys for each cell, one row is one cell, read as one contiguous blo
- for (int il = 0; il < (int)n_layer; ++il) {
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
-
- // Read type of key
- int32_t k_type_i_ref;
- memcpy(&k_type_i_ref, inp, sizeof(k_type_i_ref));
- inp += sizeof(k_type_i_ref);
- const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
- if (k_type_i != k_type_i_ref) {
- llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
- LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
- return 0;
- }
+static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+ llama_file file(path_session, "wb");
- // Read row size of key
- size_t k_size_row_ref;
- memcpy(&k_size_row_ref, inp, sizeof(k_size_row_ref));
- inp += sizeof(k_size_row_ref);
- const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
- if (k_size_row != k_size_row_ref) {
- llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
- LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, k_size_row_ref, il);
- return 0;
- }
+ file.write_u32(LLAMA_SESSION_MAGIC);
+ file.write_u32(LLAMA_SESSION_VERSION);
- if (cell_count) {
- // Read and set the keys for the whole cell range
- ggml_backend_tensor_set(kv_self.k_l[il], inp, kv_head * k_size_row, cell_count * k_size_row);
- inp += cell_count * k_size_row;
- }
+ // save the prompt
+ file.write_u32((uint32_t) n_token_count);
+ file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+ // save the context state using stream saving
+ llama_data_write_file data_ctx(&file);
+ llama_state_get_data_internal(ctx, data_ctx);
+
+ return true;
+}
+
+bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+ try {
+ return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: error saving session file: %s\n", __func__, err.what());
+ return false;
}
+}
- // TODO: simplify, reduce copy-paste
- if (!kv_self.v_trans) {
- for (int il = 0; il < (int)n_layer; ++il) {
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) {
+ llama_synchronize(ctx);
- // Read type of value
- int32_t v_type_i_ref;
- memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
- inp += sizeof(v_type_i_ref);
- const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
- if (v_type_i != v_type_i_ref) {
- llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
- LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
- return 0;
- }
+ data_ctx.write_kv_cache(ctx, seq_id);
- // Read row size of value
- size_t v_size_row_ref;
- memcpy(&v_size_row_ref, inp, sizeof(v_size_row_ref));
- inp += sizeof(v_size_row_ref);
- const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
- if (v_size_row != v_size_row_ref) {
- llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
- LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, v_size_row_ref, il);
- return 0;
- }
+ return data_ctx.get_size_written();
+}
- if (cell_count) {
- // Read and set the values for the whole cell range
- ggml_backend_tensor_set(kv_self.v_l[il], inp, kv_head * v_size_row, cell_count * v_size_row);
- inp += cell_count * v_size_row;
- }
- }
- } else {
- // For each layer, read the values for each cell (transposed)
- for (int il = 0; il < (int)n_layer; ++il) {
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
-
- // Read type of value
- int32_t v_type_i_ref;
- memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
- inp += sizeof(v_type_i_ref);
- const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
- if (v_type_i != v_type_i_ref) {
- llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
- LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
- return 0;
- }
-
- // Read element size of value
- size_t v_size_el_ref;
- memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
- inp += sizeof(v_size_el_ref);
- const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
- if (v_size_el != v_size_el_ref) {
- llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
- LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
- return 0;
- }
+size_t llama_state_seq_get_size(struct llama_context * ctx, llama_seq_id seq_id) {
+ llama_data_write_dummy data_ctx;
+ return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
+}
- if (cell_count) {
- // For each row in the transposed matrix, read the values for the whole cell range
- for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
- const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
- ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
- inp += cell_count * v_size_el;
- }
- }
- }
+size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) {
+ llama_data_write_buffer data_ctx(dst, size);
+ try {
+ return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: error saving sequence state: %s\n", __func__, err.what());
+ return 0;
}
+}
- const size_t nread = inp - src;
+static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) {
+ llama_synchronize(ctx);
+
+ data_ctx.read_kv_cache(ctx, dest_seq_id);
- return nread;
+ return data_ctx.get_size_read();
+}
+
+size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id dest_seq_id) {
+ llama_data_read_buffer data_ctx(src, size);
+ try {
+ return llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id);
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: error loading sequence state: %s\n", __func__, err.what());
+ return 0;
+ }
}
static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
@@ -18384,11 +18526,11 @@ static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, con
file.write_u32(LLAMA_STATE_SEQ_VERSION);
// save the prompt
- file.write_u32((uint32_t)n_token_count);
+ file.write_u32((uint32_t) n_token_count);
file.write_raw(tokens, sizeof(llama_token) * n_token_count);
// save the context state using stream saving
- llama_data_file_context data_ctx(&file);
+ llama_data_write_file data_ctx(&file);
llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
const size_t res = file.tell();
@@ -18426,9 +18568,8 @@ static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, con
// restore the context state
{
const size_t state_size = file.size - file.tell();
- std::vector<uint8_t> state_data(state_size);
- file.read_raw(state_data.data(), state_size);
- const size_t nread = llama_state_seq_set_data(ctx, state_data.data(), dest_seq_id);
+ llama_data_read_file data_ctx(&file);
+ const size_t nread = llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id);
if (!nread) {
LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
return 0;
@@ -18444,7 +18585,7 @@ size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepa
try {
return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
} catch (const std::exception & err) {
- LLAMA_LOG_ERROR("error saving sequence state file: %s\n", err.what());
+ LLAMA_LOG_ERROR("%s: error saving sequence state file: %s\n", __func__, err.what());
return 0;
}
}
@@ -18453,7 +18594,7 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
try {
return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
} catch (const std::exception & err) {
- LLAMA_LOG_ERROR("error loading sequence state file: %s\n", err.what());
+ LLAMA_LOG_ERROR("%s: error loading sequence state file: %s\n", __func__, err.what());
return 0;
}
}
@@ -18625,7 +18766,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
#endif
return nullptr;
}
@@ -18670,7 +18811,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
#endif
return nullptr;
}