summaryrefslogtreecommitdiff
path: root/llama.cpp
diff options
context:
space:
mode:
authorcompilade <113953597+compilade@users.noreply.github.com>2024-01-22 06:21:52 -0500
committerGitHub <noreply@github.com>2024-01-22 13:21:52 +0200
commitd6bd4d46ddb6926087c11e0f6633ab1c81da58c3 (patch)
treec75bf322368750ef332c1b981b34f9fd26e6f084 /llama.cpp
parent152d9d05e097e35f1cac21262946d57faec7542a (diff)
llama : support StableLM 2 1.6B (#5052)
* llama : support StableLM 2 1.6B * convert : fix Qwen's set_vocab wrongly naming all special tokens [PAD{id}] * convert : refactor Qwen's set_vocab to use it for StableLM 2 too * nix : add tiktoken to llama-python-extra * convert : use presence of tokenizer.json to determine StableLM tokenizer loader It's a less arbitrary heuristic than the vocab size.
Diffstat (limited to 'llama.cpp')
-rw-r--r--llama.cpp18
1 files changed, 18 insertions, 0 deletions
diff --git a/llama.cpp b/llama.cpp
index c56d3116..8c906a22 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2877,6 +2877,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
+ case 24: model.type = e_model::MODEL_1B; break;
case 32: model.type = e_model::MODEL_3B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
@@ -3700,6 +3701,11 @@ static bool llm_load_tensors(
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ // optional bias tensors, present in Stable LM 2 1.6B
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
@@ -5598,12 +5604,24 @@ struct llm_build_context {
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
Qcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,