diff options
author | Giuseppe Scrivano <giuseppe@scrivano.org> | 2024-05-28 20:49:49 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-28 21:49:49 +0300 |
commit | 5442939fcc5e6ae41abf40612a95fd71377e487e (patch) | |
tree | 1402af1bf61b8a110252b748b9d453e09946d5cf /llama.cpp | |
parent | 56411a950f255b523a9edd684fd1632752474399 (diff) |
llama : support small Granite models (#7481)
* Add optional MLP bias for Granite models
Add optional MLP bias for ARCH_LLAMA to support Granite models.
Partially addresses ggerganov/llama.cpp/issues/7116
Still needs some more changes to properly support Granite.
* llama: honor add_space_prefix from the model configuration
propagate the add_space_prefix configuration from the HF model
configuration to the gguf file and honor it with the gpt2 tokenizer.
Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
* llama: add support for small granite models
it works only for the small models 3b and 8b.
The convert-hf-to-gguf.py script uses the vocabulary size of the
granite models to detect granite and set the correct configuration.
Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
---------
Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
Co-authored-by: Steffen Roecker <sroecker@redhat.com>
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 27 |
1 files changed, 21 insertions, 6 deletions
@@ -2028,8 +2028,9 @@ struct llama_layer { struct ggml_tensor * ffn_up_shexp; // ff bias - struct ggml_tensor * ffn_down_b; // b2 - struct ggml_tensor * ffn_up_b; // b3 + struct ggml_tensor * ffn_gate_b = nullptr; + struct ggml_tensor * ffn_down_b = nullptr; // b2 + struct ggml_tensor * ffn_up_b = nullptr; // b3 struct ggml_tensor * ffn_act; // mamba proj @@ -4058,7 +4059,9 @@ static void llm_load_hparams( switch (hparams.n_layer) { case 22: model.type = e_model::MODEL_1B; break; case 26: model.type = e_model::MODEL_3B; break; - case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break; + // granite uses a vocab with len 49152 + case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break; + case 36: model.type = e_model::MODEL_8B; break; // granite case 40: model.type = e_model::MODEL_13B; break; case 48: model.type = e_model::MODEL_34B; break; case 60: model.type = e_model::MODEL_30B; break; @@ -4328,6 +4331,8 @@ static void llm_load_hparams( case 30: model.type = e_model::MODEL_3B; break; case 32: model.type = e_model::MODEL_7B; break; case 40: model.type = e_model::MODEL_15B; break; + case 52: model.type = e_model::MODEL_20B; break; // granite + case 88: model.type = e_model::MODEL_34B; break; // granite default: model.type = e_model::MODEL_UNKNOWN; } } break; @@ -4590,6 +4595,11 @@ static void llm_load_vocab( } else { if (tokenizer_model == "gpt2") { vocab.type = LLAMA_VOCAB_TYPE_BPE; + + const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str()); + if (add_space_prefix_keyidx != -1) { + vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx); + } } else { LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str()); LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__); @@ -5211,6 +5221,11 @@ static bool llm_load_tensors( layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + + // optional MLP bias + layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_up_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); } else { layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); @@ -7483,9 +7498,9 @@ struct llm_build_context { cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - model.layers[il].ffn_gate, NULL, - model.layers[il].ffn_down, NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); |