author     Georgi Gerganov <ggerganov@gmail.com>      2024-05-20 19:35:28 +0300
committer  GitHub <noreply@github.com>                2024-05-21 02:35:28 +1000
commit     fabf30b4c4fca32e116009527180c252919ca922 (patch)
tree       50b57bc259b9efa9d6a354ac420b70c608bca4ab
parent     20385cebcc4bb3f6dd10f989573c11864d70d53d (diff)
llama : remove Persimmon (#7408)

* llama : remove Persimmon

* requirements : remove
-rw-r--r--  README.md                                                |   1
-rwxr-xr-x  convert-hf-to-gguf.py                                    |  39
-rwxr-xr-x  convert-persimmon-to-gguf.py                             | 143
-rw-r--r--  gguf-py/gguf/constants.py                                |  19
-rw-r--r--  llama.cpp                                                | 280
-rw-r--r--  requirements.txt                                         |   1
-rw-r--r--  requirements/requirements-convert-persimmon-to-gguf.txt  |   2
7 files changed, 0 insertions(+), 485 deletions(-)
diff --git a/README.md b/README.md
index 47d41ebf..f4088c05 100644
--- a/README.md
+++ b/README.md
@@ -107,7 +107,6 @@ Typically finetunes of the base models below are supported as well.
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
-- [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index bd303150..d534b516 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1148,45 +1148,6 @@ class RefactModel(Model):
return tensors
-@Model.register("PersimmonForCausalLM")
-class PersimmonModel(Model):
- model_arch = gguf.MODEL_ARCH.PERSIMMON
-
- def set_gguf_parameters(self):
- block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
- head_count = self.hparams["num_attention_heads"]
- head_count_kv = head_count
- hidden_size = self.hparams["hidden_size"]
-
- self.gguf_writer.add_name('persimmon-8b-chat')
- self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
- self.gguf_writer.add_embedding_length(hidden_size)
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-
- # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
- # than the head size?
- # ref: https://github.com/ggerganov/llama.cpp/pull/4889
- # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
- self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
-
- self.gguf_writer.add_head_count(head_count)
- self.gguf_writer.add_head_count_kv(head_count_kv)
- self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
- self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
-
- def set_vocab(self):
- self._set_vocab_sentencepiece()
- # self.gguf_writer.add_bos_token_id(71013)
- # self.gguf_writer.add_eos_token_id(71013)
-
- def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
- del name, new_name, bid, n_dims # unused
-
- # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
- return True
-
-
@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
class StableLMModel(Model):
model_arch = gguf.MODEL_ARCH.STABLELM
diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py
deleted file mode 100755
index 07dcade7..00000000
--- a/convert-persimmon-to-gguf.py
+++ /dev/null
@@ -1,143 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import logging
-import argparse
-import os
-import sys
-from pathlib import Path
-from pprint import pprint
-
-import torch
-from sentencepiece import SentencePieceProcessor
-
-if 'NO_LOCAL_GGUF' not in os.environ:
- sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
-import gguf
-
-logger = logging.getLogger("persimmon-to-gguf")
-
-
-def _flatten_dict(dct, tensors, prefix=None):
- assert isinstance(dct, dict)
- for key in dct.keys():
- new_prefix = prefix + '.' + key if prefix is not None else key
- if isinstance(dct[key], torch.Tensor):
- tensors[new_prefix] = dct[key]
- elif isinstance(dct[key], dict):
- _flatten_dict(dct[key], tensors, new_prefix)
- else:
- raise ValueError(type(dct[key]))
- return None
-
-
-def _get_sentencepiece_tokenizer_info(dir_model: Path):
- tokenizer_path = dir_model / 'adept_vocab.model'
- logger.info('getting sentencepiece tokenizer from', tokenizer_path)
- tokenizer = SentencePieceProcessor(str(tokenizer_path))
- logger.info('adding tokens')
- tokens: list[bytes] = []
- scores: list[float] = []
- toktypes: list[int] = []
-
- for i in range(tokenizer.vocab_size()):
- text: bytes
- score: float
-
- piece = tokenizer.id_to_piece(i)
- text = piece.encode("utf-8")
- score = tokenizer.get_score(i)
-
- toktype = 1
- if tokenizer.is_unknown(i):
- toktype = 2
- if tokenizer.is_control(i):
- toktype = 3
- if tokenizer.is_unused(i):
- toktype = 5
- if tokenizer.is_byte(i):
- toktype = 6
-
- tokens.append(text)
- scores.append(score)
- toktypes.append(toktype)
- pass
- return tokens, scores, toktypes
-
-
-def main():
- parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
- parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
- parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")
- parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")
- parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
- parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
- args = parser.parse_args()
- logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
- sys.path.append(str(args.adept_inference_dir))
- persimmon_model = torch.load(args.ckpt_path)
- hparams = persimmon_model['args']
- pprint(hparams)
- tensors: dict[str, torch.Tensor] = {}
- _flatten_dict(persimmon_model['model'], tensors, None)
-
- arch = gguf.MODEL_ARCH.PERSIMMON
- gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])
-
- block_count = hparams.num_layers
- head_count = hparams.num_attention_heads
- head_count_kv = head_count
- ctx_length = hparams.seq_length
- hidden_size = hparams.hidden_size
-
- gguf_writer.add_name('persimmon-8b-chat')
- gguf_writer.add_context_length(ctx_length)
- gguf_writer.add_embedding_length(hidden_size)
- gguf_writer.add_block_count(block_count)
- gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
- # ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
- gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
- gguf_writer.add_head_count(head_count)
- gguf_writer.add_head_count_kv(head_count_kv)
- gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
- gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)
-
- tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
- gguf_writer.add_tokenizer_model('llama')
- gguf_writer.add_tokenizer_pre('default')
- gguf_writer.add_token_list(tokens)
- gguf_writer.add_token_scores(scores)
- gguf_writer.add_token_types(toktypes)
- gguf_writer.add_bos_token_id(71013)
- gguf_writer.add_eos_token_id(71013)
-
- tensor_map = gguf.get_tensor_name_map(arch, block_count)
- logger.info(tensor_map)
- for name in tensors.keys():
- data_torch = tensors[name]
- if name.endswith(".self_attention.rotary_emb.inv_freq"):
- continue
- old_dtype = data_torch.dtype
- # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
- data = data_torch.to(torch.float32).squeeze().numpy()
- new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
- if new_name is None:
- raise ValueError(f"Can not map tensor '{name}'")
-
- n_dims = len(data.shape)
- logger.debug(f"{new_name}, n_dims = {str(n_dims)}, {str(old_dtype)} --> {str(data.dtype)}")
- gguf_writer.add_tensor(new_name, data)
- logger.info("gguf: write header")
- gguf_writer.write_header_to_file()
- logger.info("gguf: write metadata")
- gguf_writer.write_kv_data_to_file()
- logger.info("gguf: write tensors")
- gguf_writer.write_tensors_to_file()
-
- gguf_writer.close()
-
- logger.info(f"gguf: model successfully exported to '{args.outfile}'")
-
-
-if __name__ == '__main__':
- main()
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 978fcada..692120f4 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -115,7 +115,6 @@ class MODEL_ARCH(IntEnum):
GPTNEOX = auto()
MPT = auto()
STARCODER = auto()
- PERSIMMON = auto()
REFACT = auto()
BERT = auto()
NOMIC_BERT = auto()
@@ -193,7 +192,6 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.GPTNEOX: "gptneox",
MODEL_ARCH.MPT: "mpt",
MODEL_ARCH.STARCODER: "starcoder",
- MODEL_ARCH.PERSIMMON: "persimmon",
MODEL_ARCH.REFACT: "refact",
MODEL_ARCH.BERT: "bert",
MODEL_ARCH.NOMIC_BERT: "nomic-bert",
@@ -426,20 +424,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
- MODEL_ARCH.PERSIMMON: [
- MODEL_TENSOR.TOKEN_EMBD,
- MODEL_TENSOR.OUTPUT,
- MODEL_TENSOR.OUTPUT_NORM,
- MODEL_TENSOR.ATTN_NORM,
- MODEL_TENSOR.ATTN_QKV,
- MODEL_TENSOR.ATTN_OUT,
- MODEL_TENSOR.FFN_NORM,
- MODEL_TENSOR.FFN_DOWN,
- MODEL_TENSOR.FFN_UP,
- MODEL_TENSOR.ATTN_Q_NORM,
- MODEL_TENSOR.ATTN_K_NORM,
- MODEL_TENSOR.ATTN_ROT_EMBD,
- ],
MODEL_ARCH.REFACT: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
@@ -756,9 +740,6 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
- MODEL_ARCH.PERSIMMON: [
- MODEL_TENSOR.ROPE_FREQS,
- ],
MODEL_ARCH.QWEN: [
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
diff --git a/llama.cpp b/llama.cpp
index 2025e455..863961f1 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -202,7 +202,6 @@ enum llm_arch {
LLM_ARCH_GPTNEOX,
LLM_ARCH_MPT,
LLM_ARCH_STARCODER,
- LLM_ARCH_PERSIMMON,
LLM_ARCH_REFACT,
LLM_ARCH_BERT,
LLM_ARCH_NOMIC_BERT,
@@ -239,7 +238,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_MPT, "mpt" },
{ LLM_ARCH_BAICHUAN, "baichuan" },
{ LLM_ARCH_STARCODER, "starcoder" },
- { LLM_ARCH_PERSIMMON, "persimmon" },
{ LLM_ARCH_REFACT, "refact" },
{ LLM_ARCH_BERT, "bert" },
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
@@ -596,23 +594,6 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
},
},
{
- LLM_ARCH_PERSIMMON,
- {
- { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
- { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
- { LLM_TENSOR_OUTPUT, "output"},
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
- { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
- { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
- { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
- { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
- },
- },
- {
LLM_ARCH_MPT,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
@@ -3967,14 +3948,6 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
- case LLM_ARCH_PERSIMMON:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- switch (hparams.n_layer) {
- case 36: model.type = e_model::MODEL_8B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
- }
- } break;
case LLM_ARCH_REFACT:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -5221,47 +5194,6 @@ static bool llm_load_tensors(
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
}
} break;
- case LLM_ARCH_PERSIMMON:
- {
- model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
- {
- model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
- }
-
- for (int i = 0; i < n_layer; ++i) {
- ggml_context * ctx_layer = ctx_for_layer(i);
- ggml_context * ctx_split = ctx_for_layer_split(i);
-
- auto & layer = model.layers[i];
-
- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
-
- layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
- layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
-
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
-
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
-
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
-
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
-
- layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
- layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
-
- layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
- layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
- }
- } break;
case LLM_ARCH_BERT:
case LLM_ARCH_NOMIC_BERT:
{
@@ -7923,213 +7855,6 @@ struct llm_build_context {
return gf;
}
- struct ggml_cgraph * build_persimmon() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
-
- struct ggml_tensor * cur;
- struct ggml_tensor * inpL;
-
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
-
- // inp_pos - contains the positions
- struct ggml_tensor * inp_pos = build_inp_pos();
-
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
-
- for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * residual = inpL;
-
- cur = llm_build_norm(ctx0, inpL, hparams,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, cb, il);
- cb(cur, "attn_norm", il);
-
- // self attention
- {
- cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
-
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
-
- // split qkv
- GGML_ASSERT(n_head_kv == n_head);
-
- struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
- cb(tmpqkv, "tmpqkv", il);
-
- struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
- cb(tmpqkv_perm, "tmpqkv", il);
-
- struct ggml_tensor * tmpq = ggml_view_3d(
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
- ggml_element_size(tmpqkv_perm) * n_embd_head,
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
- 0
- );
- cb(tmpq, "tmpq", il);
-
- struct ggml_tensor * tmpk = ggml_view_3d(
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
- ggml_element_size(tmpqkv_perm) * n_embd_head,
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
- );
- cb(tmpk, "tmpk", il);
-
- // Q/K Layernorm
- tmpq = llm_build_norm(ctx0, tmpq, hparams,
- model.layers[il].attn_q_norm,
- model.layers[il].attn_q_norm_b,
- LLM_NORM, cb, il);
- cb(tmpq, "tmpq", il);
-
- tmpk = llm_build_norm(ctx0, tmpk, hparams,
- model.layers[il].attn_k_norm,
- model.layers[il].attn_k_norm_b,
- LLM_NORM, cb, il);
- cb(tmpk, "tmpk", il);
-
- // RoPE the first n_rot of q/k, pass the other half, and concat.
- struct ggml_tensor * qrot = ggml_view_3d(
- ctx0, tmpq, n_rot, n_head, n_tokens,
- ggml_element_size(tmpq) * n_embd_head,
- ggml_element_size(tmpq) * n_embd_head * n_head,
- 0
- );
- cb(qrot, "qrot", il);
-
- struct ggml_tensor * krot = ggml_view_3d(
- ctx0, tmpk, n_rot, n_head, n_tokens,
- ggml_element_size(tmpk) * n_embd_head,
- ggml_element_size(tmpk) * n_embd_head * n_head,
- 0
- );
- cb(krot, "krot", il);
-
- // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
- struct ggml_tensor * qpass = ggml_view_3d(
- ctx0, tmpq, n_rot, n_head, n_tokens,
- ggml_element_size(tmpq) * n_embd_head,
- ggml_element_size(tmpq) * n_embd_head * n_head,
- ggml_element_size(tmpq) * n_rot
- );
- cb(qpass, "qpass", il);
-
- struct ggml_tensor * kpass = ggml_view_3d(
- ctx0, tmpk, n_rot, n_head, n_tokens,
- ggml_element_size(tmpk) * n_embd_head,
- ggml_element_size(tmpk) * n_embd_head * n_head,
- ggml_element_size(tmpk) * n_rot
- );
- cb(kpass, "kpass", il);
-
- struct ggml_tensor * qrotated = ggml_rope_custom(
- ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(qrotated, "qrotated", il);
-
- struct ggml_tensor * krotated = ggml_rope_custom(
- ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(krotated, "krotated", il);
-
- // ggml currently only supports concatenation on dim=2
- // so we need to permute qrot, qpass, concat, then permute back.
- qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
- cb(qrotated, "qrotated", il);
-
- krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
- cb(krotated, "krotated", il);
-
- qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
- cb(qpass, "qpass", il);
-
- kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
- cb(kpass, "kpass", il);
-
- struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
- cb(Qcur, "Qcur", il);
-
- struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
- cb(Kcur, "Kcur", il);
-
- struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
- cb(Q, "Q", il);
-
- Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
- cb(Kcur, "Kcur", il);
-
- struct ggml_tensor * Vcur = ggml_view_3d(
- ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
- ggml_element_size(tmpqkv_perm) * n_embd_head,
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
- ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
- );
- cb(Vcur, "Vcur", il);
-
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
- model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
- }
-
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- residual = ggml_get_rows(ctx0, residual, inp_out_ids);
- }
-
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
- cb(ffn_inp, "ffn_inp", il);
-
- // feed-forward network
- {
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, cb, il);
- cb(cur, "ffn_norm", il);
-
- cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
- NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
- NULL,
- LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
- cb(cur, "ffn_out", il);
- }
-
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "l_out", il);
-
- inpL = cur;
- }
-
- cur = inpL;
-
- cur = llm_build_norm(ctx0, cur, hparams,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, cb, -1);
- cb(cur, "result_norm", -1);
-
- cur = ggml_mul_mat(ctx0, model.output, cur);
- cb(cur, "result_output", -1);
-
- ggml_build_forward_expand(gf, cur);
-
- return gf;
- }
-
struct ggml_cgraph * build_refact() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@@ -10898,10 +10623,6 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_starcoder();
} break;
- case LLM_ARCH_PERSIMMON:
- {
- result = llm.build_persimmon();
- } break;
case LLM_ARCH_REFACT:
{
result = llm.build_refact();
@@ -15992,7 +15713,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_FALCON:
case LLM_ARCH_GROK:
case LLM_ARCH_DBRX:
- case LLM_ARCH_PERSIMMON:
case LLM_ARCH_BERT:
case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_STABLELM:
diff --git a/requirements.txt b/requirements.txt
index e7d14e16..43f82dc2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,3 @@
-r ./requirements/requirements-convert-hf-to-gguf.txt
-r ./requirements/requirements-convert-hf-to-gguf-update.txt
-r ./requirements/requirements-convert-llama-ggml-to-gguf.txt
--r ./requirements/requirements-convert-persimmon-to-gguf.txt
diff --git a/requirements/requirements-convert-persimmon-to-gguf.txt b/requirements/requirements-convert-persimmon-to-gguf.txt
deleted file mode 100644
index 6ac40261..00000000
--- a/requirements/requirements-convert-persimmon-to-gguf.txt
+++ /dev/null
@@ -1,2 +0,0 @@
--r ./requirements-convert.txt
-torch~=2.1.1