diff options
Diffstat (limited to 'convert-hf-to-gguf.py')
-rwxr-xr-x | convert-hf-to-gguf.py | 27 |
1 file changed, 9 insertions, 18 deletions
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index c5d2d0b7..918a90e5 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -23,7 +23,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 
-from convert import HfVocab
+from convert import LlamaHfVocab
 
 
 ###### MODEL DEFINITIONS ######
@@ -230,7 +230,7 @@ class Model(ABC):
     def _set_vocab_gpt2(self):
         dir_model = self.dir_model
         hparams = self.hparams
-        tokens: list[bytearray] = []
+        tokens: list[str] = []
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
@@ -243,8 +243,7 @@ class Model(ABC):
 
         for i in range(vocab_size):
             if i not in reverse_vocab:
-                pad_token = f"[PAD{i}]".encode('utf-8')
-                tokens.append(bytearray(pad_token))
+                tokens.append(f"[PAD{i}]")
                 toktypes.append(gguf.TokenType.USER_DEFINED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
@@ -266,7 +265,7 @@ class Model(ABC):
     def _set_vocab_qwen(self):
         dir_model = self.dir_model
         hparams = self.hparams
-        tokens: list[bytearray] = []
+        tokens: list[str] = []
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
@@ -291,8 +290,7 @@ class Model(ABC):
 
         for i in range(vocab_size):
             if i not in reverse_vocab:
-                pad_token = f"[PAD{i}]".encode("utf-8")
-                tokens.append(bytearray(pad_token))
+                tokens.append(f"[PAD{i}]")
                 toktypes.append(gguf.TokenType.USER_DEFINED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
@@ -372,12 +370,8 @@ class Model(ABC):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-    def _set_vocab_hf(self):
-        path = self.dir_model
-        added_tokens_path = self.dir_model
-        vocab = HfVocab(
-            path, added_tokens_path if added_tokens_path.exists() else None
-        )
+    def _set_vocab_llama_hf(self):
+        vocab = LlamaHfVocab(self.dir_model)
         tokens = []
         scores = []
         toktypes = []
@@ -1099,7 +1093,7 @@ class MiniCPMModel(Model):
         self.gguf_writer.add_file_type(self.ftype)
 
     def set_vocab(self):
-        self._set_vocab_hf()
+        self._set_vocab_llama_hf()
 
     def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
         if n_kv_head is not None and n_head != n_kv_head:
@@ -1700,11 +1694,8 @@ class BertModel(Model):
         self.gguf_writer.add_pooling_type(pooling_type)
 
     def set_vocab(self):
-        path = self.dir_model
-        added_tokens_path = self.dir_model if self.dir_model.exists() else None
-
         # use huggingface vocab to get all tokens
-        vocab = HfVocab(path, added_tokens_path)
+        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
         tokens, scores, toktypes = zip(*vocab.all_tokens())
         assert len(tokens) == vocab.vocab_size
         self.vocab_size = vocab.vocab_size