summary | refs | log | tree | commit | diff
path: root/convert-hf-to-gguf.py
diff options
context:
space:
mode:
Diffstat (limited to 'convert-hf-to-gguf.py')
-rwxr-xr-x convert-hf-to-gguf.py 27
1 files changed, 9 insertions, 18 deletions
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index c5d2d0b7..918a90e5 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -23,7 +23,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
-from convert import HfVocab
+from convert import LlamaHfVocab
###### MODEL DEFINITIONS ######
@@ -230,7 +230,7 @@ class Model(ABC):
def _set_vocab_gpt2(self):
dir_model = self.dir_model
hparams = self.hparams
- tokens: list[bytearray] = []
+ tokens: list[str] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
@@ -243,8 +243,7 @@ class Model(ABC):
for i in range(vocab_size):
if i not in reverse_vocab:
- pad_token = f"[PAD{i}]".encode('utf-8')
- tokens.append(bytearray(pad_token))
+ tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
@@ -266,7 +265,7 @@ class Model(ABC):
def _set_vocab_qwen(self):
dir_model = self.dir_model
hparams = self.hparams
- tokens: list[bytearray] = []
+ tokens: list[str] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
@@ -291,8 +290,7 @@ class Model(ABC):
for i in range(vocab_size):
if i not in reverse_vocab:
- pad_token = f"[PAD{i}]".encode("utf-8")
- tokens.append(bytearray(pad_token))
+ tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
@@ -372,12 +370,8 @@ class Model(ABC):
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
- def _set_vocab_hf(self):
- path = self.dir_model
- added_tokens_path = self.dir_model
- vocab = HfVocab(
- path, added_tokens_path if added_tokens_path.exists() else None
- )
+ def _set_vocab_llama_hf(self):
+ vocab = LlamaHfVocab(self.dir_model)
tokens = []
scores = []
toktypes = []
@@ -1099,7 +1093,7 @@ class MiniCPMModel(Model):
self.gguf_writer.add_file_type(self.ftype)
def set_vocab(self):
- self._set_vocab_hf()
+ self._set_vocab_llama_hf()
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
if n_kv_head is not None and n_head != n_kv_head:
@@ -1700,11 +1694,8 @@ class BertModel(Model):
self.gguf_writer.add_pooling_type(pooling_type)
def set_vocab(self):
- path = self.dir_model
- added_tokens_path = self.dir_model if self.dir_model.exists() else None
-
# use huggingface vocab to get all tokens
- vocab = HfVocab(path, added_tokens_path)
+ vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
tokens, scores, toktypes = zip(*vocab.all_tokens())
assert len(tokens) == vocab.vocab_size
self.vocab_size = vocab.vocab_size