summary | refs | log | tree | commit | diff
path: root/convert-mpt-hf-to-gguf.py
diff options
context:
space:
mode:
author: goerch <jhr.walter@t-online.de> 2023-10-22 21:21:42 +0200
committer: GitHub <noreply@github.com> 2023-10-22 21:21:42 +0200
commit: 9e70cc03229df19ca2d28ce23cc817198f897278 (patch)
tree: 0c027b73d2efc94260b41e2227a1318e2c9ba23d /convert-mpt-hf-to-gguf.py
parent: 5a42a5f8e8a86da9ac88008d748cf232a83aa0e1 (diff)
Add test for MPT tokenization (#3728)
* Add test for MPT tokenization
* Revert code motion
* Remove unnecessary restriction in test case
* Clarify logic in conversion
Diffstat (limited to 'convert-mpt-hf-to-gguf.py')
-rwxr-xr-x  convert-mpt-hf-to-gguf.py  15
1 files changed, 11 insertions, 4 deletions
diff --git a/convert-mpt-hf-to-gguf.py b/convert-mpt-hf-to-gguf.py
index 21b9fd50..2d2fa232 100755
--- a/convert-mpt-hf-to-gguf.py
+++ b/convert-mpt-hf-to-gguf.py
@@ -128,15 +128,22 @@ vocab_size = hparams["vocab_size"]
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
+added_vocab = tokenizer.get_added_vocab()
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
for i in range(vocab_size):
- tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
- scores.append(0.0) # dummy
- toktypes.append(gguf.TokenType.NORMAL)
+ if i not in reverse_vocab:
+ tokens.append(f"[PAD{i}]")
+ toktypes.append(gguf.TokenType.USER_DEFINED)
+ elif reverse_vocab[i] in added_vocab:
+ # NOTE: wouldn't we like to distinguish CONTROL tokens here?
+ tokens.append(reverse_vocab[i])
+ toktypes.append(gguf.TokenType.USER_DEFINED)
+ else:
+ tokens.append(reverse_vocab[i])
+ toktypes.append(gguf.TokenType.NORMAL)
gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))