Add test for MPT tokenization (#3728)

* Add test for MPT tokenization
* Revert code motion
* Remove unnecessary restriction in test case
* Clarify logic in conversion
author: goerch <jhr.walter@t-online.de> 2023-10-22 21:21:42 +0200
committer: GitHub <noreply@github.com> 2023-10-22 21:21:42 +0200
commit: 9e70cc03229df19ca2d28ce23cc817198f897278 (patch)
tree: 0c027b73d2efc94260b41e2227a1318e2c9ba23d /convert-mpt-hf-to-gguf.py
parent: 5a42a5f8e8a86da9ac88008d748cf232a83aa0e1 (diff)
1 file changed, 11 insertions, 4 deletions
diff --git a/convert-mpt-hf-to-gguf.py b/convert-mpt-hf-to-gguf.py
index 21b9fd50..2d2fa232 100755
--- a/convert-mpt-hf-to-gguf.py
+++ b/convert-mpt-hf-to-gguf.py
@@ -128,15 +128,22 @@ vocab_size = hparams["vocab_size"]
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 
+added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
-    scores.append(0.0) # dummy
-    toktypes.append(gguf.TokenType.NORMAL)
+    if i not in reverse_vocab:
+        tokens.append(f"[PAD{i}]")
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    elif reverse_vocab[i] in added_vocab:
+        # NOTE: wouldn't we like to distinguish CONTROL tokens here?
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    else:
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.NORMAL)
 
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
author	goerch <jhr.walter@t-online.de>	2023-10-22 21:21:42 +0200
committer	GitHub <noreply@github.com>	2023-10-22 21:21:42 +0200
commit	9e70cc03229df19ca2d28ce23cc817198f897278 (patch)
tree	0c027b73d2efc94260b41e2227a1318e2c9ba23d /convert-mpt-hf-to-gguf.py
parent	5a42a5f8e8a86da9ac88008d748cf232a83aa0e1 (diff)