path: root/convert-hf-to-gguf.py
author     jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>  2024-05-20 20:15:57 +0200
committer  GitHub <noreply@github.com>  2024-05-20 20:15:57 +0200
commit     917dc8cfa67a72fb7c8bf7392270da3bf4833af4 (patch)
tree       170c39ba79ae72036105970d92548da420699ba1 /convert-hf-to-gguf.py
parent     fabf30b4c4fca32e116009527180c252919ca922 (diff)
Tokenizer SPM fixes for phi-3 and llama-spm (#7375)
* Update brute force test: special tokens

* Fix added tokens
  - Try to read 'added_tokens.json'.
  - Try to read 'tokenizer_config.json'.
  - Try to read 'tokenizer.json'.

* Fix special tokens rtrim

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* server : fix test regexes
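For context, the three files named above store added-token data in different shapes: 'added_tokens.json' maps token text to id, 'tokenizer_config.json' has an "added_tokens_decoder" object keyed by id, and 'tokenizer.json' has an "added_tokens" array. Below is a minimal sketch of collecting overrides from whichever of these files exist, in the same order as listed; the helper name, the return shape, and the example ids/strings in the comments are illustrative, not part of this commit.

import json
from pathlib import Path

def collect_added_tokens(dir_model: Path) -> dict[int, tuple[str, bool]]:
    """Map token id -> (content, is_special) from the optional tokenizer files."""
    overrides: dict[int, tuple[str, bool]] = {}

    path = dir_model / 'added_tokens.json'
    if path.is_file():
        with open(path, "r", encoding="utf-8") as f:
            # shape: {"<|assistant|>": 32001, ...} -- a plain text-to-id map
            for content, token_id in json.load(f).items():
                overrides[int(token_id)] = (content, False)

    path = dir_model / 'tokenizer_config.json'
    if path.is_file():
        with open(path, "r", encoding="utf-8") as f:
            # shape: {"added_tokens_decoder": {"32001": {"content": "...", "special": true}, ...}}
            for token_id, data in json.load(f).get("added_tokens_decoder", {}).items():
                overrides[int(token_id)] = (data["content"], bool(data.get("special")))

    path = dir_model / 'tokenizer.json'
    if path.is_file():
        with open(path, "r", encoding="utf-8") as f:
            # shape: {"added_tokens": [{"id": 32001, "content": "...", "special": true}, ...]}
            for data in json.load(f).get("added_tokens", []):
                overrides[int(data["id"])] = (data["content"], bool(data.get("special")))

    return overrides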
Diffstat (limited to 'convert-hf-to-gguf.py')
-rwxr-xr-x  convert-hf-to-gguf.py  32
1 file changed, 32 insertions(+), 0 deletions(-)
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index d534b516..8937a498 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1740,6 +1740,38 @@ class Phi3MiniModel(Model):
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, foken_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token = foken_data["content"].encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        tokenizer_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, "r", encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+                added_tokens = tokenizer_json.get("added_tokens", [])
+                for foken_data in added_tokens:
+                    token_id = int(foken_data["id"])
+                    token = foken_data["content"].encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
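As a follow-up note, both new blocks apply the same per-token override: a slot already filled from the SentencePiece model must hold the same UTF-8 bytes (the assert), the score is pinned to -1000.0, and the type becomes USER_DEFINED, or CONTROL when the entry is flagged "special". A standalone sketch of that step, with the enum values mirroring the converter's SentencePiece token-type numbering; the helper name and the toy vocab are assumptions for illustration.

from enum import IntEnum

class SentencePieceTokenTypes(IntEnum):
    # Mirrors the SentencePiece token-type numbering used by the converter.
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6

def apply_override(tokens: list, scores: list, toktypes: list,
                   token_id: int, content: str, special: bool) -> None:
    token = content.encode("utf-8")
    # A slot already filled from the SentencePiece model must agree with the override.
    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
        assert tokens[token_id] == token
    tokens[token_id] = token
    scores[token_id] = -1000.0
    toktypes[token_id] = SentencePieceTokenTypes.CONTROL if special else SentencePieceTokenTypes.USER_DEFINED

# Tiny usage example with a two-slot vocab (ids and strings are illustrative):
tokens = [b"<unk>", b"placeholder"]
scores = [0.0, 0.0]
toktypes = [SentencePieceTokenTypes.UNKNOWN, SentencePieceTokenTypes.UNKNOWN]
apply_override(tokens, scores, toktypes, 1, "<|assistant|>", special=True)
assert toktypes[1] == SentencePieceTokenTypes.CONTROL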