| author | jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> | 2024-05-20 20:15:57 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-05-20 20:15:57 +0200 |
| commit | 917dc8cfa67a72fb7c8bf7392270da3bf4833af4 (patch) | |
| tree | 170c39ba79ae72036105970d92548da420699ba1 /convert-hf-to-gguf.py | |
| parent | fabf30b4c4fca32e116009527180c252919ca922 (diff) | |
Tokenizer SPM fixes for phi-3 and llama-spm (#7375)
* Update brute force test: special tokens
* Fix added tokens
  - Try to read 'added_tokens.json'.
  - Try to read 'tokenizer_config.json'.
  - Try to read 'tokenizer.json'.
* Fix special tokens rtrim
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* server : fix test regexes
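
The "Fix added tokens" bullets above name the three files in which a Hugging Face checkpoint can declare added tokens. Below is a minimal sketch of that lookup order, separate from the converter itself; the helper `collect_added_tokens` and the returned dict layout are illustrative, not code from this commit.

```python
# Sketch: gather added tokens from the three metadata files a HF checkpoint may ship.
# Later sources override earlier ones, mirroring the order listed in the commit message.
import json
from pathlib import Path


def collect_added_tokens(model_dir: Path) -> dict[int, dict]:
    """Return {token_id: {"content": str, "special": bool}} merged from the three files."""
    found: dict[int, dict] = {}

    # 1) added_tokens.json: {"<|token|>": id, ...}
    path = model_dir / "added_tokens.json"
    if path.is_file():
        for content, token_id in json.loads(path.read_text(encoding="utf-8")).items():
            found[int(token_id)] = {"content": content, "special": False}

    # 2) tokenizer_config.json: "added_tokens_decoder": {"<id>": {"content": ..., "special": ...}}
    path = model_dir / "tokenizer_config.json"
    if path.is_file():
        decoder = json.loads(path.read_text(encoding="utf-8")).get("added_tokens_decoder", {})
        for token_id, data in decoder.items():
            found[int(token_id)] = {"content": data["content"], "special": bool(data.get("special"))}

    # 3) tokenizer.json: "added_tokens": [{"id": ..., "content": ..., "special": ...}, ...]
    path = model_dir / "tokenizer.json"
    if path.is_file():
        for data in json.loads(path.read_text(encoding="utf-8")).get("added_tokens", []):
            found[int(data["id"])] = {"content": data["content"], "special": bool(data.get("special"))}

    return found
```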
Diffstat (limited to 'convert-hf-to-gguf.py')
-rwxr-xr-x | convert-hf-to-gguf.py | 32 |
1 file changed, 32 insertions, 0 deletions
```diff
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index d534b516..8937a498 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1740,6 +1740,38 @@ class Phi3MiniModel(Model):
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, foken_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token = foken_data["content"].encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        tokenizer_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, "r", encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+                added_tokens = tokenizer_json.get("added_tokens", [])
+                for foken_data in added_tokens:
+                    token_id = int(foken_data["id"])
+                    token = foken_data["content"].encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
```
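
Distilled from the hunk above: each added token overwrites the SentencePiece vocab slot, gets a pinned score of -1000.0, and is typed USER_DEFINED, or CONTROL when the metadata marks it as special. A standalone sketch of that per-token override follows; the enum mirrors the one defined in convert-hf-to-gguf.py, while `apply_added_token` and the plain-list arguments are illustrative.

```python
# Sketch of the override applied per added token in the hunk above.
from enum import IntEnum


class SentencePieceTokenTypes(IntEnum):  # mirrors the IntEnum in convert-hf-to-gguf.py
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6


def apply_added_token(tokens: list[bytes], scores: list[float], toktypes: list[int],
                      token_id: int, content: str, special: bool) -> None:
    token = content.encode("utf-8")
    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
        # A slot already filled by the SentencePiece model must agree with the metadata.
        assert tokens[token_id] == token
    tokens[token_id] = token
    scores[token_id] = -1000.0  # pinned low score used for added tokens
    toktypes[token_id] = (SentencePieceTokenTypes.CONTROL if special
                          else SentencePieceTokenTypes.USER_DEFINED)
```

Asserting only when the existing slot is not UNKNOWN lets the metadata fill placeholder ids the SentencePiece model leaves undefined, while still catching disagreements on tokens that both sources define.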