From d7e852c1bc8e85bf62a6f1aede08cd2de723404a Mon Sep 17 00:00:00 2001
From: jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
Date: Tue, 21 May 2024 14:39:48 +0200
Subject: Tokenizer SPM fixes for phi-3 and llama-spm (bugfix) (#7425)

* Update brute force test: add_special
* Update brute force test: default values for add_bos_token and add_eos_token
* Enable rtrim when pre-inserting BOS

Co-authored-by: Georgi Gerganov

* Revert "server : fix test regexes"
---
 convert-hf-to-gguf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'convert-hf-to-gguf.py')

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 8937a498..1acf45bf 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1749,7 +1749,7 @@ class Phi3MiniModel(Model):
                     token_id = int(token_id)
                     token = foken_data["content"].encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
-                        assert(tokens[token_id] == token)
+                        assert tokens[token_id] == token
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -1765,7 +1765,7 @@ class Phi3MiniModel(Model):
                     token_id = int(foken_data["id"])
                     token = foken_data["content"].encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
-                        assert(tokens[token_id] == token)
+                        assert tokens[token_id] == token
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
-- 
cgit v1.2.3