Diffstat (limited to 'tests/test-tokenizer-random.py')
-rw-r--r--  tests/test-tokenizer-random.py  20 +++++++++++++-------
1 file changed, 13 insertions(+), 7 deletions(-)
diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index 7e1b656e..ec1b2837 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -167,8 +167,10 @@ def generator_random_special_tokens(tokenizer, iterations=100) -> Iterator[str]:
     for m in range(iterations):
         rand.seed(m)
         words = rand.choices(special_tokens, k=500)
-        if tokenizer.add_bos_token:  # skip spam warning of double BOS
-            while words and words[0] == tokenizer.bos_token:
+        if words[0] == tokenizer.bos_token:  # skip spam warning of double BOS
+            while len(words) > 1 and words[1] == tokenizer.bos_token:  # leave one starting BOS
+                words.pop(0)
+            if tokenizer.add_bos_token:  # drop all starting BOS
                 words.pop(0)
         yield "".join(words)
 
@@ -293,15 +295,17 @@ def main(argv: list[str] = None):
 
     model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))
     tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)
-    tokenizer.add_bos_token = getattr(tokenizer, "add_bos_token", True)
-    tokenizer.add_eos_token = getattr(tokenizer, "add_eos_token", False)
-
     def func_tokenize1(text: str):
         return model.tokenize(text, add_special=True, parse_special=True)
 
     def func_tokenize2(text: str):
         return tokenizer.encode(text, add_special_tokens=True)
 
+    ids = func_tokenize2("a")
+    assert 1 <= len(ids) <= 3
+    add_bos_token = len(ids) > 1 and tokenizer.bos_token_id == ids[0]
+    tokenizer.add_bos_token = getattr(tokenizer, "add_bos_token", add_bos_token)
+
     vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
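Note: rather than assuming Llama-style defaults, the tokenizer is now probed: a single character encodes to at most BOS + token + EOS (hence the assert), so if the first id equals bos_token_id the tokenizer prepends BOS by itself. A sketch of the same probe against a Hugging Face tokenizer; the checkpoint name is only an illustrative choice, and the EOS line is an assumed extension of the same trick, not part of this patch:

    from transformers import AutoTokenizer

    # Example checkpoint only; any local tokenizer directory works the same.
    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
    ids = tokenizer.encode("a", add_special_tokens=True)
    assert 1 <= len(ids) <= 3  # single char -> at most BOS + token + EOS
    add_bos_token = len(ids) > 1 and tokenizer.bos_token_id == ids[0]
    add_eos_token = len(ids) > 1 and tokenizer.eos_token_id == ids[-1]
    print(add_bos_token, add_eos_token)  # SPM Llama tokenizer: True False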
@@ -324,8 +328,10 @@ if __name__ == "__main__":
     # import os
     # tokenizers = os.listdir(path_tokenizers)
     tokenizers = [
-        "llama-spm",    # SPM
-        "phi-3",        # SPM
+        # "llama-spm",  # SPM
+        # "phi-3",      # SPM
+        "jina-v2-en",   # WPM
+        "bert-bge",     # WPM
     ]
     for tokenizer in tokenizers:
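Note: the two WPM models enabled here are the motivating case for the probe above: a WordPiece tokenizer wraps input in [CLS] ... [SEP] and defines neither bos_token nor an add_bos_token attribute. A quick way to see this; the checkpoint name is only an example of a bert-bge-style model:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")
    ids = tok.encode("a", add_special_tokens=True)
    print(ids)                                  # three ids: [CLS], "a", [SEP]
    print(tok.bos_token_id)                     # None -> WPM has no BOS token
    print(getattr(tok, "add_bos_token", None))  # None -> attribute not defined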