From b43272afa29a64dcb8bcf26a96a05bac40792b92 Mon Sep 17 00:00:00 2001 From: jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> Date: Sat, 18 May 2024 01:09:13 +0200 Subject: Unicode codepoint flags for custom regexs (#7245) * Replace CODEPOINT_TYPE_* with codepoint_flags * Update and bugfix brute force random test * Deterministic brute force random test * Unicode normalization NFD * Get rid of BOM --- tests/test-tokenizer-random.py | 113 ++++++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 53 deletions(-) (limited to 'tests/test-tokenizer-random.py') diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 5b2ab8ef..d5a6f185 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -1,5 +1,5 @@ # Test libllama tokenizer == AutoTokenizer. -# Brute force random tokens/text generation. +# Brute force random words/text generation. # # Sample usage: # @@ -12,10 +12,10 @@ import argparse import subprocess import random -from typing import Iterator +from typing import Callable, Iterator import cffi -from transformers import AutoTokenizer, PreTrainedTokenizerBase +from transformers import AutoTokenizer logger = logging.getLogger("test-tokenizer-random-bpe") @@ -145,28 +145,35 @@ def generator_custom_text() -> Iterator[str]: def generator_custom_text_edge_cases() -> Iterator[str]: """Edge cases found while debugging""" yield from [ - '\x1f-a', # unicode_ranges_control, {0x00001C, 0x00001F} - '¼-a', # unicode_ranges_digit, 0x00BC - '½-a', # unicode_ranges_digit, 0x00BD - '¾-a', # unicode_ranges_digit, 0x00BE - 'a 〇b', # unicode_ranges_digit, 0x3007 - 'Ⅵ-a', # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms - '\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM) - 'a' # TODO: Phi-3 fail + '\x1f-a', # unicode_ranges_control, {0x00001C, 0x00001F} + '¼-a', # unicode_ranges_digit, 0x00BC + '½-a', # unicode_ranges_digit, 0x00BD + '¾-a', # unicode_ranges_digit, 0x00BE + 'a 〇b', # unicode_ranges_digit, 0x3007 + 'Ⅵ-a', # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms + '\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM) + 'Cửa Việt', # llama-3, ignore_merges = true + 'a', # TODO: Phi-3 fail + 'a\na', # TODO: Bert fail ] -def generator_random_chars(iterations = 100) -> Iterator[str]: +def generator_vocab_words(vocab: list[str]) -> Iterator[str]: + """Brute force check all vocab words""" + yield from vocab + + +def generator_random_chars(iterations=100) -> Iterator[str]: """Brute force random text with simple characters""" WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5) - CHARS = list(set(""" + CHARS = list(sorted(set(""" ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ áéíóúàèìòùâêîôûäëïöü .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_ - """)) + """))) rand = random.Random() for m in range(iterations): @@ -181,13 +188,13 @@ def generator_random_chars(iterations = 100) -> Iterator[str]: yield "".join(text) -def generator_random_vocab_chars(tokenizer: PreTrainedTokenizerBase, iterations = 100) -> Iterator[str]: +def generator_random_vocab_chars(vocab: list[str], iterations=100) -> Iterator[str]: """Brute force random text with vocab characters""" - vocab_ids = list(tokenizer.vocab.values()) - vocab_text = tokenizer.decode(vocab_ids, skip_special_tokens=True) - vocab_chars = list(set(vocab_text)) - del vocab_ids, vocab_text + vocab_chars = set() + for word in vocab: + vocab_chars.update(word) + vocab_chars = list(sorted(vocab_chars)) rand = random.Random() for m in range(iterations): @@ -196,19 +203,11 @@ def generator_random_vocab_chars(tokenizer: PreTrainedTokenizerBase, iterations yield "".join(text) -def generator_random_vocab_tokens(tokenizer: PreTrainedTokenizerBase, iterations = 100) -> Iterator[str]: - """Brute force random text from vocab tokens""" - - space_id = tokenizer.encode(" ", add_special_tokens=False)[0] - vocab_ids = list(tokenizer.vocab.values()) - vocab_ids = list(sorted(vocab_ids + vocab_ids)) - for i in range(1, len(vocab_ids), 2): - vocab_ids[i] = space_id - vocab_tokens = tokenizer.decode(vocab_ids, skip_special_tokens=True) - vocab_tokens = vocab_tokens.split(" ") - del vocab_ids +def generator_random_vocab_words(vocab: list[str], iterations=100) -> Iterator[str]: + """Brute force random text from vocab words""" - yield from vocab_tokens + vocab = [w.strip() for w in vocab] + yield from vocab rand = random.Random() for m in range(iterations): @@ -217,14 +216,13 @@ def generator_random_vocab_tokens(tokenizer: PreTrainedTokenizerBase, iterations num_words = rand.randint(300, 400) for i in range(num_words): k = rand.randint(1, 3) - tokens = rand.choices(vocab_tokens, k=k) - tokens = [t.strip(" \n\r\t") for t in tokens] + words = rand.choices(vocab, k=k) sep = rand.choice(" \n\r\t") - text.append("".join(tokens) + sep) + text.append("".join(words) + sep) yield "".join(text) -def generator_random_bytes(iterations = 100) -> Iterator[str]: +def generator_random_bytes(iterations=100) -> Iterator[str]: """Brute force random bytes""" WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5) @@ -242,10 +240,10 @@ def generator_random_bytes(iterations = 100) -> Iterator[str]: yield "".join(text) -def test_compare_tokenizer(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, generator: Iterator[str]): +def test_compare_tokenizer(func_tokenize1: Callable, func_tokenize2: Callable, generator: Iterator[str]): def find_first_mismatch(ids1: list[int], ids2: list[int]): - for i, (a,b) in enumerate(zip(ids1, ids2)): + for i, (a, b) in enumerate(zip(ids1, ids2)): if a != b: return i if len(ids1) == len(ids2): @@ -255,15 +253,12 @@ def test_compare_tokenizer(model: LibLlamaModel, tokenizer: PreTrainedTokenizerB t0 = time.perf_counter() logger.info("%s: %s" % (generator.__name__, "ini")) for text in generator: - ids1 = model.tokenize(text, add_special=False, parse_special=False) - ids2 = tokenizer.encode(text, add_special_tokens=False) + ids1 = func_tokenize1(text) + ids2 = func_tokenize2(text) if ids1 != ids2: i = find_first_mismatch(ids1, ids2) ids1 = list(ids1)[max(0, i - 2) : i + 2 + 1] ids2 = list(ids2)[max(0, i - 2) : i + 2 + 1] - text2 = tokenizer.decode(ids2, skip_special_tokens=True) - assert (text2 in text) - logger.info(" Text: " + repr(text2)) logger.info(" TokenIDs: " + str(ids1)) logger.info(" Expected: " + str(ids2)) raise Exception() @@ -271,25 +266,37 @@ def test_compare_tokenizer(model: LibLlamaModel, tokenizer: PreTrainedTokenizerB logger.info("%s: end, time: %.3f secs" % (generator.__name__, t1 - t0)) -if __name__ == "__main__": - +def main(argv: list[str] = None): parser = argparse.ArgumentParser() parser.add_argument("vocab_file", help="path to vocab 'gguf' file") parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") parser.add_argument("--verbose", action="store_true", help="increase output verbosity") - args = parser.parse_args() + args = parser.parse_args(argv) logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) - model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=2048)) - + model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096)) tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer) - test_compare_tokenizer(model, tokenizer, generator_custom_text()) - test_compare_tokenizer(model, tokenizer, generator_custom_text_edge_cases()) - test_compare_tokenizer(model, tokenizer, generator_random_chars(10_000)) - test_compare_tokenizer(model, tokenizer, generator_random_vocab_chars(tokenizer, 10_000)) - test_compare_tokenizer(model, tokenizer, generator_random_vocab_tokens(tokenizer, 10_000)) - # test_compare_tokenizer(model, tokenizer, generator_random_bytes(10_000)) # FAIL + def func_tokenize2(text: str): + return tokenizer.encode(text, add_special_tokens=False) + + parse_special = all(len(func_tokenize2(t)) == 1 for t in tokenizer.all_special_tokens) + + def func_tokenize1(text: str): + return model.tokenize(text, add_special=False, parse_special=parse_special) + + vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True))) + test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text()) + test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases()) + test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab)) + test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000)) + test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000)) + test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 10_000)) + # test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_bytes(10_000)) # FAIL model.free() + + +if __name__ == "__main__": + main() -- cgit v1.2.3