diff options
Diffstat (limited to 'tests')
-rw-r--r-- | tests/test-tokenizer-0-falcon.py | 56 | ||||
-rw-r--r-- | tests/test-tokenizer-0-llama.py | 52 |
2 files changed, 54 insertions, 54 deletions
diff --git a/tests/test-tokenizer-0-falcon.py b/tests/test-tokenizer-0-falcon.py index 65e1c0db..4f06ec9b 100644 --- a/tests/test-tokenizer-0-falcon.py +++ b/tests/test-tokenizer-0-falcon.py @@ -14,34 +14,34 @@ dir_tokenizer = args.dir_tokenizer tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) tests = [ - "", - " ", - " ", - " ", - "\t", - "\n", - "\t\n", - "Hello world", - " Hello world", - "Hello World", - " Hello World", - " Hello World!", - "Hello, world!", - " Hello, world!", - " this is 🦙.cpp", - "w048 7tuijk dsdfhu", - "нещо на Български", - "កាន់តែពិសេសអាចខលចេញ", - "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", - "Hello", - " Hello", - " Hello", - " Hello", - " Hello", - " Hello\n Hello", - "\n =", - "' era", - ] + "", + " ", + " ", + " ", + "\t", + "\n", + "\t\n", + "Hello world", + " Hello world", + "Hello World", + " Hello World", + " Hello World!", + "Hello, world!", + " Hello, world!", + " this is 🦙.cpp", + "w048 7tuijk dsdfhu", + "нещо на Български", + "កាន់តែពិសេសអាចខលចេញ", + "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", + "Hello", + " Hello", + " Hello", + " Hello", + " Hello", + " Hello\n Hello", + "\n =", + "' era", +] for text in tests: print('text: ', text) diff --git a/tests/test-tokenizer-0-llama.py b/tests/test-tokenizer-0-llama.py index 21df8e6e..f3d4d7e3 100644 --- a/tests/test-tokenizer-0-llama.py +++ b/tests/test-tokenizer-0-llama.py @@ -14,32 +14,32 @@ dir_tokenizer = args.dir_tokenizer tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model') tests = [ - "", - " ", - " ", - " ", - "\t", - "\n", - "\t\n", - "Hello world", - " Hello world", - "Hello World", - " Hello World", - " Hello World!", - "Hello, world!", - " Hello, world!", - " this is 🦙.cpp", - "w048 7tuijk dsdfhu", - "нещо на Български", - "កាន់តែពិសេសអាចខលចេញ", - "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", - "Hello", - " Hello", - " Hello", - " Hello", - " Hello", - " Hello\n Hello", - ] + "", + " ", + " ", + " ", + "\t", + "\n", + "\t\n", + "Hello world", + " Hello world", + "Hello World", + " Hello World", + " Hello World!", + "Hello, world!", + " Hello, world!", + " this is 🦙.cpp", + "w048 7tuijk dsdfhu", + "нещо на Български", + "កាន់តែពិសេសអាចខលចេញ", + "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", + "Hello", + " Hello", + " Hello", + " Hello", + " Hello", + " Hello\n Hello", +] for text in tests: |