diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2024-05-04 08:32:32 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-04 08:32:32 +0300 |
commit | 92139b90af4841d7fd060b526bdd443b621770ff (patch) | |
tree | 9679c3de1b39970ca73b5bd988c63ddac0359ca6 /models | |
parent | a2ac89d6efb41b535778bfeaecaae8fe295b6ed3 (diff) |
tests : add test-tokenizer-0.sh + fix some tokenizers (#7036)
* tests : add test-tokenizer-0.sh
* unicode : add all unicode number ranges
* starcoder : fix pre-tokenizer
* tests : add test that fails with DeepSeek tokenizers
* falcon : fix regex
* unicode : regenerate unicode tables
* refact : add tokenizer model
* lint : fix
* tests : disable failing tests
ggml-ci
* refact : add tests files
ggml-ci
* convert : print -> logging
ggml-ci
* lint : fix
* unicode : digit -> number
* phi-3 : update
Diffstat (limited to 'models')
24 files changed, 209 insertions, 0 deletions
diff --git a/models/ggml-vocab-bert-bge.gguf.inp b/models/ggml-vocab-bert-bge.gguf.inp index 0389f00c..0a89107c 100644 --- a/models/ggml-vocab-bert-bge.gguf.inp +++ b/models/ggml-vocab-bert-bge.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-bert-bge.gguf.out b/models/ggml-vocab-bert-bge.gguf.out index 969552e1..e4a76cdb 100644 --- a/models/ggml-vocab-bert-bge.gguf.out +++ b/models/ggml-vocab-bert-bge.gguf.out @@ -1,3 +1,5 @@ + 29464 2094 1018 1092 2706 + 11865 17875 diff --git a/models/ggml-vocab-deepseek-coder.gguf.inp b/models/ggml-vocab-deepseek-coder.gguf.inp index 0389f00c..0a89107c 100644 --- a/models/ggml-vocab-deepseek-coder.gguf.inp +++ b/models/ggml-vocab-deepseek-coder.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-deepseek-coder.gguf.out b/models/ggml-vocab-deepseek-coder.gguf.out index 8ef585c7..9ccc560d 100644 --- a/models/ggml-vocab-deepseek-coder.gguf.out +++ b/models/ggml-vocab-deepseek-coder.gguf.out @@ -1,3 +1,5 @@ + 1050 207 19 207 19192 4217 + 37 32009 71 6247 207 243 diff --git a/models/ggml-vocab-deepseek-llm.gguf.inp b/models/ggml-vocab-deepseek-llm.gguf.inp index 0389f00c..0a89107c 100644 --- a/models/ggml-vocab-deepseek-llm.gguf.inp +++ b/models/ggml-vocab-deepseek-llm.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-deepseek-llm.gguf.out b/models/ggml-vocab-deepseek-llm.gguf.out index 0ea9d66e..fd94b896 100644 --- a/models/ggml-vocab-deepseek-llm.gguf.out +++ b/models/ggml-vocab-deepseek-llm.gguf.out @@ -1,3 +1,5 @@ + 1052 207 19 207 19109 4223 + 37 100014 71 6245 207 243 diff --git a/models/ggml-vocab-falcon.gguf.inp b/models/ggml-vocab-falcon.gguf.inp index 0389f00c..0a89107c 100644 --- a/models/ggml-vocab-falcon.gguf.inp +++ b/models/ggml-vocab-falcon.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-falcon.gguf.out b/models/ggml-vocab-falcon.gguf.out index cb8da7b1..209b04cd 100644 --- a/models/ggml-vocab-falcon.gguf.out +++ b/models/ggml-vocab-falcon.gguf.out @@ -1,3 +1,5 @@ + 878 204 31 3068 133 2137 + 28611 132 30042 204 258 diff --git a/models/ggml-vocab-gpt-2.gguf.inp b/models/ggml-vocab-gpt-2.gguf.inp index 0389f00c..0a89107c 100644 --- a/models/ggml-vocab-gpt-2.gguf.inp +++ b/models/ggml-vocab-gpt-2.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-gpt-2.gguf.out b/models/ggml-vocab-gpt-2.gguf.out index 9986f38e..78430f0d 100644 --- a/models/ggml-vocab-gpt-2.gguf.out +++ b/models/ggml-vocab-gpt-2.gguf.out @@ -1,3 +1,5 @@ + 798 604 25208 1933 + 37 9116 71 11751 220 220 220 diff --git a/models/ggml-vocab-llama-bpe.gguf.inp b/models/ggml-vocab-llama-bpe.gguf.inp index 0389f00c..0a89107c 100644 --- a/models/ggml-vocab-llama-bpe.gguf.inp +++ b/models/ggml-vocab-llama-bpe.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-llama-bpe.gguf.out b/models/ggml-vocab-llama-bpe.gguf.out index 4d903e1c..1f00e381 100644 --- a/models/ggml-vocab-llama-bpe.gguf.out +++ b/models/ggml-vocab-llama-bpe.gguf.out @@ -1,3 +1,5 @@ + 1142 220 19 220 27154 4038 + 37 51853 261 220 256 diff --git a/models/ggml-vocab-llama-spm.gguf.inp b/models/ggml-vocab-llama-spm.gguf.inp index 0389f00c..0a89107c 100644 --- a/models/ggml-vocab-llama-spm.gguf.inp +++ b/models/ggml-vocab-llama-spm.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-llama-spm.gguf.out b/models/ggml-vocab-llama-spm.gguf.out index 15d00b10..9c3327cb 100644 --- a/models/ggml-vocab-llama-spm.gguf.out +++ b/models/ggml-vocab-llama-spm.gguf.out @@ -1,3 +1,5 @@ + 474 287 29871 29946 29871 30226 7378 + 383 4000 261 259 1678 diff --git a/models/ggml-vocab-mpt.gguf.inp b/models/ggml-vocab-mpt.gguf.inp index 0389f00c..0a89107c 100644 --- a/models/ggml-vocab-mpt.gguf.inp +++ b/models/ggml-vocab-mpt.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-mpt.gguf.out b/models/ggml-vocab-mpt.gguf.out index 1f4b0eb3..d8d0fe90 100644 --- a/models/ggml-vocab-mpt.gguf.out +++ b/models/ggml-vocab-mpt.gguf.out @@ -1,3 +1,5 @@ + 728 577 24142 2607 + 39 26288 6554 209 50276 diff --git a/models/ggml-vocab-phi-3.gguf b/models/ggml-vocab-phi-3.gguf Binary files differindex 72fdb409..f8022a38 100644 --- a/models/ggml-vocab-phi-3.gguf +++ b/models/ggml-vocab-phi-3.gguf diff --git a/models/ggml-vocab-phi-3.gguf.inp b/models/ggml-vocab-phi-3.gguf.inp index 0389f00c..0a89107c 100644 --- a/models/ggml-vocab-phi-3.gguf.inp +++ b/models/ggml-vocab-phi-3.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-phi-3.gguf.out b/models/ggml-vocab-phi-3.gguf.out index 15d00b10..9c3327cb 100644 --- a/models/ggml-vocab-phi-3.gguf.out +++ b/models/ggml-vocab-phi-3.gguf.out @@ -1,3 +1,5 @@ + 474 287 29871 29946 29871 30226 7378 + 383 4000 261 259 1678 diff --git a/models/ggml-vocab-refact.gguf b/models/ggml-vocab-refact.gguf Binary files differindex 8f26cfb7..52afcf01 100644 --- a/models/ggml-vocab-refact.gguf +++ b/models/ggml-vocab-refact.gguf diff --git a/models/ggml-vocab-refact.gguf.inp b/models/ggml-vocab-refact.gguf.inp new file mode 100644 index 00000000..0a89107c --- /dev/null +++ b/models/ggml-vocab-refact.gguf.inp @@ -0,0 +1,106 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-refact.gguf.out b/models/ggml-vocab-refact.gguf.out new file mode 100644 index 00000000..06b15c09 --- /dev/null +++ b/models/ggml-vocab-refact.gguf.out @@ -0,0 +1,43 @@ + 4833 225 38 225 143 140 17723 + 56 2006 3935 265 + + 225 + 261 + 264 + 202 + 203 + 478 + 2831 + 15773 + 8279 5788 + 12000 5788 + 8279 10896 + 12000 10896 + 12000 10896 19 + 8279 30 5788 19 + 12000 30 5788 19 + 458 438 5945 118 252 32 3766 + 105 34 38 42 225 41 102 1707 12530 10180 1479 8278 + 39862 8372 1039 9446 40242 13852 2053 8949 12531 1520 10700 + 14574 227 14574 133 14574 246 30457 238 14574 242 30457 229 14574 249 14574 134 14574 258 30457 228 14574 258 14574 114 14574 133 14574 232 14574 228 14574 254 14574 232 30457 228 14574 236 + 3807 253 227 308 4382 27 18458 133 46113 44967 123 13868 308 12565 19775 33071 40824 733 27 41889 308 2585 22680 688 1401 2819 4369 2404 27 + 8279 + 12000 + 225 12000 + 261 12000 + 264 12000 + 264 12000 284 12000 + 308 + 203 280 + 25 34666 + 8279 30 533 25 464 19 4971 884 844 18458 228 1018 4982 13368 2909 9513 17827 35 37 35 38 35 39 35 11873 47838 + 37 + 37 37 + 37 37 37 + 37 37 37 37 + 37 37 37 37 37 + 37 37 37 37 37 37 + 37 37 37 37 37 37 37 + 37 37 37 37 37 37 37 37 + 37 37 37 37 37 37 37 37 37 + 334 719 8878 202 10885 4222 16104 28570 203 3807 253 227 308 4382 27 18458 133 46113 44967 123 13868 308 12565 19775 33071 40824 733 27 41889 5945 118 252 3807 118 252 225 37 225 37 37 225 37 37 37 225 37 37 37 37 225 37 37 37 37 37 225 37 37 37 37 37 37 225 37 37 37 37 37 37 37 225 37 37 37 37 37 37 37 37 225 37 32 37 225 37 497 37 225 37 1179 37 225 14574 227 14574 133 14574 246 30457 238 14574 242 30457 229 14574 249 14574 134 14574 258 30457 228 14574 258 14574 114 14574 133 14574 232 36628 228 1018 4982 13368 2909 9513 17827 35 37 35 38 35 39 35 11873 47838 20921 16623 13028 8372 1039 9446 40242 13852 2053 8949 12531 1520 10700 5881 9592 13299 914 31753 31359 9163 3202 35472 10397 439 4763 2583 330 102 1455 938 1182 2017 30 330 613 844 3654 49 330 63 646 3654 439 4621 1930 561 30 330 54 844 2124 1629 35993 49 2688 25 7709 312 25 94 62 diff --git a/models/ggml-vocab-starcoder.gguf.inp b/models/ggml-vocab-starcoder.gguf.inp index 0389f00c..0a89107c 100644 --- a/models/ggml-vocab-starcoder.gguf.inp +++ b/models/ggml-vocab-starcoder.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-starcoder.gguf.out b/models/ggml-vocab-starcoder.gguf.out index cd04254a..ccb55c7f 100644 --- a/models/ggml-vocab-starcoder.gguf.out +++ b/models/ggml-vocab-starcoder.gguf.out @@ -1,3 +1,5 @@ + 4850 244 57 244 162 159 17722 + 75 2022 3943 284 244 280 |