From 92139b90af4841d7fd060b526bdd443b621770ff Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 4 May 2024 08:32:32 +0300 Subject: tests : add test-tokenizer-0.sh + fix some tokenizers (#7036) * tests : add test-tokenizer-0.sh * unicode : add all unicode number ranges * starcoder : fix pre-tokenizer * tests : add test that fails with DeepSeek tokenizers * falcon : fix regex * unicode : regenerate unicode tables * refact : add tokenizer model * lint : fix * tests : disable failing tests ggml-ci * refact : add tests files ggml-ci * convert : print -> logging ggml-ci * lint : fix * unicode : digit -> number * phi-3 : update --- unicode-data.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'unicode-data.h') diff --git a/unicode-data.h b/unicode-data.h index cb9dd8aa..3cf84117 100644 --- a/unicode-data.h +++ b/unicode-data.h @@ -5,7 +5,7 @@ #include <utility> #include <vector> -extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_digit; +extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number; extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter; extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace; extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark; -- cgit v1.2.3