summaryrefslogtreecommitdiff
path: root/unicode-data.h
diff options
context:
space:
mode:
authorGeorgi Gerganov <ggerganov@gmail.com>2024-05-04 08:32:32 +0300
committerGitHub <noreply@github.com>2024-05-04 08:32:32 +0300
commit92139b90af4841d7fd060b526bdd443b621770ff (patch)
tree9679c3de1b39970ca73b5bd988c63ddac0359ca6 /unicode-data.h
parenta2ac89d6efb41b535778bfeaecaae8fe295b6ed3 (diff)
tests : add test-tokenizer-0.sh + fix some tokenizers (#7036)
* tests : add test-tokenizer-0.sh * unicode : add all unicode number ranges * starcoder : fix pre-tokenizer * tests : add test that fails with DeepSeek tokenizers * falcon : fix regex * unicode : regenerate unicode tables * refact : add tokenizer model * lint : fix * tests : disable failing tests ggml-ci * refact : add tests files ggml-ci * convert : print -> logging ggml-ci * lint : fix * unicode : digit -> number * phi-3 : update
Diffstat (limited to 'unicode-data.h')
-rw-r--r--unicode-data.h2
1 files changed, 1 insertions, 1 deletions
diff --git a/unicode-data.h b/unicode-data.h
index cb9dd8aa..3cf84117 100644
--- a/unicode-data.h
+++ b/unicode-data.h
@@ -5,7 +5,7 @@
#include <utility>
#include <vector>
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_digit;
+extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;