From 32c8486e1f0297393cb22ac0a0d26a6b17ad4d54 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Tue, 26 Mar 2024 17:46:21 -0400 Subject: wpm : portable unicode tolower (#6305) Also use C locale for ispunct/isspace, and split unicode-data.cpp from unicode.cpp. --- llama.cpp | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'llama.cpp') diff --git a/llama.cpp b/llama.cpp index 22db79d6..892d46fb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -71,7 +72,6 @@ #include #include #include -#include #include #include #include @@ -11010,7 +11010,7 @@ struct llm_tokenizer_wpm { if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) { continue; } - code = to_lower(code); + code = unicode_tolower(code); if (type == CODEPOINT_TYPE_WHITESPACE) { code = ' '; } @@ -11030,7 +11030,7 @@ struct llm_tokenizer_wpm { std::vector<std::string> words; while (r < new_str.size()) { // if is whitespace - if (isspace(new_str[r])) { + if (isspace(new_str[r], std::locale::classic())) { if (r > l) words.push_back(new_str.substr(l, (r - l))); l = r + 1; r = l; @@ -11044,18 +11044,12 @@ struct llm_tokenizer_wpm { return words; } - uint32_t to_lower(uint32_t code) { - static const std::locale locale("en_US.UTF-8"); -#if defined(_WIN32) - if (code > 0xFFFF) { - return code; - } -#endif - return std::tolower(wchar_t(code), locale); - } - bool is_ascii_punct(uint32_t code) { - return code < 256 && ispunct(code); + if (code > 0xFF) { + return false; + } + auto c = char(static_cast<unsigned char>(code)); + return ispunct(c, std::locale::classic()); } bool is_chinese_char(uint32_t cpt) { -- cgit v1.2.3