diff options
author | Jared Van Bortel <jared@nomic.ai> | 2024-03-26 17:46:21 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-26 17:46:21 -0400 |
commit | 32c8486e1f0297393cb22ac0a0d26a6b17ad4d54 (patch) | |
tree | aae4f945bf3355c009cb9643376439fcba5558d0 /llama.cpp | |
parent | 557410b8f06380560155ac7fcb8316d71ddc9837 (diff) |
wpm : portable unicode tolower (#6305)
Also use C locale for ispunct/isspace, and split unicode-data.cpp from unicode.cpp.
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 22 |
1 files changed, 8 insertions, 14 deletions
```diff
@@ -61,6 +61,7 @@
 #include <algorithm>
 #include <array>
 #include <cassert>
+#include <cctype>
 #include <cfloat>
 #include <cinttypes>
 #include <climits>
@@ -71,7 +72,6 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
-#include <cwctype>
 #include <forward_list>
 #include <fstream>
 #include <functional>
@@ -11010,7 +11010,7 @@ struct llm_tokenizer_wpm {
             if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
                 continue;
             }
-            code = to_lower(code);
+            code = unicode_tolower(code);
             if (type == CODEPOINT_TYPE_WHITESPACE) {
                 code = ' ';
             }
@@ -11030,7 +11030,7 @@ struct llm_tokenizer_wpm {
         std::vector<std::string> words;
         while (r < new_str.size()) {
             // if is whitespace
-            if (isspace(new_str[r])) {
+            if (isspace(new_str[r], std::locale::classic())) {
                 if (r > l) words.push_back(new_str.substr(l, (r - l)));
                 l = r + 1;
                 r = l;
@@ -11044,18 +11044,12 @@ struct llm_tokenizer_wpm {
         return words;
     }
 
-    uint32_t to_lower(uint32_t code) {
-        static const std::locale locale("en_US.UTF-8");
-#if defined(_WIN32)
-        if (code > 0xFFFF) {
-            return code;
-        }
-#endif
-        return std::tolower(wchar_t(code), locale);
-    }
-
     bool is_ascii_punct(uint32_t code) {
-        return code < 256 && ispunct(code);
+        if (code > 0xFF) {
+            return false;
+        }
+        auto c = char(static_cast<unsigned char>(code));
+        return ispunct(c, std::locale::classic());
     }
 
     bool is_chinese_char(uint32_t cpt) {
```