wpm : portable unicode tolower (#6305)

Also use C locale for ispunct/isspace, and split unicode-data.cpp from unicode.cpp.
author: Jared Van Bortel <jared@nomic.ai> 2024-03-26 17:46:21 -0400
committer: GitHub <noreply@github.com> 2024-03-26 17:46:21 -0400
commit: 32c8486e1f0297393cb22ac0a0d26a6b17ad4d54 (patch)
tree: aae4f945bf3355c009cb9643376439fcba5558d0 /llama.cpp
parent: 557410b8f06380560155ac7fcb8316d71ddc9837 (diff)
1 file changed, 8 insertions, 14 deletions
diff --git a/llama.cpp b/llama.cpp
index 22db79d6..892d46fb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -61,6 +61,7 @@
 #include <algorithm>
 #include <array>
 #include <cassert>
+#include <cctype>
 #include <cfloat>
 #include <cinttypes>
 #include <climits>
@@ -71,7 +72,6 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
-#include <cwctype>
 #include <forward_list>
 #include <fstream>
 #include <functional>
@@ -11010,7 +11010,7 @@ struct llm_tokenizer_wpm {
             if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
                 continue;
             }
-            code = to_lower(code);
+            code = unicode_tolower(code);
             if (type == CODEPOINT_TYPE_WHITESPACE) {
                 code = ' ';
             }
@@ -11030,7 +11030,7 @@ struct llm_tokenizer_wpm {
         std::vector<std::string> words;
         while (r < new_str.size()) {
             // if is whitespace
-            if (isspace(new_str[r])) {
+            if (isspace(new_str[r], std::locale::classic())) {
                 if (r > l) words.push_back(new_str.substr(l, (r - l)));
                 l = r + 1;
                 r = l;
@@ -11044,18 +11044,12 @@ struct llm_tokenizer_wpm {
         return words;
     }
 
-    uint32_t to_lower(uint32_t code) {
-        static const std::locale locale("en_US.UTF-8");
-#if defined(_WIN32)
-        if (code > 0xFFFF) {
-            return code;
-        }
-#endif
-        return std::tolower(wchar_t(code), locale);
-    }
-
     bool is_ascii_punct(uint32_t code) {
-        return code < 256 && ispunct(code);
+        if (code > 0xFF) {
+            return false;
+        }
+        auto c = char(static_cast<unsigned char>(code));
+        return ispunct(c, std::locale::classic());
     }
 
     bool is_chinese_char(uint32_t cpt) {
author	Jared Van Bortel <jared@nomic.ai>	2024-03-26 17:46:21 -0400
committer	GitHub <noreply@github.com>	2024-03-26 17:46:21 -0400
commit	32c8486e1f0297393cb22ac0a0d26a6b17ad4d54 (patch)
tree	aae4f945bf3355c009cb9643376439fcba5558d0 /llama.cpp
parent	557410b8f06380560155ac7fcb8316d71ddc9837 (diff)