From 32c8486e1f0297393cb22ac0a0d26a6b17ad4d54 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Tue, 26 Mar 2024 17:46:21 -0400
Subject: wpm : portable unicode tolower (#6305)

Also use C locale for ispunct/isspace, and split unicode-data.cpp from unicode.cpp.
---
 llama.cpp | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

(limited to 'llama.cpp')
diff --git a/llama.cpp b/llama.cpp
index 22db79d6..892d46fb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -61,6 +61,7 @@
 #include <algorithm>
 #include <array>
 #include <cassert>
+#include <cctype>
 #include <cfloat>
 #include <cinttypes>
 #include <climits>
@@ -71,7 +72,6 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
-#include <cwctype>
 #include <forward_list>
 #include <fstream>
 #include <functional>
@@ -11010,7 +11010,7 @@ struct llm_tokenizer_wpm {
             if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
                 continue;
             }
-            code = to_lower(code);
+            code = unicode_tolower(code);
             if (type == CODEPOINT_TYPE_WHITESPACE) {
                 code = ' ';
             }
@@ -11030,7 +11030,7 @@ struct llm_tokenizer_wpm {
         std::vector<std::string> words;
         while (r < new_str.size()) {
             // if is whitespace
-            if (isspace(new_str[r])) {
+            if (isspace(new_str[r], std::locale::classic())) {
                 if (r > l) words.push_back(new_str.substr(l, (r - l)));
                 l = r + 1;
                 r = l;
@@ -11044,18 +11044,12 @@ struct llm_tokenizer_wpm {
         return words;
     }
 
-    uint32_t to_lower(uint32_t code) {
-        static const std::locale locale("en_US.UTF-8");
-#if defined(_WIN32)
-        if (code > 0xFFFF) {
-            return code;
-        }
-#endif
-        return std::tolower(wchar_t(code), locale);
-    }
-
     bool is_ascii_punct(uint32_t code) {
-        return code < 256 && ispunct(code);
+        if (code > 0xFF) {
+            return false;
+        }
+        auto c = char(static_cast<unsigned char>(code));
+        return ispunct(c, std::locale::classic());
     }
 
     bool is_chinese_char(uint32_t cpt) {
-- 
cgit v1.2.3