summary | refs | log | tree | commit | diff
path: root/llama.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llama.cpp')
-rw-r--r-- llama.cpp | 22
1 files changed, 8 insertions, 14 deletions
diff --git a/llama.cpp b/llama.cpp
index 22db79d6..892d46fb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -61,6 +61,7 @@
#include <algorithm>
#include <array>
#include <cassert>
+#include <cctype>
#include <cfloat>
#include <cinttypes>
#include <climits>
@@ -71,7 +72,6 @@
#include <cstdio>
#include <cstring>
#include <ctime>
-#include <cwctype>
#include <forward_list>
#include <fstream>
#include <functional>
@@ -11010,7 +11010,7 @@ struct llm_tokenizer_wpm {
if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
continue;
}
- code = to_lower(code);
+ code = unicode_tolower(code);
if (type == CODEPOINT_TYPE_WHITESPACE) {
code = ' ';
}
@@ -11030,7 +11030,7 @@ struct llm_tokenizer_wpm {
std::vector<std::string> words;
while (r < new_str.size()) {
// if is whitespace
- if (isspace(new_str[r])) {
+ if (isspace(new_str[r], std::locale::classic())) {
if (r > l) words.push_back(new_str.substr(l, (r - l)));
l = r + 1;
r = l;
@@ -11044,18 +11044,12 @@ struct llm_tokenizer_wpm {
return words;
}
- uint32_t to_lower(uint32_t code) {
- static const std::locale locale("en_US.UTF-8");
-#if defined(_WIN32)
- if (code > 0xFFFF) {
- return code;
- }
-#endif
- return std::tolower(wchar_t(code), locale);
- }
-
bool is_ascii_punct(uint32_t code) {
- return code < 256 && ispunct(code);
+ if (code > 0xFF) {
+ return false;
+ }
+ auto c = char(static_cast<unsigned char>(code));
+ return ispunct(c, std::locale::classic());
}
bool is_chinese_char(uint32_t cpt) {