summary | refs | log | tree | commit | diff
path: root/llama.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llama.cpp')
-rw-r--r-- llama.cpp 11
1 files changed, 6 insertions, 5 deletions
diff --git a/llama.cpp b/llama.cpp
index 62699ce5..a35f07aa 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8947,10 +8947,10 @@ struct llm_tokenizer_wpm {
std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
std::vector<uint32_t> nfd_codepoints;
for (uint32_t code : codepoints) {
- auto it = nfd_map.find(code);
- if (it != nfd_map.end()) {
- for (uint32_t c : it->second) {
- nfd_codepoints.push_back(c);
+ auto it = nfd_map.equal_range(code);
+ if (it.first != it.second) {
+ for (auto jt = it.first; jt != it.second; jt++) {
+ nfd_codepoints.push_back(jt->second);
}
} else {
nfd_codepoints.push_back(code);
@@ -9001,12 +9001,13 @@ struct llm_tokenizer_wpm {
}
uint32_t to_lower(uint32_t code) {
+ static const std::locale locale("en_US.UTF-8");
#if defined(_WIN32)
if (code > 0xFFFF) {
return code;
}
#endif
- return std::tolower(wchar_t(code), std::locale("en_US.UTF-8"));
+ return std::tolower(wchar_t(code), locale);
}
bool is_ascii_punct(uint32_t code) {