diff options
author | Douglas Hanley <thesecretaryofwar@gmail.com> | 2024-03-01 03:15:36 -0600 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-01 11:15:36 +0200 |
commit | 9600d59e010c18f5872580a21734ea1bf1968d04 (patch) | |
tree | 18727401c21ec11498aeabd1543e6a9893da472a /llama.cpp | |
parent | 5cb02b4a012bb16c6c699c0c62c05ffa653eee0f (diff) |
unicode : switch to multimap based nfd_map (#5799)
* switch to multimap-based nfd_map due to compile-time issues
* simplify multimap keys
* don't construct new locale every time
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 11 |
1 files changed, 6 insertions, 5 deletions
@@ -8947,10 +8947,10 @@ struct llm_tokenizer_wpm { std::vector<uint32_t> codepoints = codepoints_from_utf8(text); std::vector<uint32_t> nfd_codepoints; for (uint32_t code : codepoints) { - auto it = nfd_map.find(code); - if (it != nfd_map.end()) { - for (uint32_t c : it->second) { - nfd_codepoints.push_back(c); + auto it = nfd_map.equal_range(code); + if (it.first != it.second) { + for (auto jt = it.first; jt != it.second; jt++) { + nfd_codepoints.push_back(jt->second); } } else { nfd_codepoints.push_back(code); @@ -9001,12 +9001,13 @@ struct llm_tokenizer_wpm { } uint32_t to_lower(uint32_t code) { + static const std::locale locale("en_US.UTF-8"); #if defined(_WIN32) if (code > 0xFFFF) { return code; } #endif - return std::tolower(wchar_t(code), std::locale("en_US.UTF-8")); + return std::tolower(wchar_t(code), locale); } bool is_ascii_punct(uint32_t code) { |