summary | refs | log | tree | commit | diff
path: root/unicode.cpp
diff options
context:
space:
mode:
author: Georgi Gerganov <ggerganov@gmail.com> 2024-06-21 08:51:28 +0300
committer: GitHub <noreply@github.com> 2024-06-21 08:51:28 +0300
commit: a927b0f3dd9a86ee042cd2bdcc8c9da4a855926b (patch)
tree: cb718eba3db4e581a0814d3f0b6a952b7a3218b1 /unicode.cpp
parent: 80ea089d771f0c2d97afa8bead80ded412f600d7 (diff)
llama : optimize long word tokenization with WPM (#8034)
ggml-ci
Diffstat (limited to 'unicode.cpp')
-rw-r--r-- unicode.cpp 1
1 files changed, 1 insertions, 0 deletions
diff --git a/unicode.cpp b/unicode.cpp
index 913c34b9..c0b76bf2 100644
--- a/unicode.cpp
+++ b/unicode.cpp
@@ -596,6 +596,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
std::vector<uint32_t> result;
+ result.reserve(utf8.size());
size_t offset = 0;
while (offset < utf8.size()) {
result.push_back(unicode_cpt_from_utf8(utf8, offset));