diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2024-06-21 08:51:28 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-06-21 08:51:28 +0300 |
commit | a927b0f3dd9a86ee042cd2bdcc8c9da4a855926b (patch) | |
tree | cb718eba3db4e581a0814d3f0b6a952b7a3218b1 /unicode.cpp | |
parent | 80ea089d771f0c2d97afa8bead80ded412f600d7 (diff) |
llama : optimize long word tokenization with WPM (#8034)
ggml-ci
Diffstat (limited to 'unicode.cpp')
-rw-r--r-- | unicode.cpp | 1 |
1 files changed, 1 insertions, 0 deletions
```diff
diff --git a/unicode.cpp b/unicode.cpp
index 913c34b9..c0b76bf2 100644
--- a/unicode.cpp
+++ b/unicode.cpp
@@ -596,6 +596,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
 
 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
     std::vector<uint32_t> result;
+    result.reserve(utf8.size());
     size_t offset = 0;
     while (offset < utf8.size()) {
         result.push_back(unicode_cpt_from_utf8(utf8, offset));
```