llama : optimize long word tokenization with WPM (#8034)

ggml-ci
author: Georgi Gerganov <ggerganov@gmail.com> 2024-06-21 08:51:28 +0300
committer: GitHub <noreply@github.com> 2024-06-21 08:51:28 +0300
commit: a927b0f3dd9a86ee042cd2bdcc8c9da4a855926b (patch)
tree: cb718eba3db4e581a0814d3f0b6a952b7a3218b1 /unicode.cpp
parent: 80ea089d771f0c2d97afa8bead80ded412f600d7 (diff)
1 file changed, 1 insertion, 0 deletions
diff --git a/unicode.cpp b/unicode.cpp
index 913c34b9..c0b76bf2 100644
--- a/unicode.cpp
+++ b/unicode.cpp
@@ -596,6 +596,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
 
 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
     std::vector<uint32_t> result;
+    result.reserve(utf8.size());
     size_t offset = 0;
     while (offset < utf8.size()) {
         result.push_back(unicode_cpt_from_utf8(utf8, offset));
author	Georgi Gerganov <ggerganov@gmail.com>	2024-06-21 08:51:28 +0300
committer	GitHub <noreply@github.com>	2024-06-21 08:51:28 +0300
commit	a927b0f3dd9a86ee042cd2bdcc8c9da4a855926b (patch)
tree	cb718eba3db4e581a0814d3f0b6a952b7a3218b1 /unicode.cpp
parent	80ea089d771f0c2d97afa8bead80ded412f600d7 (diff)