diff options
author | Kawrakow <48489457+ikawrakow@users.noreply.github.com> | 2023-08-27 16:50:33 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-08-27 16:50:33 +0300 |
commit | 463173a6c0ff353055eb90665794884c888c790f (patch) | |
tree | 4868e5ed0a6924410c91b149a6a630ea75ea06de /llama.cpp | |
parent | eaa13a48ff4136f01c1cdb79cacd61b67ec53095 (diff) |
llama : speedup tokenization (#2831)
* Speedup tokenization
On current master it takes ~3.2 seconds to tokenize
Wikitext. With this change it becomes ~525 ms.
* Fixit: it was missing the piece after the last found occurrence
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 15 |
1 files changed, 10 insertions, 5 deletions
@@ -114,12 +114,17 @@ static size_t utf8_len(char src) {
 }
 
 void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    for (size_t pos = 0; ; pos += replace.length()) {
-        pos = s.find(search, pos);
-        if (pos == std::string::npos) break;
-        s.erase(pos, search.length());
-        s.insert(pos, replace);
+    std::string result;
+    for (size_t pos = 0; ; pos += search.length()) {
+        auto new_pos = s.find(search, pos);
+        if (new_pos == std::string::npos) {
+            result += s.substr(pos, s.size() - pos);
+            break;
+        }
+        result += s.substr(pos, new_pos - pos) + replace;
+        pos = new_pos;
     }
+    s = std::move(result);
 }
 
 static void zeros(std::ofstream & file, size_t n) {