llama : escape all U+2581 in a string (#2750)

author: Georgi Gerganov <ggerganov@gmail.com> 2023-08-24 12:26:01 +0300
committer: GitHub <noreply@github.com> 2023-08-24 12:26:01 +0300
commit: c3e53b421a9910548be0345f85712c535f467a98 (patch)
tree: a32a4e6947831e4d8de17c1b096d143be24b2fc0
parent: 6e91a1b0706c2e0e52b9d9be7ee82d3c1e7a33c1 (diff)
1 files changed, 3 insertions, 6 deletions
diff --git a/llama.cpp b/llama.cpp
index 7cac8a1c..f5526e30 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3004,11 +3004,8 @@ static std::string llama_escape_whitespace(const std::string& text) {
     return result;
 }
 
-static std::string llama_unescape_whitespace(const std::string& word) {
-    if (word.length() >= 3 && word.substr(0, 3) == "\xe2\x96\x81") {
-        return std::string(" ") + word.substr(3);
-    }
-    return word;
+static void llama_unescape_whitespace(std::string & word) {
+    replace_all(word, "\xe2\x96\x81", " ");
 }
 
 struct llm_symbol {
@@ -5822,7 +5819,7 @@ int llama_token_to_str_with_model(const struct llama_model * model, llama_token
         if (llama_is_normal_token(model->vocab, token)) {
             std::string result = model->vocab.id_to_token[token].text;
             if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
-                result = llama_unescape_whitespace(result);
+                llama_unescape_whitespace(result);
             }
             if (length < (int) result.length()) {
                 return -result.length();
author	Georgi Gerganov <ggerganov@gmail.com>	2023-08-24 12:26:01 +0300
committer	GitHub <noreply@github.com>	2023-08-24 12:26:01 +0300
commit	c3e53b421a9910548be0345f85712c535f467a98 (patch)
tree	a32a4e6947831e4d8de17c1b096d143be24b2fc0
parent	6e91a1b0706c2e0e52b9d9be7ee82d3c1e7a33c1 (diff)