author    goerch <jhr.walter@t-online.de>  2023-09-13 15:19:44 +0200
committer GitHub <noreply@github.com>      2023-09-13 16:19:44 +0300
commit    71ca2fad7d6c0ef95ef9944fb3a1a843e481f314 (patch)
tree      21bc810807b527d5892e8184d5f1dae0b184e923 /llama.cpp
parent    1b6c650d16048d6427dd502a9627e72837265844 (diff)
whisper : tokenizer fix + re-enable tokenizer test for LLaMa (#3096)
* Fix for #2721
* Reenable tokenizer test for LLaMa
* Add `console.cpp` dependency
* Fix dependency to `common`
* Fixing wrong fix.
* Make console usage platform specific
  Work on compiler warnings.
* Adapting makefile
* Remove trailing whitespace
* Adapting the other parts of the makefile
* Fix typo.
Diffstat (limited to 'llama.cpp')
-rw-r--r-- llama.cpp | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index 2a2a0c9c..cbaf8eda 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3121,10 +3121,9 @@ struct llm_tokenizer_spm {
         while (offs < text.size()) {
             llm_symbol sym;
             size_t len = utf8_len(text[offs]);
-            GGML_ASSERT(offs + len <= text.size());
             sym.text = text.c_str() + offs;
-            sym.n = len;
-            offs += len;
+            sym.n = std::min(len, text.size() - offs);
+            offs += sym.n;
             sym.prev = index - 1;
             sym.next = offs == text.size() ? -1 : index + 1;
             index++;
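Why the clamp matters: utf8_len derives the expected sequence length from the lead byte alone, so when the input ends in the middle of a multi-byte UTF-8 character, offs + len walks past text.size() and the old GGML_ASSERT fired (the crash behind #2721). Clamping sym.n to the bytes actually present lets tokenization proceed on truncated input. A minimal standalone C++ sketch of the behavior, with utf8_len mirroring the lead-byte lookup table used in llama.cpp:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <string>

// Same lookup llama.cpp uses: the high nibble of the lead byte gives
// the expected length of the UTF-8 sequence (1, 2, 3, or 4 bytes).
static size_t utf8_len(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}

int main() {
    // "é" is 0xC3 0xA9 in UTF-8; keep only the lead byte to simulate
    // text that was cut off mid-sequence.
    std::string text = "abc\xC3";
    size_t offs = 3;
    size_t len  = utf8_len(text[offs]);              // 2, from the lead byte alone
    size_t n    = std::min(len, text.size() - offs); // clamped to the 1 byte present
    printf("len=%zu clamped=%zu\n", len, n);         // prints: len=2 clamped=1
    return 0;
}

With the clamp, offs advances by sym.n and lands exactly on text.size(), so the tokenizer loop terminates instead of asserting.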
@@ -6218,7 +6217,7 @@ int llama_tokenize_with_model(
     auto res = llama_tokenize_internal(model->vocab, text, add_bos);

     if (n_max_tokens < (int) res.size()) {
-        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
         return -((int) res.size());
     }
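With the log line commented out, an undersized output buffer is no longer reported as an error; the negative return value, whose magnitude is the required token count, becomes the sole signal, and callers are expected to resize and retry. A caller-side sketch of that convention; the exact llama_tokenize_with_model signature below is assumed from the llama.h of this vintage, not shown in the hunk:

#include <vector>
#include "llama.h" // assumed to declare llama_token and:
//   int llama_tokenize_with_model(const struct llama_model * model,
//                                 const char * text, llama_token * tokens,
//                                 int n_max_tokens, bool add_bos);

static std::vector<llama_token> tokenize(const struct llama_model * model, const char * text, bool add_bos) {
    std::vector<llama_token> tokens(8); // deliberately small first guess
    int n = llama_tokenize_with_model(model, text, tokens.data(), (int) tokens.size(), add_bos);
    if (n < 0) {
        tokens.resize(-n); // -n is the count the call actually needed
        n = llama_tokenize_with_model(model, text, tokens.data(), (int) tokens.size(), add_bos);
    }
    tokens.resize(n); // trim the guess down to the real token count
    return tokens;
}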