From 71ca2fad7d6c0ef95ef9944fb3a1a843e481f314 Mon Sep 17 00:00:00 2001 From: goerch Date: Wed, 13 Sep 2023 15:19:44 +0200 Subject: whisper : tokenizer fix + re-enable tokenizer test for LLaMa (#3096) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix for #2721 * Re-enable tokenizer test for LLaMa * Add `console.cpp` dependency * Fix dependency on `common` * Fixing wrong fix. * Make console usage platform specific Work on compiler warnings. * Adapting makefile * Remove trailing whitespace * Adapting the other parts of the makefile * Fix typo. --- llama.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'llama.cpp') diff --git a/llama.cpp b/llama.cpp index 2a2a0c9c..cbaf8eda 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3121,10 +3121,9 @@ struct llm_tokenizer_spm { while (offs < text.size()) { llm_symbol sym; size_t len = utf8_len(text[offs]); - GGML_ASSERT(offs + len <= text.size()); sym.text = text.c_str() + offs; - sym.n = len; - offs += len; + sym.n = std::min(len, text.size() - offs); + offs += sym.n; sym.prev = index - 1; sym.next = offs == text.size() ? -1 : index + 1; index++; @@ -6218,7 +6217,7 @@ int llama_tokenize_with_model( auto res = llama_tokenize_internal(model->vocab, text, add_bos); if (n_max_tokens < (int) res.size()) { - LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); + // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); return -((int) res.size()); } -- cgit v1.2.3