author    goerch <jhr.walter@t-online.de>  2023-09-13 15:19:44 +0200
committer GitHub <noreply@github.com>      2023-09-13 16:19:44 +0300
commit    71ca2fad7d6c0ef95ef9944fb3a1a843e481f314 (patch)
tree      21bc810807b527d5892e8184d5f1dae0b184e923 /llama.cpp
parent    1b6c650d16048d6427dd502a9627e72837265844 (diff)
whisper : tokenizer fix + re-enable tokenizer test for LLaMa (#3096)
* Fix for #2721
* Reenable tokenizer test for LLaMa
* Add `console.cpp` dependency
* Fix dependency to `common`
* Fixing wrong fix.
* Make console usage platform specific
  Work on compiler warnings.
* Adapting makefile
* Remove trailing whitespace
* Adapting the other parts of the makefile
* Fix typo.
Diffstat (limited to 'llama.cpp')
-rw-r--r-- llama.cpp | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index 2a2a0c9c..cbaf8eda 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3121,10 +3121,9 @@ struct llm_tokenizer_spm {
         while (offs < text.size()) {
             llm_symbol sym;
             size_t len = utf8_len(text[offs]);
-            GGML_ASSERT(offs + len <= text.size());
             sym.text = text.c_str() + offs;
-            sym.n = len;
-            offs += len;
+            sym.n = std::min(len, text.size() - offs);
+            offs += sym.n;
             sym.prev = index - 1;
             sym.next = offs == text.size() ? -1 : index + 1;
             index++;
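Why the clamp matters: utf8_len derives the expected sequence length from the lead byte alone, so when the input ends in the middle of a multi-byte UTF-8 character, offs + len walks past text.size() and the old GGML_ASSERT fired (the crash behind #2721). Clamping sym.n to the bytes actually present lets tokenization proceed on truncated input. A minimal standalone C++ sketch of the behavior, with utf8_len mirroring the lead-byte lookup table used in llama.cpp:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <string>

// Same lookup llama.cpp uses: the high nibble of the lead byte gives
// the expected length of the UTF-8 sequence (1, 2, 3, or 4 bytes).
static size_t utf8_len(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}

int main() {
    // "é" is 0xC3 0xA9 in UTF-8; keep only the lead byte to simulate
    // text that was cut off mid-sequence.
    std::string text = "abc\xC3";
    size_t offs = 3;
    size_t len  = utf8_len(text[offs]);              // 2, from the lead byte alone
    size_t n    = std::min(len, text.size() - offs); // clamped to the 1 byte present
    printf("len=%zu clamped=%zu\n", len, n);         // prints: len=2 clamped=1
    return 0;
}

With the clamp, offs advances by sym.n and lands exactly on text.size(), so the tokenizer loop terminates instead of asserting.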
@@ -6218,7 +6217,7 @@ int llama_tokenize_with_model(
     auto res = llama_tokenize_internal(model->vocab, text, add_bos);

     if (n_max_tokens < (int) res.size()) {
-        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
         return -((int) res.size());
     }
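With the log line commented out, an undersized output buffer is no longer reported as an error; the negative return value, whose magnitude is the required token count, becomes the sole signal, and callers are expected to resize and retry. A caller-side sketch of that convention; the exact llama_tokenize_with_model signature below is assumed from the llama.h of this vintage, not shown in the hunk:

#include <vector>
#include "llama.h" // assumed to declare llama_token and:
//   int llama_tokenize_with_model(const struct llama_model * model,
//                                 const char * text, llama_token * tokens,
//                                 int n_max_tokens, bool add_bos);

static std::vector<llama_token> tokenize(const struct llama_model * model, const char * text, bool add_bos) {
    std::vector<llama_token> tokens(8); // deliberately small first guess
    int n = llama_tokenize_with_model(model, text, tokens.data(), (int) tokens.size(), add_bos);
    if (n < 0) {
        tokens.resize(-n); // -n is the count the call actually needed
        n = llama_tokenize_with_model(model, text, tokens.data(), (int) tokens.size(), add_bos);
    }
    tokens.resize(n); // trim the guess down to the real token count
    return tokens;
}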