diff options
author | goerch <jhr.walter@t-online.de> | 2023-09-13 15:19:44 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-09-13 16:19:44 +0300 |
commit | 71ca2fad7d6c0ef95ef9944fb3a1a843e481f314 (patch) | |
tree | 21bc810807b527d5892e8184d5f1dae0b184e923 /llama.cpp | |
parent | 1b6c650d16048d6427dd502a9627e72837265844 (diff) |
whisper : tokenizer fix + re-enable tokenizer test for LLaMa (#3096)
* Fix for #2721
* Re-enable tokenizer test for LLaMa
* Add `console.cpp` dependency
* Fix dependency to `common`
* Fixing wrong fix.
* Make console usage platform specific
Work on compiler warnings.
* Adapting makefile
* Remove trailing whitespace
* Adapting the other parts of the makefile
* Fix typo.
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 7 |
1 file changed, 3 insertions, 4 deletions
@@ -3121,10 +3121,9 @@ struct llm_tokenizer_spm { while (offs < text.size()) { llm_symbol sym; size_t len = utf8_len(text[offs]); - GGML_ASSERT(offs + len <= text.size()); sym.text = text.c_str() + offs; - sym.n = len; - offs += len; + sym.n = std::min(len, text.size() - offs); + offs += sym.n; sym.prev = index - 1; sym.next = offs == text.size() ? -1 : index + 1; index++; @@ -6218,7 +6217,7 @@ int llama_tokenize_with_model( auto res = llama_tokenize_internal(model->vocab, text, add_bos); if (n_max_tokens < (int) res.size()) { - LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); + // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); return -((int) res.size()); } |