From b08e75baea294e366628b898e85c0bd359b58115 Mon Sep 17 00:00:00 2001
From: goerch
Date: Sat, 16 Sep 2023 13:41:33 +0200
Subject: Fixing the last deviations from sentencepiece indicated by
 test-tokenizer-1 (#3170)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Fix for #2721

* Reenable tokenizer test for LLaMa

* Add `console.cpp` dependency

* Fix dependency to `common`

* Fixing wrong fix.

* Make console usage platform specific

Work on compiler warnings.

* Adapting makefile

* Remove trailing whitespace

* Adapting the other parts of the makefile

* Fix typo.

* Fixing the last deviations from sentencepiece indicated by test-tokenizer-1

* Simplify logic

* Add missing change...

* Fix ugly compiler warning

* llama_tokenize should accept strings containing NUL now

* Adding huichen's test case
---
 llama.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index a6502612..0b334b4e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7032,19 +7032,21 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
 int llama_tokenize(
         struct llama_context * ctx,
                   const char * text,
+                          int   text_len,
                  llama_token * tokens,
                          int   n_max_tokens,
                         bool   add_bos) {
-    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+    return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
 }
 
 int llama_tokenize_with_model(
     const struct llama_model * model,
                   const char * text,
+                          int   text_len,
                  llama_token * tokens,
                          int   n_max_tokens,
                         bool   add_bos) {
-    auto res = llama_tokenize_internal(model->vocab, text, add_bos);
+    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
 
     if (n_max_tokens < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
-- 
cgit v1.2.3
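
For readers tracking the API change: below is a minimal usage sketch (an editor's illustration, not part of the patch) of the new explicit-length `llama_tokenize`. The helper name `tokenize_bytes` is hypothetical, and the retry-on-negative-return convention is assumed from llama.cpp's usual behavior around this commit (a negated required size when the output buffer is too small), which the hunk above only hints at via its size guard.

```cpp
#include <string>
#include <vector>

#include "llama.h"

// Hypothetical helper: tokenize a buffer that may contain embedded
// NUL bytes, which a strlen-based caller would have truncated.
static std::vector<llama_token> tokenize_bytes(
        struct llama_context * ctx, const std::string & text, bool add_bos) {
    // text.size() counts embedded '\0' bytes, so the whole buffer is
    // forwarded to llama_tokenize_internal intact.
    std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0));
    int n = llama_tokenize(ctx, text.data(), (int) text.size(),
                           tokens.data(), (int) tokens.size(), add_bos);
    if (n < 0) {
        // Assumed convention: a negative return is the required token
        // count, negated; grow the buffer and tokenize again.
        tokens.resize(-n);
        n = llama_tokenize(ctx, text.data(), (int) text.size(),
                           tokens.data(), (int) tokens.size(), add_bos);
    }
    tokens.resize(n);
    return tokens;
}
```

Passing `text.size()` rather than relying on a NUL terminator is what lets strings containing NUL reach the tokenizer intact, per the commit message.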