llama : more tokenizer fixes (#2810)

* tests : write a Python tokenizer test (wip)
* llama : prefix input text for tokenization with whitespace
* llama : distinguish pieces from decoded text + fix detokenization
* common : add comments
* examples : no longer manually add leading space when tokenizing
* tests : use Python to generate tokenizer tests for C++
* tests : add option to tokenize text files
  ggml-ci
* tests : add test-tokenizer-1.py
* llama.cpp : fix LF token
* hellaswag : move the concat space for clarity
* tests : add falcon tests (py + cpp, currently do not pass Unicode)
  ggml-ci
* common : temporary separate llama_detokenize calls for SPM and BPE

---------

Co-authored-by: klosax <131523366+klosax@users.noreply.github.com>
author: Georgi Gerganov <ggerganov@gmail.com> 2023-08-27 14:19:19 +0300
committer: GitHub <noreply@github.com> 2023-08-27 14:19:19 +0300
commit: edd4c1481708fcd788b0e423268304fd26e2b125 (patch)
tree: 2e7db62ea4816dc18f2518a08c36b6ea480eff05 /examples/train-text-from-scratch
parent: 1591e2e590762011b43b10a9b6e04f13f98f2aa5 (diff)
1 files changed, 2 insertions, 2 deletions
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 79b117df..12d15341 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -1964,7 +1964,7 @@ void print_matrix(struct ggml_tensor * probs) {
 
 
 void print_token(struct llama_context * ctx, llama_token token) {
-    printf("%s", llama_token_to_str(ctx, token).c_str());
+    printf("%s", llama_token_to_piece(ctx, token).c_str());
 }
 
 void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
@@ -2202,7 +2202,7 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
         const char * in  = buf.data();
         const char * end = buf.data() + buf.size();
         for (int i = 0; i < (int) out.size(); ++i) {
-            std::string s = llama_token_to_str(lctx, out[i]);
+            std::string s = llama_token_to_piece(lctx, out[i]);
             int len = s.length();
             if (in >= end) {
                 printf("%s: unexpected end of original text.\n", __func__);
author	Georgi Gerganov <ggerganov@gmail.com>	2023-08-27 14:19:19 +0300
committer	GitHub <noreply@github.com>	2023-08-27 14:19:19 +0300
commit	edd4c1481708fcd788b0e423268304fd26e2b125 (patch)
tree	2e7db62ea4816dc18f2518a08c36b6ea480eff05 /examples/train-text-from-scratch
parent	1591e2e590762011b43b10a9b6e04f13f98f2aa5 (diff)