From edd4c1481708fcd788b0e423268304fd26e2b125 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 27 Aug 2023 14:19:19 +0300
Subject: llama : more tokenizer fixes (#2810)

* tests : write a Python tokenizer test (wip)

* llama : prefix input text for tokenization with whitespace

* llama : distinguish pieces from decoded text + fix detokenization

* common : add comments

* examples : no longer manually add leading space when tokenizing

* tests : use Python to generate tokenizer tests for C++

* tests : add option to tokenize text files

ggml-ci

* tests : add test-tokenizer-1.py

* llama.cpp : fix LF token

* hellaswag : move the concat space for clarity

* tests : add falcon tests (py + cpp, currently do not pass Unicode)

ggml-ci

* common : temporary separate llama_detokenize calls for SPM and BPE

---------

Co-authored-by: klosax <131523366+klosax@users.noreply.github.com>
---
 examples/embedding/embedding.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'examples/embedding/embedding.cpp')

diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 38395c75..93d583b5 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -56,9 +56,6 @@ int main(int argc, char ** argv) {
 
     int n_past = 0;
 
-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
-
     // tokenize the prompt
     auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
 
@@ -67,7 +64,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
         }
         fprintf(stderr, "\n");
     }
-- 
cgit v1.2.3