From edd4c1481708fcd788b0e423268304fd26e2b125 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 27 Aug 2023 14:19:19 +0300
Subject: llama : more tokenizer fixes (#2810)

* tests : write a Python tokenizer test (wip)

* llama : prefix input text for tokenization with whitespace

* llama : distinguish pieces from decoded text + fix detokenization

* common : add comments

* examples : no longer manually add leading space when tokenizing

* tests : use Python to generate tokenizer tests for C++

* tests : add option to tokenize text files

ggml-ci

* tests : add test-tokenizer-1.py

* llama.cpp : fix LF token

* hellaswag : move the concat space for clarity

* tests : add falcon tests (py + cpp, currently do not pass Unicode)

ggml-ci

* common : temporary separate llama_detokenize calls for SPM and BPE

---------

Co-authored-by: klosax <131523366+klosax@users.noreply.github.com>
---
 common/common.h | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'common/common.h')

diff --git a/common/common.h b/common/common.h
index ce61265f..97fda2be 100644
--- a/common/common.h
+++ b/common/common.h
@@ -116,11 +116,31 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 // Vocab utils
 //
 
+// tokenizes a string into a vector of tokens
+// should work similar to Python's `tokenizer.encode`
 std::vector<llama_token> llama_tokenize(
         struct llama_context * ctx,
           const std::string & text,
                         bool   add_bos);
 
-std::string llama_token_to_str(
+// tokenizes a token into a piece
+// should work similar to Python's `tokenizer.id_to_piece`
+std::string llama_token_to_piece(
        const struct llama_context * ctx,
                        llama_token   token);
 
+// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
+//       that takes into account the tokenizer type and decides how to handle the leading space
+//
+// detokenizes a vector of tokens into a string
+// should work similar to Python's `tokenizer.decode`
+// removes the leading space from the first non-BOS token
+std::string llama_detokenize_spm(
+                         llama_context * ctx,
+        const std::vector<llama_token> & tokens);
+
+// detokenizes a vector of tokens into a string
+// should work similar to Python's `tokenizer.decode`
+std::string llama_detokenize_bpe(
+                         llama_context * ctx,
+        const std::vector<llama_token> & tokens);
--
cgit v1.2.3
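
For context, a minimal usage sketch of the API declared in this patch. It is not part of the commit: it assumes `ctx` is an already-initialized `llama_context *` (e.g. obtained via the init helpers in common), that `"common.h"` is on the include path, and that the loaded model uses a SentencePiece tokenizer (a BPE model such as Falcon would use `llama_detokenize_bpe` instead); the helper name `tokenizer_roundtrip` is hypothetical.

#include <cstdio>
#include <string>
#include <vector>

#include "common.h" // declares the vocab utils shown in the patch above

// hypothetical helper: encode a string, print each token's piece, decode back
static void tokenizer_roundtrip(llama_context * ctx) {
    // encode, prepending BOS (similar to Python's tokenizer.encode)
    const std::vector<llama_token> tokens = llama_tokenize(ctx, "Hello world", /*add_bos=*/true);

    // map each token id to its piece (similar to Python's tokenizer.id_to_piece)
    for (const llama_token t : tokens) {
        printf("%6d -> '%s'\n", t, llama_token_to_piece(ctx, t).c_str());
    }

    // decode (similar to Python's tokenizer.decode); the _spm variant strips the
    // leading space from the first non-BOS token, undoing the whitespace that
    // SPM tokenization prefixes to the input text
    const std::string text = llama_detokenize_spm(ctx, tokens);
    printf("detokenized: '%s'\n", text.c_str());
}

Per the TODO in the patch, the two detokenize variants are a temporary split: the intent is a single `llama_detokenize` in the llama.h C-style API that checks the tokenizer type and handles the leading space accordingly.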