From edd4c1481708fcd788b0e423268304fd26e2b125 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 27 Aug 2023 14:19:19 +0300
Subject: llama : more tokenizer fixes (#2810)

* tests : write a Python tokenizer test (wip)

* llama : prefix input text for tokenization with whitespace

* llama : distinguish pieces from decoded text + fix detokenization

* common : add comments

* examples : no longer manually add leading space when tokenizing

* tests : use Python to generate tokenizer tests for C++

* tests : add option to tokenize text files

ggml-ci

* tests : add test-tokenizer-1.py

* llama.cpp : fix LF token

* hellaswag : move the concat space for clarity

* tests : add falcon tests (py + cpp, currently do not pass Unicode)

ggml-ci

* common : temporary separate llama_detokenize calls for SPM and BPE

---------

Co-authored-by: klosax <131523366+klosax@users.noreply.github.com>
---
 common/common.h | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'common/common.h')

diff --git a/common/common.h b/common/common.h
index ce61265f..97fda2be 100644
--- a/common/common.h
+++ b/common/common.h
@@ -116,11 +116,31 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 // Vocab utils
 //
 
+// tokenizes a string into a vector of tokens
+// should work similar to Python's `tokenizer.encode`
 std::vector<llama_token> llama_tokenize(
         struct llama_context * ctx,
           const std::string & text,
                         bool   add_bos);
 
-std::string llama_token_to_str(
+// tokenizes a token into a piece
+// should work similar to Python's `tokenizer.id_to_piece`
+std::string llama_token_to_piece(
        const struct llama_context * ctx,
                        llama_token   token);
 
+// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
+//       that takes into account the tokenizer type and decides how to handle the leading space
+//
+// detokenizes a vector of tokens into a string
+// should work similar to Python's `tokenizer.decode`
+// removes the leading space from the first non-BOS token
+std::string llama_detokenize_spm(
+                         llama_context * ctx,
+        const std::vector<llama_token> & tokens);
+
+// detokenizes a vector of tokens into a string
+// should work similar to Python's `tokenizer.decode`
+std::string llama_detokenize_bpe(
+                         llama_context * ctx,
+        const std::vector<llama_token> & tokens);
--
cgit v1.2.3
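
For context, a minimal usage sketch of the API declared in this patch. It is not part of the commit: it assumes `ctx` is an already-initialized `llama_context *` (e.g. obtained via the init helpers in common), that `"common.h"` is on the include path, and that the loaded model uses a SentencePiece tokenizer (a BPE model such as Falcon would use `llama_detokenize_bpe` instead); the helper name `tokenizer_roundtrip` is hypothetical.

#include <cstdio>
#include <string>
#include <vector>

#include "common.h" // declares the vocab utils shown in the patch above

// hypothetical helper: encode a string, print each token's piece, decode back
static void tokenizer_roundtrip(llama_context * ctx) {
    // encode, prepending BOS (similar to Python's tokenizer.encode)
    const std::vector<llama_token> tokens = llama_tokenize(ctx, "Hello world", /*add_bos=*/true);

    // map each token id to its piece (similar to Python's tokenizer.id_to_piece)
    for (const llama_token t : tokens) {
        printf("%6d -> '%s'\n", t, llama_token_to_piece(ctx, t).c_str());
    }

    // decode (similar to Python's tokenizer.decode); the _spm variant strips the
    // leading space from the first non-BOS token, undoing the whitespace that
    // SPM tokenization prefixes to the input text
    const std::string text = llama_detokenize_spm(ctx, tokens);
    printf("detokenized: '%s'\n", text.c_str());
}

Per the TODO in the patch, the two detokenize variants are a temporary split: the intent is a single `llama_detokenize` in the llama.h C-style API that checks the tokenizer type and handles the leading space accordingly.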