From edd4c1481708fcd788b0e423268304fd26e2b125 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 27 Aug 2023 14:19:19 +0300 Subject: llama : more tokenizer fixes (#2810) * tests : write a Python tokenizer test (wip) * llama : prefix input text for tokenization with whitespace * llama : distinguish pieces from decoded text + fix detokenization * common : add comments * examples : no longer manually add leading space when tokenizing * tests : use Python to generate tokenizer tests for C++ * tests : add option to tokenize text files ggml-ci * tests : add test-tokenizer-1.py * llama.cpp : fix LF token * hellaswag : move the concat space for clarity * tests : add falcon tests (py + cpp, currently do not pass Unicode) ggml-ci * common : temporary separate llama_detokenize calls for SPM and BPE --------- Co-authored-by: klosax <131523366+klosax@users.noreply.github.com> --- common/common.cpp | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) (limited to 'common/common.cpp') diff --git a/common/common.cpp b/common/common.cpp index ff19ec4e..0d91a6a3 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -733,12 +733,12 @@ std::vector llama_tokenize( return result; } -std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) { +std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); - const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size()); + const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size()); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_token_to_str(ctx, token, result.data(), result.size()); + int check = llama_token_to_piece(ctx, token, result.data(), result.size()); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -746,3 +746,36 @@ std::string llama_token_to_str(const struct llama_context * ctx, llama_token tok return std::string(result.data(), result.size()); } + +std::string llama_detokenize_spm(llama_context * ctx, const std::vector & tokens) { + const llama_token bos_id = llama_token_bos(ctx); + + std::string piece; + std::string result; + + for (size_t i = 0; i < tokens.size(); ++i) { + piece = llama_token_to_piece(ctx, tokens[i]); + + // remove the leading space of the first non-BOS token + if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') { + piece = piece.substr(1); + } + + result += piece; + } + + return result; +} + +std::string llama_detokenize_bpe(llama_context * ctx, const std::vector & tokens) { + std::string piece; + std::string result; + + for (size_t i = 0; i < tokens.size(); ++i) { + piece = llama_token_to_piece(ctx, tokens[i]); + + result += piece; + } + + return result; +} -- cgit v1.2.3