diff options
Diffstat (limited to 'common')
-rw-r--r-- | common/common.cpp | 39 | ||||
-rw-r--r-- | common/common.h | 22 |
2 files changed, 57 insertions, 4 deletions
diff --git a/common/common.cpp b/common/common.cpp index ff19ec4e..0d91a6a3 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -733,12 +733,12 @@ std::vector<llama_token> llama_tokenize( return result; } -std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) { +std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector<char> result(8, 0); - const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size()); + const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size()); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_token_to_str(ctx, token, result.data(), result.size()); + int check = llama_token_to_piece(ctx, token, result.data(), result.size()); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -746,3 +746,36 @@ std::string llama_token_to_str(const struct llama_context * ctx, llama_token tok return std::string(result.data(), result.size()); } + +std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) { + const llama_token bos_id = llama_token_bos(ctx); + + std::string piece; + std::string result; + + for (size_t i = 0; i < tokens.size(); ++i) { + piece = llama_token_to_piece(ctx, tokens[i]); + + // remove the leading space of the first non-BOS token + if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') { + piece = piece.substr(1); + } + + result += piece; + } + + return result; +} + +std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) { + std::string piece; + std::string result; + + for (size_t i = 0; i < tokens.size(); ++i) { + piece = llama_token_to_piece(ctx, tokens[i]); + + result += piece; + } + + return result; +} diff --git a/common/common.h b/common/common.h index ce61265f..97fda2be 100644 --- a/common/common.h +++ b/common/common.h @@ -116,11 +116,31 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param // Vocab utils // +// tokenizes a string into a vector of tokens +// should work similar to Python's `tokenizer.encode` std::vector<llama_token> llama_tokenize( struct llama_context * ctx, const std::string & text, bool add_bos); -std::string llama_token_to_str( +// tokenizes a token into a piece +// should work similar to Python's `tokenizer.id_to_piece` +std::string llama_token_to_piece( const struct llama_context * ctx, llama_token token); + +// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function +// that takes into account the tokenizer type and decides how to handle the leading space +// +// detokenizes a vector of tokens into a string +// should work similar to Python's `tokenizer.decode` +// removes the leading space from the first non-BOS token +std::string llama_detokenize_spm( + llama_context * ctx, + const std::vector<llama_token> & tokens); + +// detokenizes a vector of tokens into a string +// should work similar to Python's `tokenizer.decode` +std::string llama_detokenize_bpe( + llama_context * ctx, + const std::vector<llama_token> & tokens); |