summary | refs | log | tree | commit | diff
path: root/llama.h
diff options
context:
space:
mode:
author: Georgi Gerganov <ggerganov@gmail.com> 2023-08-27 14:19:19 +0300
committer: GitHub <noreply@github.com> 2023-08-27 14:19:19 +0300
commit: edd4c1481708fcd788b0e423268304fd26e2b125 (patch)
tree: 2e7db62ea4816dc18f2518a08c36b6ea480eff05 /llama.h
parent: 1591e2e590762011b43b10a9b6e04f13f98f2aa5 (diff)
llama : more tokenizer fixes (#2810)
* tests : write a Python tokenizer test (wip)
* llama : prefix input text for tokenization with whitespace
* llama : distinguish pieces from decoded text + fix detokenization
* common : add comments
* examples : no longer manually add leading space when tokenizing
* tests : use Python to generate tokenizer tests for C++
* tests : add option to tokenize text files

ggml-ci

* tests : add test-tokenizer-1.py
* llama.cpp : fix LF token
* hellaswag : move the concat space for clarity
* tests : add falcon tests (py + cpp, currently do not pass Unicode)

ggml-ci

* common : temporary separate llama_detokenize calls for SPM and BPE

---------

Co-authored-by: klosax <131523366+klosax@users.noreply.github.com>
Diffstat (limited to 'llama.h')
-rw-r--r-- llama.h 10
1 files changed, 6 insertions, 4 deletions
diff --git a/llama.h b/llama.h
index b77dd773..b084fe23 100644
--- a/llama.h
+++ b/llama.h
@@ -381,15 +381,17 @@ extern "C" {
int n_max_tokens,
bool add_bos);
- // Token Id -> String. Uses the vocabulary in the provided context
- // Does not write null terminator to the buffer
- LLAMA_API int llama_token_to_str(
+ // Token Id -> Piece.
+ // Uses the vocabulary in the provided context.
+ // Does not write null terminator to the buffer.
+ // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+ LLAMA_API int llama_token_to_piece(
const struct llama_context * ctx,
llama_token token,
char * buf,
int length);
- LLAMA_API int llama_token_to_str_with_model(
+ LLAMA_API int llama_token_to_piece_with_model(
const struct llama_model * model,
llama_token token,
char * buf,