From eb34620aeceaf9d9df7fcb19acc17ad41b9f60f8 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 21 Mar 2023 17:29:41 +0200
Subject: Add tokenizer test + revert to C++11 (#355)

* Add test-tokenizer-0 to do a few tokenizations - feel free to expand
* Added option to convert-pth-to-ggml.py script to dump just the vocabulary
* Added ./models/ggml-vocab.bin containing just LLaMA vocab data (used for tests)
* Added utility to load vocabulary file from previous point (temporary implementation)
* Avoid using std::string_view and drop back to C++11 (hope I didn't break something)
* Rename gpt_vocab -> llama_vocab
* All CMake binaries go into ./bin/ now
---
 utils.h | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/utils.h b/utils.h
index 65fe02ba..971cc0e9 100644
--- a/utils.h
+++ b/utils.h
@@ -60,7 +60,7 @@ std::string gpt_random_prompt(std::mt19937 & rng);
 // Vocab utils
 //
 
-struct gpt_vocab {
+struct llama_vocab {
     using id    = int32_t;
     using token = std::string;
 
@@ -74,34 +74,22 @@ void replace(std::string & str, const std::string & needle, const std::string &
 // poor-man's JSON parsing
 std::map<std::string, int32_t> json_parse(const std::string & fname);
 
-// split text into tokens
-//
-// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
-//
-// Regex (Python):
-// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
-//
-// Regex (C++):
-// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
-//
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
+// TODO: temporary until #77 is merged, need this now for some tokenizer tests
+bool llama_vocab_load(const std::string & fname, llama_vocab & vocab);
 
 // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
 // ref: https://github.com/google/sentencepiece
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos);
-
-// load the tokens from encoder.json
-bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
+std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos);
 
 // sample next token given probabilities for each embedding
 //
 // - consider only the top K tokens
 // - from them, consider only the top tokens with cumulative probability > P
 //
-gpt_vocab::id llama_sample_top_p_top_k(
-        const gpt_vocab & vocab,
+llama_vocab::id llama_sample_top_p_top_k(
+        const llama_vocab & vocab,
         const float * logits,
-        std::vector<gpt_vocab::id> & last_n_tokens,
+        std::vector<llama_vocab::id> & last_n_tokens,
         double repeat_penalty,
         int top_k,
         double top_p,
@@ -109,7 +97,7 @@ gpt_vocab::id llama_sample_top_p_top_k(
         std::mt19937 & rng);
 
 // filter to top K tokens from list of logits
-void sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int top_k);
+void sample_top_k(std::vector<std::pair<double, llama_vocab::id>> & logits_id, int top_k);
 
 //
 // Quantization
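
Usage note: a minimal sketch of the new vocab API declared above (llama_vocab_load() plus llama_tokenize()). It assumes this commit's utils.h/utils.cpp are built and linked, and it uses the vocab-only models/ggml-vocab.bin file mentioned in the commit message; the prompt string and the program layout are illustrative, not from the repo.

    // sketch: load the standalone vocab, then tokenize a string
    #include <cstdio>
    #include <string>
    #include <vector>

    #include "utils.h"

    int main() {
        llama_vocab vocab;

        // models/ggml-vocab.bin is the vocab-only file added by this commit
        if (!llama_vocab_load("models/ggml-vocab.bin", vocab)) {
            fprintf(stderr, "failed to load vocab\n");
            return 1;
        }

        // bos = true asks for a leading beginning-of-sentence token
        const std::vector<llama_vocab::id> tokens = llama_tokenize(vocab, "Hello world", true);

        for (size_t i = 0; i < tokens.size(); i++) {
            printf("%d ", (int) tokens[i]);
        }
        printf("\n");

        return 0;
    }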
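
Implementation note: sample_top_k() is only declared here; its definition lives in utils.cpp and is not part of this diff. Below is a sketch of what a top-K filter with this signature typically does (the function name and the llama_id alias are stand-ins, not from the codebase):

    #include <algorithm>
    #include <cstdint>
    #include <utility>
    #include <vector>

    using llama_id = int32_t; // stand-in for llama_vocab::id

    // keep only the top_k highest-logit (logit, token id) pairs
    static void top_k_sketch(std::vector<std::pair<double, llama_id>> & logits_id, int top_k) {
        if (top_k < (int) logits_id.size()) {
            // move the top_k largest logits to the front, sorted descending
            std::partial_sort(
                    logits_id.begin(),
                    logits_id.begin() + top_k,
                    logits_id.end(),
                    [](const std::pair<double, llama_id> & a,
                       const std::pair<double, llama_id> & b) { return a.first > b.first; });
            // discard the tail
            logits_id.resize(top_k);
        }
    }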
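
Related: the comment on llama_sample_top_p_top_k() describes a second, top-P (nucleus) filtering step: among the top-K candidates, keep only the smallest prefix whose cumulative probability passes P. A hedged sketch of just that step, with illustrative names (the real utils.cpp implementation may differ in detail):

    #include <cstddef>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // expects probs_id sorted by probability, descending, and normalized to sum to 1
    static void top_p_sketch(std::vector<std::pair<double, int32_t>> & probs_id, double top_p) {
        double cumsum = 0.0;
        std::size_t keep = probs_id.size();
        for (std::size_t i = 0; i < probs_id.size(); i++) {
            cumsum += probs_id[i].first;
            if (cumsum >= top_p) {
                keep = i + 1; // include the token that crossed the threshold
                break;
            }
        }
        probs_id.resize(keep);
    }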