From eb34620aeceaf9d9df7fcb19acc17ad41b9f60f8 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 21 Mar 2023 17:29:41 +0200
Subject: Add tokenizer test + revert to C++11 (#355)

* Add test-tokenizer-0 to do a few tokenizations - feel free to expand
* Added option to convert-pth-to-ggml.py script to dump just the vocabulary
* Added ./models/ggml-vocab.bin containing just LLaMA vocab data (used for tests)
* Added utility to load vocabulary file from previous point (temporary implementation)
* Avoid using std::string_view and drop back to C++11 (hope I didn't break something)
* Rename gpt_vocab -> llama_vocab
* All CMake binaries go into ./bin/ now
---
 utils.h | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/utils.h b/utils.h
index 65fe02ba..971cc0e9 100644
--- a/utils.h
+++ b/utils.h
@@ -60,7 +60,7 @@ std::string gpt_random_prompt(std::mt19937 & rng);
 // Vocab utils
 //
 
-struct gpt_vocab {
+struct llama_vocab {
     using id    = int32_t;
     using token = std::string;
 
@@ -74,34 +74,22 @@ void replace(std::string & str, const std::string & needle, const std::string &
 // poor-man's JSON parsing
 std::map<std::string, int32_t> json_parse(const std::string & fname);
 
-// split text into tokens
-//
-// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
-//
-// Regex (Python):
-// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
-//
-// Regex (C++):
-// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
-//
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
+// TODO: temporary until #77 is merged, need this now for some tokenizer tests
+bool llama_vocab_load(const std::string & fname, llama_vocab & vocab);
 
 // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
 // ref: https://github.com/google/sentencepiece
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos);
-
-// load the tokens from encoder.json
-bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
+std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos);
 
 // sample next token given probabilities for each embedding
 //
 // - consider only the top K tokens
 // - from them, consider only the top tokens with cumulative probability > P
 //
-gpt_vocab::id llama_sample_top_p_top_k(
-        const gpt_vocab & vocab,
+llama_vocab::id llama_sample_top_p_top_k(
+        const llama_vocab & vocab,
         const float * logits,
-        std::vector<gpt_vocab::id> & last_n_tokens,
+        std::vector<llama_vocab::id> & last_n_tokens,
         double repeat_penalty,
         int top_k,
         double top_p,
@@ -109,7 +97,7 @@ gpt_vocab::id llama_sample_top_p_top_k(
         std::mt19937 & rng);
 
 // filter to top K tokens from list of logits
-void sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int top_k);
+void sample_top_k(std::vector<std::pair<double, llama_vocab::id>> & logits_id, int top_k);
 
 //
 // Quantization
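
Usage note: a minimal sketch of the new vocab API declared above (llama_vocab_load() plus llama_tokenize()). It assumes this commit's utils.h/utils.cpp are built and linked, and it uses the vocab-only models/ggml-vocab.bin file mentioned in the commit message; the prompt string and the program layout are illustrative, not from the repo.

    // sketch: load the standalone vocab, then tokenize a string
    #include <cstdio>
    #include <string>
    #include <vector>

    #include "utils.h"

    int main() {
        llama_vocab vocab;

        // models/ggml-vocab.bin is the vocab-only file added by this commit
        if (!llama_vocab_load("models/ggml-vocab.bin", vocab)) {
            fprintf(stderr, "failed to load vocab\n");
            return 1;
        }

        // bos = true asks for a leading beginning-of-sentence token
        const std::vector<llama_vocab::id> tokens = llama_tokenize(vocab, "Hello world", true);

        for (size_t i = 0; i < tokens.size(); i++) {
            printf("%d ", (int) tokens[i]);
        }
        printf("\n");

        return 0;
    }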
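
Implementation note: sample_top_k() is only declared here; its definition lives in utils.cpp and is not part of this diff. Below is a sketch of what a top-K filter with this signature typically does (the function name and the llama_id alias are stand-ins, not from the codebase):

    #include <algorithm>
    #include <cstdint>
    #include <utility>
    #include <vector>

    using llama_id = int32_t; // stand-in for llama_vocab::id

    // keep only the top_k highest-logit (logit, token id) pairs
    static void top_k_sketch(std::vector<std::pair<double, llama_id>> & logits_id, int top_k) {
        if (top_k < (int) logits_id.size()) {
            // move the top_k largest logits to the front, sorted descending
            std::partial_sort(
                    logits_id.begin(),
                    logits_id.begin() + top_k,
                    logits_id.end(),
                    [](const std::pair<double, llama_id> & a,
                       const std::pair<double, llama_id> & b) { return a.first > b.first; });
            // discard the tail
            logits_id.resize(top_k);
        }
    }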
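
Related: the comment on llama_sample_top_p_top_k() describes a second, top-P (nucleus) filtering step: among the top-K candidates, keep only the smallest prefix whose cumulative probability passes P. A hedged sketch of just that step, with illustrative names (the real utils.cpp implementation may differ in detail):

    #include <cstddef>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // expects probs_id sorted by probability, descending, and normalized to sum to 1
    static void top_p_sketch(std::vector<std::pair<double, int32_t>> & probs_id, double top_p) {
        double cumsum = 0.0;
        std::size_t keep = probs_id.size();
        for (std::size_t i = 0; i < probs_id.size(); i++) {
            cumsum += probs_id[i].first;
            if (cumsum >= top_p) {
                keep = i + 1; // include the token that crossed the threshold
                break;
            }
        }
        probs_id.resize(keep);
    }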