From eb34620aeceaf9d9df7fcb19acc17ad41b9f60f8 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 21 Mar 2023 17:29:41 +0200
Subject: Add tokenizer test + revert to C++11 (#355)

* Add test-tokenizer-0 to do a few tokenizations - feel free to expand
* Added option to convert-pth-to-ggml.py script to dump just the vocabulary
* Added ./models/ggml-vocab.bin containing just LLaMA vocab data (used for tests)
* Added utility to load vocabulary file from previous point (temporary implementation)
* Avoid using std::string_view and drop back to C++11 (hope I didn't break something)
* Rename gpt_vocab -> llama_vocab
* All CMake binaries go into ./bin/ now
---
 main.cpp | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'main.cpp')

diff --git a/main.cpp b/main.cpp
index 3321818d..e97611e2 100644
--- a/main.cpp
+++ b/main.cpp
@@ -90,7 +90,7 @@ struct llama_model {
 };
 
 // load the model's weights from a file
-bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, ggml_type memory_type = GGML_TYPE_F32) {
+bool llama_model_load(const std::string & fname, llama_model & model, llama_vocab & vocab, int n_ctx, ggml_type memory_type = GGML_TYPE_F32) {
     fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     std::vector<char> f_buf(1024*1024);
@@ -544,9 +544,9 @@ bool llama_eval(
         const llama_model & model,
         const int n_threads,
         const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w,
-              size_t                     & mem_per_token) {
+        const std::vector<llama_vocab::id> & embd_inp,
+              std::vector<float>           & embd_w,
+              size_t                       & mem_per_token) {
     const int N = embd_inp.size();
 
     const auto & hparams = model.hparams;
@@ -832,7 +832,7 @@ int main(int argc, char ** argv) {
 
     int64_t t_load_us = 0;
 
-    gpt_vocab vocab;
+    llama_vocab vocab;
     llama_model model;
 
     // load the model
@@ -864,13 +864,13 @@ int main(int argc, char ** argv) {
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');
     // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
+    std::vector<llama_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
 
     params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
 
     // prefix & suffix for instruct mode
-    const std::vector<gpt_vocab::id> inp_pfx = ::llama_tokenize(vocab, "\n\n### Instruction:\n\n", true);
-    const std::vector<gpt_vocab::id> inp_sfx = ::llama_tokenize(vocab, "\n\n### Response:\n\n", false);
+    const std::vector<llama_vocab::id> inp_pfx = ::llama_tokenize(vocab, "\n\n### Instruction:\n\n", true);
+    const std::vector<llama_vocab::id> inp_sfx = ::llama_tokenize(vocab, "\n\n### Response:\n\n", false);
 
     // in instruct mode, we inject a prefix and a suffix to each input by the user
     if (params.instruct) {
@@ -879,8 +879,8 @@ int main(int argc, char ** argv) {
     }
 
     // tokenize the reverse prompt
-    std::vector<std::vector<gpt_vocab::id>> antipromptv_inp;
-    
+    std::vector<std::vector<llama_vocab::id>> antipromptv_inp;
+
     for (auto antiprompt : params.antiprompt) {
         antipromptv_inp.push_back(::llama_tokenize(vocab, antiprompt, false));
     }
@@ -925,14 +925,14 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
     fprintf(stderr, "\n\n");
 
-    std::vector<gpt_vocab::id> embd;
+    std::vector<llama_vocab::id> embd;
 
     // determine the required inference memory per token:
     size_t mem_per_token = 0;
     llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
 
     int last_n_size = params.repeat_last_n;
-    std::vector<gpt_vocab::id> last_n_tokens(last_n_size);
+    std::vector<llama_vocab::id> last_n_tokens(last_n_size);
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
 
     if (params.interactive) {
@@ -980,7 +980,7 @@ int main(int argc, char ** argv) {
 
         const int n_vocab = model.hparams.n_vocab;
 
-        gpt_vocab::id id = 0;
+        llama_vocab::id id = 0;
 
         {
             const int64_t t_start_sample_us = ggml_time_us();
@@ -1066,7 +1066,7 @@ int main(int argc, char ** argv) {
             } while (another_line);
             if (params.use_color) printf(ANSI_COLOR_RESET);
 
-            std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buffer, false);
+            std::vector<llama_vocab::id> line_inp = ::llama_tokenize(vocab, buffer, false);
             embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
 
             if (params.instruct) {
--
cgit v1.2.3
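
For reference, the rename above means every token id in main.cpp now flows through llama_vocab::id instead of gpt_vocab::id. The sketch below only illustrates that usage pattern: the llama_vocab layout (the id/token typedefs plus the two lookup maps) roughly mirrors the declaration in the tree's utils.h, while the toy whitespace tokenizer is an assumption made up for this example and is not the ::llama_tokenize from this commit.

#include <cstdint>
#include <cstdio>
#include <map>
#include <sstream>
#include <string>
#include <vector>

// Minimal stand-in for the renamed vocabulary type (formerly gpt_vocab).
// Only the id/token typedefs and the two lookup maps are modeled here.
struct llama_vocab {
    using id    = int32_t;
    using token = std::string;

    std::map<token, id> token_to_id;
    std::map<id, token> id_to_token;
};

// Toy whitespace tokenizer - an illustrative assumption, NOT the real
// ::llama_tokenize. It only shows how llama_vocab::id values are produced
// and consumed after the rename.
static std::vector<llama_vocab::id> toy_tokenize(llama_vocab & vocab, const std::string & text) {
    std::vector<llama_vocab::id> ids;
    std::istringstream iss(text);
    std::string word;
    while (iss >> word) {
        auto it = vocab.token_to_id.find(word);
        if (it == vocab.token_to_id.end()) {
            // unseen word: assign the next free id and register it both ways
            const llama_vocab::id id = (llama_vocab::id) vocab.token_to_id.size();
            vocab.token_to_id[word] = id;
            vocab.id_to_token[id]   = word;
            ids.push_back(id);
        } else {
            ids.push_back(it->second);
        }
    }
    return ids;
}

int main() {
    llama_vocab vocab;

    // mirrors the call sites in main.cpp: a prompt string in, a vector of ids out
    const std::vector<llama_vocab::id> embd_inp = toy_tokenize(vocab, " Hello world Hello");

    for (const llama_vocab::id id : embd_inp) {
        printf("%d -> '%s'\n", (int) id, vocab.id_to_token[id].c_str());
    }

    return 0;
}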