From eb34620aeceaf9d9df7fcb19acc17ad41b9f60f8 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 21 Mar 2023 17:29:41 +0200
Subject: Add tokenizer test + revert to C++11 (#355)

* Add test-tokenizer-0 to do a few tokenizations - feel free to expand
* Added option to convert-pth-to-ggml.py script to dump just the vocabulary
* Added ./models/ggml-vocab.bin containing just LLaMA vocab data (used for tests)
* Added utility to load vocabulary file from previous point (temporary implementation)
* Avoid using std::string_view and drop back to C++11 (hope I didn't break something)
* Rename gpt_vocab -> llama_vocab
* All CMake binaries go into ./bin/ now
---
 main.cpp | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'main.cpp')

diff --git a/main.cpp b/main.cpp
index 3321818d..e97611e2 100644
--- a/main.cpp
+++ b/main.cpp
@@ -90,7 +90,7 @@ struct llama_model {
 };
 
 // load the model's weights from a file
-bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, ggml_type memory_type = GGML_TYPE_F32) {
+bool llama_model_load(const std::string & fname, llama_model & model, llama_vocab & vocab, int n_ctx, ggml_type memory_type = GGML_TYPE_F32) {
     fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     std::vector<char> f_buf(1024*1024);
@@ -544,9 +544,9 @@ bool llama_eval(
         const llama_model & model,
         const int n_threads,
         const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w,
-              size_t                     & mem_per_token) {
+        const std::vector<llama_vocab::id> & embd_inp,
+              std::vector<float>           & embd_w,
+              size_t                       & mem_per_token) {
     const int N = embd_inp.size();
 
     const auto & hparams = model.hparams;
@@ -832,7 +832,7 @@ int main(int argc, char ** argv) {
 
     int64_t t_load_us = 0;
 
-    gpt_vocab vocab;
+    llama_vocab vocab;
     llama_model model;
 
     // load the model
@@ -864,13 +864,13 @@ int main(int argc, char ** argv) {
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');
     // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
+    std::vector<llama_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
 
     params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
 
     // prefix & suffix for instruct mode
-    const std::vector<gpt_vocab::id> inp_pfx = ::llama_tokenize(vocab, "\n\n### Instruction:\n\n", true);
-    const std::vector<gpt_vocab::id> inp_sfx = ::llama_tokenize(vocab, "\n\n### Response:\n\n", false);
+    const std::vector<llama_vocab::id> inp_pfx = ::llama_tokenize(vocab, "\n\n### Instruction:\n\n", true);
+    const std::vector<llama_vocab::id> inp_sfx = ::llama_tokenize(vocab, "\n\n### Response:\n\n", false);
 
     // in instruct mode, we inject a prefix and a suffix to each input by the user
     if (params.instruct) {
@@ -879,8 +879,8 @@ int main(int argc, char ** argv) {
     }
 
     // tokenize the reverse prompt
-    std::vector<std::vector<gpt_vocab::id>> antipromptv_inp;
-    
+    std::vector<std::vector<llama_vocab::id>> antipromptv_inp;
+
     for (auto antiprompt : params.antiprompt) {
         antipromptv_inp.push_back(::llama_tokenize(vocab, antiprompt, false));
     }
@@ -925,14 +925,14 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
     fprintf(stderr, "\n\n");
 
-    std::vector<gpt_vocab::id> embd;
+    std::vector<llama_vocab::id> embd;
 
     // determine the required inference memory per token:
     size_t mem_per_token = 0;
     llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
 
     int last_n_size = params.repeat_last_n;
-    std::vector<gpt_vocab::id> last_n_tokens(last_n_size);
+    std::vector<llama_vocab::id> last_n_tokens(last_n_size);
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
 
     if (params.interactive) {
@@ -980,7 +980,7 @@ int main(int argc, char ** argv) {
 
         const int n_vocab = model.hparams.n_vocab;
 
-        gpt_vocab::id id = 0;
+        llama_vocab::id id = 0;
 
         {
             const int64_t t_start_sample_us = ggml_time_us();
@@ -1066,7 +1066,7 @@ int main(int argc, char ** argv) {
             } while (another_line);
             if (params.use_color) printf(ANSI_COLOR_RESET);
 
-            std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buffer, false);
+            std::vector<llama_vocab::id> line_inp = ::llama_tokenize(vocab, buffer, false);
             embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
 
             if (params.instruct) {
--
cgit v1.2.3
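
For reference, the rename above means every token id in main.cpp now flows through llama_vocab::id instead of gpt_vocab::id. The sketch below only illustrates that usage pattern: the llama_vocab layout (the id/token typedefs plus the two lookup maps) roughly mirrors the declaration in the tree's utils.h, while the toy whitespace tokenizer is an assumption made up for this example and is not the ::llama_tokenize from this commit.

#include <cstdint>
#include <cstdio>
#include <map>
#include <sstream>
#include <string>
#include <vector>

// Minimal stand-in for the renamed vocabulary type (formerly gpt_vocab).
// Only the id/token typedefs and the two lookup maps are modeled here.
struct llama_vocab {
    using id    = int32_t;
    using token = std::string;

    std::map<token, id> token_to_id;
    std::map<id, token> id_to_token;
};

// Toy whitespace tokenizer - an illustrative assumption, NOT the real
// ::llama_tokenize. It only shows how llama_vocab::id values are produced
// and consumed after the rename.
static std::vector<llama_vocab::id> toy_tokenize(llama_vocab & vocab, const std::string & text) {
    std::vector<llama_vocab::id> ids;
    std::istringstream iss(text);
    std::string word;
    while (iss >> word) {
        auto it = vocab.token_to_id.find(word);
        if (it == vocab.token_to_id.end()) {
            // unseen word: assign the next free id and register it both ways
            const llama_vocab::id id = (llama_vocab::id) vocab.token_to_id.size();
            vocab.token_to_id[word] = id;
            vocab.id_to_token[id]   = word;
            ids.push_back(id);
        } else {
            ids.push_back(it->second);
        }
    }
    return ids;
}

int main() {
    llama_vocab vocab;

    // mirrors the call sites in main.cpp: a prompt string in, a vector of ids out
    const std::vector<llama_vocab::id> embd_inp = toy_tokenize(vocab, " Hello world Hello");

    for (const llama_vocab::id id : embd_inp) {
        printf("%d -> '%s'\n", (int) id, vocab.id_to_token[id].c_str());
    }

    return 0;
}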