llama : more tokenizer fixes (#2810)

* tests : write a Python tokenizer test (wip)
* llama : prefix input text for tokenization with whitespace
* llama : distinguish pieces from decoded text + fix detokenization
* common : add comments
* examples : no longer manually add leading space when tokenizing
* tests : use Python to generate tokenizer tests for C++
* tests : add option to tokenize text files

ggml-ci

* tests : add test-tokenizer-1.py
* llama.cpp : fix LF token
* hellaswag : move the concat space for clarity
* tests : add falcon tests (py + cpp, currently do not pass Unicode)

ggml-ci

* common : temporary separate llama_detokenize calls for SPM and BPE

---------

Co-authored-by: klosax <131523366+klosax@users.noreply.github.com>
author: Georgi Gerganov <ggerganov@gmail.com> 2023-08-27 14:19:19 +0300
committer: GitHub <noreply@github.com> 2023-08-27 14:19:19 +0300
commit: edd4c1481708fcd788b0e423268304fd26e2b125 (patch)
tree: 2e7db62ea4816dc18f2518a08c36b6ea480eff05 /examples/perplexity/perplexity.cpp
parent: 1591e2e590762011b43b10a9b6e04f13f98f2aa5 (diff)
1 files changed, 2 insertions, 2 deletions
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index fd89852d..b596d062 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -392,7 +392,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         hs_data[i].context = prompt_lines[idx*6];
         hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
         for (size_t j=0; j < 4; j++) {
-            hs_data[i].ending[j] = " " + prompt_lines[idx*6+2+j];
+            hs_data[i].ending[j] = prompt_lines[idx*6+2+j];
         }
 
         // Delete the selected random example from the prompt
@@ -417,7 +417,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         size_t context_size = context_embd.size();
 
         for (int i = 0; i < 4; ++i) {
-            ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[i], add_bos);
+            ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + " " + hs_data[task_idx].ending[i], add_bos);
             for (int k = 0; k < int(context_size); ++k) {
                 if (ending_tokens[i][k] != context_embd[k]) {
                     fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n",i,int(task_idx),k);
author	Georgi Gerganov <ggerganov@gmail.com>	2023-08-27 14:19:19 +0300
committer	GitHub <noreply@github.com>	2023-08-27 14:19:19 +0300
commit	edd4c1481708fcd788b0e423268304fd26e2b125 (patch)
tree	2e7db62ea4816dc18f2518a08c36b6ea480eff05 /examples/perplexity/perplexity.cpp
parent	1591e2e590762011b43b10a9b6e04f13f98f2aa5 (diff)