From 233fc1c69f6f415f35363e18a755f9610e89161b Mon Sep 17 00:00:00 2001
From: goerch
Date: Tue, 10 Oct 2023 18:59:52 +0200
Subject: Minor improvements in GPT2 tokenizer (#3567)

* Fixing minor bugs in bpe_gpt2_preprocess

* Don't add bos token in test
---
 tests/test-tokenizer-0-llama.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'tests/test-tokenizer-0-llama.cpp')

diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp
index 91c841f7..39c8d188 100644
--- a/tests/test-tokenizer-0-llama.cpp
+++ b/tests/test-tokenizer-0-llama.cpp
@@ -174,10 +174,8 @@ int main(int argc, char **argv) {
             }
 
             for (const auto & tok : res) {
-                ofs << tok << " ";
+                ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector{tok}) << "'" << std::endl;
             }
-
-            ofs << "\n";
         }
 
         fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
--
cgit v1.2.3
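
For context on the hunk above: the token dump loop now writes one token per line together with its detokenized piece in quotes, instead of all token ids on a single space-separated line. The following standalone sketch reproduces only that output-format change; detokenize_stub is a hypothetical stand-in for llama_detokenize_spm (which needs a llama_context with a loaded model), and the token ids are made up for illustration.

// Standalone sketch of the new dump format (not llama.cpp code).
#include <fstream>
#include <string>
#include <vector>

// Hypothetical stand-in for llama_detokenize_spm: maps a token id to a text piece.
static std::string detokenize_stub(int tok) {
    return "<piece_" + std::to_string(tok) + ">";
}

int main() {
    const std::vector<int> res = {1, 15043, 3186}; // made-up token ids

    std::ofstream ofs("example.tokcpp", std::ios::out | std::ios::binary);

    // Old format: all ids on one line, e.g. "1 15043 3186 ", followed by a newline.
    // New format: one "id 'piece'" pair per line, which makes per-token inspection
    // of the dump easier.
    for (const auto & tok : res) {
        ofs << tok << " '" << detokenize_stub(tok) << "'" << std::endl;
    }
    return 0;
}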