Minor improvements in GPT2 tokenizer (#3567)

* Fixing minor bugs in bpe_gpt2_preprocess
* Don't add bos token in test
author: goerch <jhr.walter@t-online.de> 2023-10-10 18:59:52 +0200
committer: GitHub <noreply@github.com> 2023-10-10 18:59:52 +0200
commit: 233fc1c69f6f415f35363e18a755f9610e89161b (patch)
tree: d949e9cdaa21419b2a03e7eeb81852cd7a5e6240 /tests/test-tokenizer-0-falcon.cpp
parent: c5b49360d0d9e49f32e05a9116e90bd0b39a282d (diff)
1 file changed, 4 insertions, 4 deletions
diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp
index 0f3c50bc..a4e9d2b9 100644
--- a/tests/test-tokenizer-0-falcon.cpp
+++ b/tests/test-tokenizer-0-falcon.cpp
@@ -36,6 +36,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
         { "   Hello"              , {     258,  23090, }, },
         { "    Hello"             , {     466,  23090, }, },
         { "    Hello\n    Hello"  , {     466,  23090,    742,  23090, }, },
+        { "\n ="                  , {    1212,     40, }, },
+        { "' era"                 , {      18,   4932, }, },
     };
 
     return _k_tests;
@@ -155,7 +157,7 @@ int main(int argc, char **argv) {
 
         fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
 
-        const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
+        const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
 
         fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
 
@@ -169,10 +171,8 @@ int main(int argc, char **argv) {
             }
 
             for (const auto & tok : res) {
-                ofs << tok << " ";
+                ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
             }
-
-            ofs << "\n";
         }
 
         fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
author	goerch <jhr.walter@t-online.de>	2023-10-10 18:59:52 +0200
committer	GitHub <noreply@github.com>	2023-10-10 18:59:52 +0200
commit	233fc1c69f6f415f35363e18a755f9610e89161b (patch)
tree	d949e9cdaa21419b2a03e7eeb81852cd7a5e6240 /tests/test-tokenizer-0-falcon.cpp
parent	c5b49360d0d9e49f32e05a9116e90bd0b39a282d (diff)