From 233fc1c69f6f415f35363e18a755f9610e89161b Mon Sep 17 00:00:00 2001 From: goerch Date: Tue, 10 Oct 2023 18:59:52 +0200 Subject: Minor improvements in GPT2 tokenizer (#3567) * Fixing minor bugs in bpe_gpt2_preprocess * Don't add bos token in test --- tests/test-tokenizer-0-falcon.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tests/test-tokenizer-0-falcon.cpp') diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp index 0f3c50bc..a4e9d2b9 100644 --- a/tests/test-tokenizer-0-falcon.cpp +++ b/tests/test-tokenizer-0-falcon.cpp @@ -36,6 +36,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() { { " Hello" , { 258, 23090, }, }, { " Hello" , { 466, 23090, }, }, { " Hello\n Hello" , { 466, 23090, 742, 23090, }, }, + { "\n =" , { 1212, 40, }, }, + { "' era" , { 18, 4932, }, }, }; return _k_tests; @@ -155,7 +157,7 @@ int main(int argc, char **argv) { fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); - const std::vector<llama_token> res = llama_tokenize(ctx, text, true); + const std::vector<llama_token> res = llama_tokenize(ctx, text, false); fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); @@ -169,10 +171,8 @@ int main(int argc, char **argv) { } for (const auto & tok : res) { - ofs << tok << " "; + ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl; } - - ofs << "\n"; } fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); -- cgit v1.2.3