summary | refs | log | tree | commit | diff
path: root/tests/test-tokenizer-0-llama.cpp
diff options
context:
space:
mode:
author: goerch <jhr.walter@t-online.de> 2023-10-10 18:59:52 +0200
committer: GitHub <noreply@github.com> 2023-10-10 18:59:52 +0200
commit: 233fc1c69f6f415f35363e18a755f9610e89161b (patch)
tree: d949e9cdaa21419b2a03e7eeb81852cd7a5e6240 /tests/test-tokenizer-0-llama.cpp
parent: c5b49360d0d9e49f32e05a9116e90bd0b39a282d (diff)
Minor improvements in GPT2 tokenizer (#3567)
* Fixing minor bugs in bpe_gpt2_preprocess
* Don't add bos token in test
Diffstat (limited to 'tests/test-tokenizer-0-llama.cpp')
-rw-r--r-- tests/test-tokenizer-0-llama.cpp 4
1 files changed, 1 insertions, 3 deletions
diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp
index 91c841f7..39c8d188 100644
--- a/tests/test-tokenizer-0-llama.cpp
+++ b/tests/test-tokenizer-0-llama.cpp
@@ -174,10 +174,8 @@ int main(int argc, char **argv) {
}
for (const auto & tok : res) {
- ofs << tok << " ";
+ ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector<int>{tok}) << "'" << std::endl;
}
-
- ofs << "\n";
}
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());