diff options
author | goerch <jhr.walter@t-online.de> | 2023-10-10 18:59:52 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-10 18:59:52 +0200 |
commit | 233fc1c69f6f415f35363e18a755f9610e89161b (patch) | |
tree | d949e9cdaa21419b2a03e7eeb81852cd7a5e6240 /tests | |
parent | c5b49360d0d9e49f32e05a9116e90bd0b39a282d (diff) |
Minor improvements in GPT2 tokenizer (#3567)
* Fixing minor bugs in bpe_gpt2_preprocess
* Don't add bos token in test
Diffstat (limited to 'tests')
-rw-r--r-- | tests/test-tokenizer-0-falcon.cpp | 8 | ||||
-rw-r--r-- | tests/test-tokenizer-0-falcon.py | 9 | ||||
-rw-r--r-- | tests/test-tokenizer-0-llama.cpp | 4 | ||||
-rw-r--r-- | tests/test-tokenizer-0-llama.py | 7 |
4 files changed, 13 insertions, 15 deletions
diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp index 0f3c50bc..a4e9d2b9 100644 --- a/tests/test-tokenizer-0-falcon.cpp +++ b/tests/test-tokenizer-0-falcon.cpp @@ -36,6 +36,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() { { " Hello" , { 258, 23090, }, }, { " Hello" , { 466, 23090, }, }, { " Hello\n Hello" , { 466, 23090, 742, 23090, }, }, + { "\n =" , { 1212, 40, }, }, + { "' era" , { 18, 4932, }, }, }; return _k_tests; @@ -155,7 +157,7 @@ int main(int argc, char **argv) { fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); - const std::vector<llama_token> res = llama_tokenize(ctx, text, true); + const std::vector<llama_token> res = llama_tokenize(ctx, text, false); fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); @@ -169,10 +171,8 @@ int main(int argc, char **argv) { } for (const auto & tok : res) { - ofs << tok << " "; + ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl; } - - ofs << "\n"; } fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); diff --git a/tests/test-tokenizer-0-falcon.py b/tests/test-tokenizer-0-falcon.py index 9c8c1c7d..cf65a3f6 100644 --- a/tests/test-tokenizer-0-falcon.py +++ b/tests/test-tokenizer-0-falcon.py @@ -41,6 +41,8 @@ tests = [ " Hello", " Hello", " Hello\n Hello", + "\n =", + "' era", ] for text in tests: @@ -69,15 +71,14 @@ fname_tok = args.fname_tok if fname_tok: print('tokenizing file: ', fname_tok) fname_out = fname_tok + '.tok' - with open(fname_tok, 'r') as f: + with open(fname_tok, 'r', encoding='utf-8') as f: lines = f.readlines() s = ''.join(lines) res = tokenizer.encode(s) # write to file - with open(fname_out, 'w') as f: + with open(fname_out, 'w', encoding='utf-8') as f: for x in res: - f.write(str(x) + ' ') - f.write('\n') + f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') print('len(res): ', len(res)) print('len(lines): ', len(lines)) print('results written to: ', fname_out) diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp index 91c841f7..39c8d188 100644 --- a/tests/test-tokenizer-0-llama.cpp +++ b/tests/test-tokenizer-0-llama.cpp @@ -174,10 +174,8 @@ int main(int argc, char **argv) { } for (const auto & tok : res) { - ofs << tok << " "; + ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector<int>{tok}) << "'" << std::endl; } - - ofs << "\n"; } fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); diff --git a/tests/test-tokenizer-0-llama.py b/tests/test-tokenizer-0-llama.py index bc164ee2..078f680b 100644 --- a/tests/test-tokenizer-0-llama.py +++ b/tests/test-tokenizer-0-llama.py @@ -81,15 +81,14 @@ fname_tok = args.fname_tok if fname_tok: print('tokenizing file: ', fname_tok) fname_out = fname_tok + '.tok' - with open(fname_tok, 'r') as f: + with open(fname_tok, 'r', encoding='utf-8') as f: lines = f.readlines() s = ''.join(lines) res = tokenizer.encode(s, add_bos=True) # write to file - with open(fname_out, 'w') as f: + with open(fname_out, 'w', encoding='utf-8') as f: for x in res: - f.write(str(x) + ' ') - f.write('\n') + f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') print('len(res): ', len(res)) print('len(lines): ', len(lines)) print('results written to: ', fname_out) |