summary | refs | log | tree | commit | diff
path: root/tests/test-tokenizer-0-falcon.py
diff options
context:
space:
mode:
author goerch <jhr.walter@t-online.de> 2023-10-10 18:59:52 +0200
committer GitHub <noreply@github.com> 2023-10-10 18:59:52 +0200
commit 233fc1c69f6f415f35363e18a755f9610e89161b (patch)
tree d949e9cdaa21419b2a03e7eeb81852cd7a5e6240 /tests/test-tokenizer-0-falcon.py
parent c5b49360d0d9e49f32e05a9116e90bd0b39a282d (diff)
Minor improvements in GPT2 tokenizer (#3567)
* Fixing minor bugs in bpe_gpt2_preprocess
* Don't add bos token in test
Diffstat (limited to 'tests/test-tokenizer-0-falcon.py')
-rw-r--r-- tests/test-tokenizer-0-falcon.py 9
1 file changed, 5 insertions, 4 deletions
diff --git a/tests/test-tokenizer-0-falcon.py b/tests/test-tokenizer-0-falcon.py
index 9c8c1c7d..cf65a3f6 100644
--- a/tests/test-tokenizer-0-falcon.py
+++ b/tests/test-tokenizer-0-falcon.py
@@ -41,6 +41,8 @@ tests = [
" Hello",
" Hello",
" Hello\n Hello",
+ "\n =",
+ "' era",
]
for text in tests:
@@ -69,15 +71,14 @@ fname_tok = args.fname_tok
if fname_tok:
print('tokenizing file: ', fname_tok)
fname_out = fname_tok + '.tok'
- with open(fname_tok, 'r') as f:
+ with open(fname_tok, 'r', encoding='utf-8') as f:
lines = f.readlines()
s = ''.join(lines)
res = tokenizer.encode(s)
# write to file
- with open(fname_out, 'w') as f:
+ with open(fname_out, 'w', encoding='utf-8') as f:
for x in res:
- f.write(str(x) + ' ')
- f.write('\n')
+ f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
print('len(res): ', len(res))
print('len(lines): ', len(lines))
print('results written to: ', fname_out)