From 233fc1c69f6f415f35363e18a755f9610e89161b Mon Sep 17 00:00:00 2001
From: goerch
Date: Tue, 10 Oct 2023 18:59:52 +0200
Subject: Minor improvements in GPT2 tokenizer (#3567)

* Fixing minor bugs in bpe_gpt2_preprocess

* Don't add bos token in test
---
 tests/test-tokenizer-0-llama.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'tests/test-tokenizer-0-llama.py')

diff --git a/tests/test-tokenizer-0-llama.py b/tests/test-tokenizer-0-llama.py
index bc164ee2..078f680b 100644
--- a/tests/test-tokenizer-0-llama.py
+++ b/tests/test-tokenizer-0-llama.py
@@ -81,15 +81,14 @@ fname_tok = args.fname_tok
 if fname_tok:
     print('tokenizing file: ', fname_tok)
     fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r') as f:
+    with open(fname_tok, 'r', encoding='utf-8') as f:
         lines = f.readlines()
         s = ''.join(lines)
         res = tokenizer.encode(s, add_bos=True)
         # write to file
-        with open(fname_out, 'w') as f:
+        with open(fname_out, 'w', encoding='utf-8') as f:
             for x in res:
-                f.write(str(x) + ' ')
-            f.write('\n')
+                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
         print('len(res): ', len(res))
         print('len(lines): ', len(lines))
     print('results written to: ', fname_out)
-- 
cgit v1.2.3