From 233fc1c69f6f415f35363e18a755f9610e89161b Mon Sep 17 00:00:00 2001
From: goerch <jhr.walter@t-online.de>
Date: Tue, 10 Oct 2023 18:59:52 +0200
Subject: Minor improvements in GPT2 tokenizer (#3567)

* Fixing minor bugs in bpe_gpt2_preprocess

* Don't add bos token in test
---
 tests/test-tokenizer-0-llama.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'tests/test-tokenizer-0-llama.py')

diff --git a/tests/test-tokenizer-0-llama.py b/tests/test-tokenizer-0-llama.py
index bc164ee2..078f680b 100644
--- a/tests/test-tokenizer-0-llama.py
+++ b/tests/test-tokenizer-0-llama.py
@@ -81,15 +81,14 @@ fname_tok = args.fname_tok
 if fname_tok:
     print('tokenizing file: ', fname_tok)
     fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r') as f:
+    with open(fname_tok, 'r', encoding='utf-8') as f:
         lines = f.readlines()
         s = ''.join(lines)
         res = tokenizer.encode(s, add_bos=True)
         # write to file
-        with open(fname_out, 'w') as f:
+        with open(fname_out, 'w', encoding='utf-8') as f:
             for x in res:
-                f.write(str(x) + ' ')
-            f.write('\n')
+                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
         print('len(res): ', len(res))
         print('len(lines): ', len(lines))
     print('results written to: ', fname_out)
-- 
cgit v1.2.3