diff options
author | howlger <github@voormann.de> | 2023-12-21 18:07:34 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-12-21 19:07:34 +0200 |
commit | 880e352277fc017df4d5794f0c21c44e1eae2b84 (patch) | |
tree | 9764edb3beee2d19df543d3363983bd4a26f3d29 | |
parent | 66f35a2f48e1965a13835a523e677223dbf148be (diff) |
py : open merges file as 'utf-8' (#4566)
Without an explicit encoding, `open()` falls back to the platform's default encoding (cp1252 on Windows), so converting bling-phi-2-v0 (<https://huggingface.co/llmware/bling-phi-2-v0>) via convert-hf-to-gguf.py on Windows fails with the following error:
```
Traceback (most recent call last):
  File "C:\Users\User\git\gguf\convert-hf-to-gguf.py", line 1061, in <module>
    model_instance.set_vocab()
  File "C:\Users\User\git\gguf\convert-hf-to-gguf.py", line 52, in set_vocab
    self._set_vocab_gpt2()
  File "C:\Users\User\git\gguf\convert-hf-to-gguf.py", line 264, in _set_vocab_gpt2
    special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
  File "C:\Users\User\git\gguf\gguf\vocab.py", line 33, in __init__
    self._load(Path(path))
  File "C:\Users\User\git\gguf\gguf\vocab.py", line 81, in _load
    self._try_load_merges_txt(path)
  File "C:\Users\User\git\gguf\gguf\vocab.py", line 95, in _try_load_merges_txt
    for line in fp:
  File "C:\Users\User\miniconda3\envs\gguf\lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 1415: character maps to <undefined>
```
-rw-r--r-- | gguf-py/gguf/vocab.py | 2 |
1 files changed, 1 insertions, 1 deletions
```diff
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index 76924d8f..cd194297 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -84,7 +84,7 @@ class SpecialVocab:
         merges_file = path / 'merges.txt'
         if not merges_file.is_file():
             return False
-        with open(merges_file, 'r') as fp:
+        with open(merges_file, 'r', encoding = 'utf-8') as fp:
             first_line = next(fp, '').strip()
             if not first_line.startswith('#'):
                 fp.seek(0)
```