summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorhowlger <github@voormann.de>2023-12-21 18:07:34 +0100
committerGitHub <noreply@github.com>2023-12-21 19:07:34 +0200
commit880e352277fc017df4d5794f0c21c44e1eae2b84 (patch)
tree9764edb3beee2d19df543d3363983bd4a26f3d29
parent66f35a2f48e1965a13835a523e677223dbf148be (diff)
py : open merges file as 'utf-8' (#4566)
Without an explicit encoding, the merges file is opened with the platform's default encoding (cp1252 in the case below), so on Windows converting bling-phi-2-v0 (<https://huggingface.co/llmware/bling-phi-2-v0>) via convert-hf-to-gguf.py fails with the following error:

```
Traceback (most recent call last):
  File "C:\Users\User\git\gguf\convert-hf-to-gguf.py", line 1061, in <module>
    model_instance.set_vocab()
  File "C:\Users\User\git\gguf\convert-hf-to-gguf.py", line 52, in set_vocab
    self._set_vocab_gpt2()
  File "C:\Users\User\git\gguf\convert-hf-to-gguf.py", line 264, in _set_vocab_gpt2
    special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
  File "C:\Users\User\git\gguf\gguf\vocab.py", line 33, in __init__
    self._load(Path(path))
  File "C:\Users\User\git\gguf\gguf\vocab.py", line 81, in _load
    self._try_load_merges_txt(path)
  File "C:\Users\User\git\gguf\gguf\vocab.py", line 95, in _try_load_merges_txt
    for line in fp:
  File "C:\Users\User\miniconda3\envs\gguf\lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 1415: character maps to <undefined>
```
-rw-r--r--gguf-py/gguf/vocab.py2
1 files changed, 1 insertions, 1 deletions
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index 76924d8f..cd194297 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -84,7 +84,7 @@ class SpecialVocab:
merges_file = path / 'merges.txt'
if not merges_file.is_file():
return False
- with open(merges_file, 'r') as fp:
+ with open(merges_file, 'r', encoding = 'utf-8') as fp:
first_line = next(fp, '').strip()
if not first_line.startswith('#'):
fp.seek(0)