summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Georgi Gerganov <ggerganov@gmail.com>, 2024-04-30 11:05:25 +0300
committer: GitHub <noreply@github.com>, 2024-04-30 11:05:25 +0300
commit: 952d03dbead16e4dbdd1d3458486340673cc2465 (patch)
tree: 5c97cdb347d3ce27d4c315f7b26dd023aabe48cf
parent: 8843a98c2ba97a25e93319a104f9ddfaf83ce4c4 (diff)
convert : use utf8 encoding (#7000)
* convert : use utf8 encoding
* convert : update instructions and warning message
-rw-r--r-- convert-hf-to-gguf-update.py 16
-rwxr-xr-x convert-hf-to-gguf.py 12
2 files changed, 18 insertions, 10 deletions
diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index 1c559c3f..b019c1e3 100644
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -128,7 +128,7 @@ for model in models:
print(f"chkhsh: {chkhsh}")
# print the "pre_tokenizer" content from the tokenizer.json
- with open(f"models/tokenizers/{name}/tokenizer.json", "r") as f:
+ with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
cfg = json.load(f)
pre_tokenizer = cfg["pre_tokenizer"]
print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
@@ -156,15 +156,19 @@ src_func += " print(f\"chkhsh: {chkhsh}\")\n"
src_func += "\n"
src_func += " res = None\n"
src_func += "\n"
-src_func += " # NOTE: if you get an error here, you need to add the model to the if-elif chain below\n"
-src_func += " # don't do this manually - use the convert-hf-to-gguf-update.py script!\n"
+src_func += " # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n"
+src_func += " # or pull the latest version of the model from Huggingface\n"
+src_func += " # don't edit the hashes manually!\n"
src_func += f"{src_ifs}\n"
src_func += " if res is None:\n"
src_func += " print(\"\\n\")\n"
src_func += " print(\"**************************************************************************************\")\n"
src_func += " print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"
-src_func += " print(\"** This means that it was not added yet or you are using an older version.\")\n"
-src_func += " print(\"** Check convert-hf-to-gguf-update.py and update it accordingly.\")\n"
+src_func += " print(\"** There are 2 possible reasons for this:\")\n"
+src_func += " print(\"** - the model has not been added to convert-hf-to-gguf-update.py yet\")\n"
+src_func += " print(\"** - the pre-tokenization config has changed upstream\")\n"
+src_func += " print(\"** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n"
+src_func += " print(\"** ref: https://github.com/ggerganov/llama.cpp/pull/6920\")\n"
src_func += " print(\"**\")\n"
src_func += " print(f\"** chkhsh: {chkhsh}\")\n"
src_func += " print(\"**************************************************************************************\")\n"
@@ -249,7 +253,7 @@ for model in models:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
- with open(f"models/ggml-vocab-{name}.gguf.inp", "w") as f:
+ with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
for text in tests:
f.write(f"{text}")
f.write("\n__ggml_vocab_test__\n")
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index d1b8cef1..2f146d73 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -279,8 +279,9 @@ class Model(ABC):
res = None
- # NOTE: if you get an error here, you need to add the model to the if-elif chain below
- # don't do this manually - use the convert-hf-to-gguf-update.py script!
+ # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
+ # or pull the latest version of the model from Huggingface
+ # don't edit the hashes manually!
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
res = "llama-bpe"
@@ -310,8 +311,11 @@ class Model(ABC):
print("\n")
print("**************************************************************************************")
print("** WARNING: The BPE pre-tokenizer was not recognized!")
- print("** This means that it was not added yet or you are using an older version.")
- print("** Check convert-hf-to-gguf-update.py and update it accordingly.")
+ print("** There are 2 possible reasons for this:")
+ print("** - the model has not been added to convert-hf-to-gguf-update.py yet")
+ print("** - the pre-tokenization config has changed upstream")
+ print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
+ print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
print("**")
print(f"** chkhsh: {chkhsh}")
print("**************************************************************************************")