author    akawrykow <142945436+akawrykow@users.noreply.github.com>  2023-09-14 10:19:42 -0700
committer GitHub <noreply@github.com>  2023-09-14 20:19:42 +0300
commit    5c872dbca2c7979b1f6dafc97db0774b8bbf9372 (patch)
tree      183c2ba0e30419495769b21e0b2cc2d3cde11bd2
parent    990a5e226a1a0ac858abe3aa7e5f3b000d4fa665 (diff)
falcon : use stated vocab size (#2914)
-rwxr-xr-x  convert-falcon-hf-to-gguf.py  4
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py
index 6ed2b88c..5d4ad04a 100755
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@@ -137,7 +137,9 @@ with open(tokenizer_json_file, "r", encoding="utf-8") as f:
print("gguf: get gpt2 tokenizer vocab")
-vocab_size = len(tokenizer_json["model"]["vocab"])
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running inference.
+vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
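For context, the selection logic this patch introduces can be sketched in isolation. The snippet below is a minimal, hedged illustration rather than the full conversion script: it assumes, as in the surrounding script, that `hparams` is loaded from the model's `config.json` and `tokenizer_json` from `tokenizer.json`; the `resolve_vocab_size` helper name is invented for the example.

def resolve_vocab_size(model_dir: str) -> int:
    """Prefer the vocab size stated in config.json (hparams); fall back to
    counting entries in tokenizer.json. The stated size is what the model's
    tensors were shaped with, so using it avoids mismatched tensor sizes
    when running inference."""
    import json
    from pathlib import Path

    with open(Path(model_dir) / "config.json", encoding="utf-8") as f:
        hparams = json.load(f)
    with open(Path(model_dir) / "tokenizer.json", encoding="utf-8") as f:
        tokenizer_json = json.load(f)

    if "vocab_size" in hparams:
        return hparams["vocab_size"]
    return len(tokenizer_json["model"]["vocab"])

The fallback to counting `tokenizer_json["model"]["vocab"]` preserves the previous behaviour for models whose config does not state a vocab size.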