Diffstat (limited to 'convert-falcon-hf-to-gguf.py')
-rwxr-xr-x  convert-falcon-hf-to-gguf.py | 168
 1 file changed, 75 insertions(+), 93 deletions(-)
diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py
index 168bcf17..0fdea70e 100755
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@@ -8,6 +8,7 @@ import struct
import json
import numpy as np
import torch
+import argparse
from typing import Any, List
from pathlib import Path
@@ -32,11 +33,10 @@ def bytes_to_unicode():
bs.append(b)
cs.append(2**8+n)
n += 1
- cs = [chr(n) for n in cs]
- return dict(zip(bs, cs))
+ return dict(zip(bs, (chr(n) for n in cs)))
-def count_model_parts(dir_model: str) -> int:
+def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
@@ -47,16 +47,21 @@ def count_model_parts(dir_model: str) -> int:
return num_parts
-if len(sys.argv) < 3:
- print(f"Usage: python {sys.argv[0]} dir-model ftype\n")
- print(" ftype == 0 -> float32")
- print(" ftype == 1 -> float16")
- sys.exit(1)
+def parse_args() -> argparse.Namespace:
+ parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
+ parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+ parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
+ parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
+ parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
+ return parser.parse_args()
+args = parse_args()
-# output in the same directory as the model
-dir_model = sys.argv[1]
-last_dir = os.path.basename(os.path.normpath(dir_model))
+dir_model = args.model
+ftype = args.ftype
+if not dir_model.is_dir():
+ print(f'Error: {args.model} is not a directory', file=sys.stderr)
+ sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
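# A sketch of the resulting CLI (paths are illustrative placeholders):
#
#   python convert-falcon-hf-to-gguf.py /path/to/falcon-7b 1
#       -> /path/to/falcon-7b/ggml-model-f16.gguf
#   python convert-falcon-hf-to-gguf.py --vocab-only --outfile vocab.gguf /path/to/falcon-7b
#       -> vocab.gguf with tokenizer data only (ftype falls back to its default of 1)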
@@ -65,25 +70,21 @@ last_dir = os.path.basename(os.path.normpath(dir_model))
# map from ftype to string
ftype_str = ["f32", "f16"]
-ftype = 1
-if len(sys.argv) > 2:
- ftype = int(sys.argv[2])
- if ftype < 0 or ftype > 1:
- print("Invalid ftype: " + str(ftype))
-
- sys.exit(1)
-
-fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+if args.outfile is not None:
+ fname_out = args.outfile
+else:
+ # output in the same directory as the model by default
+ fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
-print("gguf: loading model "+last_dir)
+print("gguf: loading model "+dir_model.name)
-with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "RWForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
- sys.exit()
+ sys.exit(1)
# get number of model parts
num_parts = count_model_parts(dir_model)
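# count_model_parts() (defined above) counts the "pytorch_model-*" shards in
# the model directory: a checkpoint split into pytorch_model-00001-of-00002.bin
# and pytorch_model-00002-of-00002.bin gives num_parts == 2, while a single
# pytorch_model.bin gives 0, which is treated as a special case further down.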
@@ -113,77 +114,58 @@ gguf_writer.add_file_type(ftype)
print("gguf: get tokenizer metadata")
-tokens: List[str] = []
+tokens: List[bytearray] = []
scores: List[float] = []
toktypes: List[int] = []
-merges: List[str] = []
-
-
-if Path(dir_model + "/tokenizer.json").is_file():
- # gpt2 tokenizer
- gguf_writer.add_tokenizer_model("gpt2")
- print("gguf: get gpt2 tokenizer merges")
-
- with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
- tokenizer_json = json.load(f)
- merges = tokenizer_json["model"]["merges"]
-
- gguf_writer.add_token_merges(merges)
-
- print("gguf: get gpt2 tokenizer vocab")
-
- vocab_size = len(tokenizer_json["model"]["vocab"])
-
- # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
- tokenizer = AutoTokenizer.from_pretrained(dir_model)
-
- reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
- byte_encoder = bytes_to_unicode()
- byte_decoder = {v: k for k, v in byte_encoder.items()}
+tokenizer_json_file = dir_model / 'tokenizer.json'
+if not tokenizer_json_file.is_file():
+ print(f'Error: Missing {tokenizer_json_file}', file=sys.stderr)
+ sys.exit(1)
- for i in range(vocab_size):
- if i in reverse_vocab:
- try:
- text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
- except KeyError:
- text = bytearray()
- for c in reverse_vocab[i]:
- if ord(c) < 256: # single byte character
- text.append(byte_decoder[ord(c)])
- else: # multibyte special token character
- text.extend(c.encode('utf-8'))
- else:
- print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
- pad_token = f"[PAD{i}]".encode("utf8")
- text = bytearray(pad_token)
+# gpt2 tokenizer
+gguf_writer.add_tokenizer_model("gpt2")
- tokens.append(text)
- scores.append(0.0) # dymmy
- toktypes.append(gguf.TokenType.NORMAL) # dummy
+with open(tokenizer_json_file, "r", encoding="utf-8") as f:
+ tokenizer_json = json.load(f)
- gguf_writer.add_token_list(tokens)
- gguf_writer.add_token_scores(scores)
- gguf_writer.add_token_types(toktypes)
+print("gguf: get gpt2 tokenizer vocab")
-print("gguf: get special token ids")
-# Look for special tokens in config.json
+vocab_size = len(tokenizer_json["model"]["vocab"])
-if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
- gguf_writer.add_bos_token_id(hparams["bos_token_id"])
+# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+tokenizer = AutoTokenizer.from_pretrained(dir_model)
-if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
- gguf_writer.add_eos_token_id(hparams["eos_token_id"])
+reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+byte_encoder = bytes_to_unicode()
+byte_decoder = {v: k for k, v in byte_encoder.items()}
-if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
- gguf_writer.add_unk_token_id(hparams["unk_token_id"])
+for i in range(vocab_size):
+ if i in reverse_vocab:
+ try:
+ text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+ except KeyError:
+ text = bytearray()
+ for c in reverse_vocab[i]:
+ if ord(c) < 256: # single byte character
+ text.append(byte_decoder[ord(c)])
+ else: # multibyte special token character
+ text.extend(c.encode('utf-8'))
+ else:
+ print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+ pad_token = f"[PAD{i}]".encode("utf8")
+ text = bytearray(pad_token)
-if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
- gguf_writer.add_sep_token_id(hparams["sep_token_id"])
+ tokens.append(text)
+ scores.append(0.0) # dummy
+ toktypes.append(gguf.TokenType.NORMAL) # dummy
-if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
- gguf_writer.add_pad_token_id(hparams["pad_token_id"])
+gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)
+special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+special_vocab.add_to_gguf(gguf_writer)
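# What the decoding loop above does, as a self-contained round trip through
# the GPT-2 byte<->unicode mapping (bytes_to_unicode() is the helper at the
# top of this script; the token is illustrative):
#
#   byte_encoder = bytes_to_unicode()                  # byte value -> printable char
#   byte_decoder = {v: k for k, v in byte_encoder.items()}
#
#   tok = "".join(byte_encoder[b] for b in " Hello".encode("utf-8"))
#   # tok == "ĠHello": GPT-2 renders the leading space as 'Ġ' (U+0120)
#   raw = bytearray(byte_decoder[c] for c in tok)
#   assert raw == bytearray(b" Hello")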
# TENSORS
@@ -199,15 +181,17 @@ head_dim = hparams["hidden_size"] // n_head
print("gguf: get tensor metadata")
if num_parts == 0:
- part_names = ("pytorch_model.bin",)
+ part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
+ if args.vocab_only:
+ break
print("gguf: loading model part '" + part_name + "'")
- model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+ model_part = torch.load(dir_model / part_name, map_location="cpu")
for name in model_part.keys():
data = model_part[name]
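# For sharded checkpoints the generator above yields zero-padded names, e.g.
# with num_parts == 2:
#
#   pytorch_model-00001-of-00002.bin
#   pytorch_model-00002-of-00002.bin
#
# Wrapping the single-file case in iter() gives both branches the same lazy,
# single-pass iterator type.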
@@ -238,11 +222,8 @@ for part_name in part_names:
data = data.squeeze().numpy()
# map tensor names
- if name.endswith(".weight") and name[:-7] in tensor_map:
- name = tensor_map[name[:-7]] + ".weight"
- elif name.endswith(".bias") and name[:-5] in tensor_map:
- name = tensor_map[name[:-5]] + ".bias"
- else:
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+ if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
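# tensor_map.get_name() subsumes the suffix handling removed above; an
# illustrative stand-in (not the gguf library's actual implementation):
#
#   def get_name(mapping, name, try_suffixes=(".weight", ".bias")):
#       for suffix in try_suffixes:
#           if name.endswith(suffix) and name[: -len(suffix)] in mapping:
#               return mapping[name[: -len(suffix)]] + suffix
#       return None  # the caller treats None as "cannot map" and exits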
@@ -261,19 +242,20 @@ for part_name in part_names:
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
- print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+ print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
- gguf_writer.add_tensor(name, data)
+ gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
-print("gguf: write tensors")
-gguf_writer.write_tensors_to_file()
+if not args.vocab_only:
+ print("gguf: write tensors")
+ gguf_writer.write_tensors_to_file()
gguf_writer.close()
-print("gguf: model successfully exported to '" + fname_out + "'")
+print(f"gguf: model successfully exported to '{fname_out}'")
print("")