summaryrefslogtreecommitdiff
path: root/convert-gpt4all-to-ggml.py
diff options
context:
space:
mode:
Diffstat (limited to 'convert-gpt4all-to-ggml.py')
-rw-r--r--convert-gpt4all-to-ggml.py107
1 files changed, 0 insertions, 107 deletions
diff --git a/convert-gpt4all-to-ggml.py b/convert-gpt4all-to-ggml.py
deleted file mode 100644
index b1a5e056..00000000
--- a/convert-gpt4all-to-ggml.py
+++ /dev/null
@@ -1,107 +0,0 @@
-#!/usr/bin/env python3
-
-#
-# TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
-#
-
-# Original by https://github.com/eiz
-# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
-import argparse
-import glob
-import os
-import struct
-import sys
-from sentencepiece import SentencePieceProcessor
-
-HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
-
-def parse_args():
- parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
- parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
- parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
- return parser.parse_args()
-
-def read_header(f_in):
- struct_fmt = "i" * (3 + len(HPARAMS))
- struct_size = struct.calcsize(struct_fmt)
- buf = f_in.read(struct_size)
- return struct.unpack(struct_fmt, buf)
-
-def write_header(f_out, header):
- (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
-
- if magic != 0x67676d6c:
- raise Exception('Invalid file magic. Must be an old style ggml file.')
-
- values = [
- 0x67676d66, # magic: ggml in hex
- 1, # file version
- vocab_size,
- dim,
- multiple_of,
- n_heads,
- n_layers,
- rot,
- ftype
- ]
- f_out.write(struct.pack("i" * len(values), *values))
-
-def write_tokens(fout, tokenizer):
- for i in range(tokenizer.vocab_size()):
- if tokenizer.is_unknown(i):
- text = " \u2047 ".encode()
- elif tokenizer.is_control(i):
- text = b""
- elif tokenizer.is_byte(i):
- piece = tokenizer.id_to_piece(i)
- if len(piece) != 6:
- print(f"Invalid token: {piece}")
- sys.exit(1)
- byte_value = int(piece[3:-1], 16)
- text = struct.pack("B", byte_value)
- else:
- text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
- fout.write(struct.pack("i", len(text)))
- fout.write(text)
- fout.write(struct.pack("f", tokenizer.get_score(i)))
-
- # TODO: GPT4All - add extra <pad> token
- text = "<pad>".encode()
- fout.write(struct.pack("i", len(text)))
- fout.write(text)
- fout.write(struct.pack("f", 0.0))
-
-def read_tokens(f_in, tokenizer):
- for i in range(tokenizer.vocab_size()):
- len_b = f_in.read(4)
- (length,) = struct.unpack("i", len_b)
- f_in.read(length)
-
-def copy_all_data(f_out, f_in):
- while True:
- buf = f_in.read(1024 * 1024)
- if not buf:
- break
- f_out.write(buf)
-
-def convert_one_file(path_in, tokenizer):
- path_tmp = f"{path_in}.tmp"
- path_orig= f"{path_in}.orig"
- print(f"converting {path_in}")
- with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
- write_header(f_out, read_header(f_in))
- read_tokens(f_in, tokenizer)
- write_tokens(f_out, tokenizer)
- copy_all_data(f_out, f_in)
- os.rename(path_in, path_orig)
- os.rename(path_tmp, path_in)
-
-def main():
- args = parse_args()
-
- tokenizer = SentencePieceProcessor(args.tokenizer_model)
-
- convert_one_file(args.gpt4all_model, tokenizer)
-
-if __name__ == "__main__":
- main()