From 074bea2eb1f1349a0118239c4152914aecaa1be4 Mon Sep 17 00:00:00 2001 From: Mack Straight Date: Mon, 20 Mar 2023 03:17:23 -0700 Subject: sentencepiece bpe compatible tokenizer (#252) * potential out of bounds read * fix quantize * style * Update convert-pth-to-ggml.py * mild cleanup * don't need the space-prefixing here rn since main.cpp already does it * new file magic + version header field * readme notice * missing newlines Co-authored-by: slaren <2141330+slaren@users.noreply.github.com> --- convert-pth-to-ggml.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'convert-pth-to-ggml.py') diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py index c1941a81..42f53776 100644 --- a/convert-pth-to-ggml.py +++ b/convert-pth-to-ggml.py @@ -60,7 +60,8 @@ def write_header(fout, hparams, ftype): keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"] values = [ - 0x67676d6c, # magic: ggml in hex + 0x67676d66, # magic: ggmf in hex + 1, # file version *[hparams[key] for key in keys], hparams["dim"] // hparams["n_heads"], # rot (obsolete) ftype @@ -85,6 +86,7 @@ def write_tokens(fout, tokenizer): text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8") fout.write(struct.pack("i", len(text))) fout.write(text) + fout.write(struct.pack("f", tokenizer.get_score(i))) def process_and_write_variables(fout, model, ftype): -- cgit v1.2.3