From 074bea2eb1f1349a0118239c4152914aecaa1be4 Mon Sep 17 00:00:00 2001
From: Mack Straight <eiz@users.noreply.github.com>
Date: Mon, 20 Mar 2023 03:17:23 -0700
Subject: sentencepiece bpe compatible tokenizer (#252)

* potential out of bounds read

* fix quantize

* style

* Update convert-pth-to-ggml.py

* mild cleanup

* don't need the space-prefixing here rn since main.cpp already does it

* new file magic + version header field

* readme notice

* missing newlines

Co-authored-by: slaren <2141330+slaren@users.noreply.github.com>
---
 convert-pth-to-ggml.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'convert-pth-to-ggml.py')

diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index c1941a81..42f53776 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -60,7 +60,8 @@ def write_header(fout, hparams, ftype):
 
     keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
     values = [
-        0x67676d6c,  # magic: ggml in hex
+        0x67676d66,  # magic: ggmf in hex
+        1, # file version
         *[hparams[key] for key in keys],
         hparams["dim"] // hparams["n_heads"],  # rot (obsolete)
         ftype
@@ -85,6 +86,7 @@ def write_tokens(fout, tokenizer):
             text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
         fout.write(struct.pack("i", len(text)))
         fout.write(text)
+        fout.write(struct.pack("f", tokenizer.get_score(i)))
 
 def process_and_write_variables(fout, model, ftype):
 
-- 
cgit v1.2.3