sentencepiece bpe compatible tokenizer (#252)

* potential out of bounds read
* fix quantize
* style
* Update convert-pth-to-ggml.py
* mild cleanup
* don't need the space-prefixing here rn since main.cpp already does it
* new file magic + version header field
* readme notice
* missing newlines

Co-authored-by: slaren <2141330+slaren@users.noreply.github.com>
author: Mack Straight <eiz@users.noreply.github.com> 2023-03-20 03:17:23 -0700
committer: GitHub <noreply@github.com> 2023-03-20 03:17:23 -0700
commit: 074bea2eb1f1349a0118239c4152914aecaa1be4 (patch)
tree: 41ce911ac28d858cabfeff650b10521b30838656 /quantize.cpp
parent: 5cb63e2493c49bc2c3b9b355696e8dc26cdd0380 (diff)
1 files changed, 23 insertions, 1 deletions
diff --git a/quantize.cpp b/quantize.cpp
index 14c7b277..166e9163 100644
--- a/quantize.cpp
+++ b/quantize.cpp
@@ -3,6 +3,7 @@
 #include "utils.h"
 
 #include <cassert>
+#include <cinttypes>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
@@ -63,12 +64,28 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
     {
         uint32_t magic;
         finp.read((char *) &magic, sizeof(magic));
-        if (magic != 0x67676d6c) {
+        if (magic == 0x67676d6c) {
+            fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
+                    __func__, fname_inp.c_str());
+            return false;
+        }
+        if (magic != 0x67676d66) {
             fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
             return false;
         }
 
         fout.write((char *) &magic, sizeof(magic));
+
+        uint32_t format_version;
+        finp.read((char *) &format_version, sizeof(format_version));
+
+        if (format_version != 1) {
+            fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ")\n",
+                    __func__, fname_inp.c_str(), format_version);
+            return false;
+        }
+
+        fout.write((char *) &format_version, sizeof(format_version));
     }
 
     llama_hparams hparams;
@@ -122,8 +139,13 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
             finp.read ((char *) word.data(), len);
             fout.write((char *) word.data(), len);
 
+            float score;
+            finp.read ((char *) &score, sizeof(score));
+            fout.write((char *) &score, sizeof(score));
+
             vocab.token_to_id[word] = i;
             vocab.id_to_token[i] = word;
+            vocab.score[i] = score;
         }
     }
author	Mack Straight <eiz@users.noreply.github.com>	2023-03-20 03:17:23 -0700
committer	GitHub <noreply@github.com>	2023-03-20 03:17:23 -0700
commit	074bea2eb1f1349a0118239c4152914aecaa1be4 (patch)
tree	41ce911ac28d858cabfeff650b10521b30838656 /quantize.cpp
parent	5cb63e2493c49bc2c3b9b355696e8dc26cdd0380 (diff)