quantize : add '--keep-split' to quantize model into shards (#6688)

* Implement '--keep-split' to quantize model into several shards
* Add test script
* Update examples/quantize/quantize.cpp

  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Split model correctly even if tensor id is out-of-order
* Update llama_model_quantize_params
* Fix preci failures

---------

Co-authored-by: z5269887 <z5269887@unsw.edu.au>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
author: jiez <373447296@qq.com> 2024-04-25 18:29:35 +0800
committer: GitHub <noreply@github.com> 2024-04-25 13:29:35 +0300
commit: 1966eb2615242f224bf9ca939db8905ab6a174a0 (patch)
tree: 3da33a1b5f816723e195a4936d44c4bef2eaa06a /examples/quantize/quantize.cpp
parent: 784e11dea1f5ce9638851b2b0dddb107e2a609c8 (diff)
1 files changed, 13 insertions, 2 deletions
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 64cb6db1..da1850df 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -97,6 +97,7 @@ static void usage(const char * executable) {
     printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
     printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+    printf("  --keep-split: will generate quatized model in the same shards as input");
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
     printf("Note: --include-weights and --exclude-weights cannot be used together\n");
@@ -300,6 +301,8 @@ int main(int argc, char ** argv) {
             } else {
                 usage(argv[0]);
             }
+        } else if (strcmp(argv[arg_idx], "--keep-split")) {
+            params.keep_split = true;
         } else {
             usage(argv[0]);
         }
@@ -332,20 +335,28 @@ int main(int argc, char ** argv) {
     std::string fname_out;
 
     std::string ftype_str;
+    std::string suffix = ".gguf";
     if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
         std::string fpath;
         const size_t pos = fname_inp.find_last_of("/\\");
         if (pos != std::string::npos) {
             fpath = fname_inp.substr(0, pos + 1);
         }
-        // export as [inp path]/ggml-model-[ftype].gguf
-        fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
+
+        // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
+        fname_out = fpath + "ggml-model-" + ftype_str;
+        if (!params.keep_split) {
+            fname_out += suffix;
+        }
         arg_idx++;
         if (ftype_str == "COPY") {
             params.only_copy = true;
         }
     } else {
         fname_out = argv[arg_idx];
+        if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
+            fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
+        }
         arg_idx++;
 
         if (argc <= arg_idx) {
author	jiez <373447296@qq.com>	2024-04-25 18:29:35 +0800
committer	GitHub <noreply@github.com>	2024-04-25 13:29:35 +0300
commit	1966eb2615242f224bf9ca939db8905ab6a174a0 (patch)
tree	3da33a1b5f816723e195a4936d44c4bef2eaa06a /examples/quantize/quantize.cpp
parent	784e11dea1f5ce9638851b2b0dddb107e2a609c8 (diff)