author    jiez <373447296@qq.com>    2024-04-25 18:29:35 +0800
committer GitHub <noreply@github.com>    2024-04-25 13:29:35 +0300
commit    1966eb2615242f224bf9ca939db8905ab6a174a0 (patch)
tree      3da33a1b5f816723e195a4936d44c4bef2eaa06a /examples
parent    784e11dea1f5ce9638851b2b0dddb107e2a609c8 (diff)
quantize : add '--keep-split' to quantize model into shards (#6688)
* Implement '--keep-split' to quantize model into several shards
* Add test script
* Update examples/quantize/quantize.cpp
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Split model correctly even if tensor id is out-of-order
* Update llama_model_quantize_params
* Fix preci failures

Co-authored-by: z5269887 <z5269887@unsw.edu.au>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
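A hedged usage sketch of the new flag (the binary path, model names, and three-shard count are illustrative assumptions, not taken from the patch):

    # Quantize a sharded model while keeping the input's shard layout:
    ./quantize --keep-split models/in-00001-of-00003.gguf models/out Q4_K
    # expected result, assuming llama.cpp's usual "-NNNNN-of-NNNNN.gguf" split naming:
    #   models/out-00001-of-00003.gguf ... models/out-00003-of-00003.gguf

    # Without --keep-split, the shards are merged into a single output file:
    ./quantize models/in-00001-of-00003.gguf models/out.gguf Q4_K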
Diffstat (limited to 'examples')
-rw-r--r--  examples/quantize/quantize.cpp  15
-rw-r--r--  examples/quantize/test.sh       65
2 files changed, 78 insertions, 2 deletions
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 64cb6db1..da1850df 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -97,6 +97,7 @@ static void usage(const char * executable) {
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+ printf(" --keep-split: will generate quatized model in the same shards as input");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
@@ -300,6 +301,8 @@ int main(int argc, char ** argv) {
} else {
usage(argv[0]);
}
+ } else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
+ params.keep_split = true;
} else {
usage(argv[0]);
}
@@ -332,20 +335,28 @@ int main(int argc, char ** argv) {
std::string fname_out;
std::string ftype_str;
+ std::string suffix = ".gguf";
if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
std::string fpath;
const size_t pos = fname_inp.find_last_of("/\\");
if (pos != std::string::npos) {
fpath = fname_inp.substr(0, pos + 1);
}
- // export as [inp path]/ggml-model-[ftype].gguf
- fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
+
+ // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
+ fname_out = fpath + "ggml-model-" + ftype_str;
+ if (!params.keep_split) {
+ fname_out += suffix;
+ }
arg_idx++;
if (ftype_str == "COPY") {
params.only_copy = true;
}
} else {
fname_out = argv[arg_idx];
+ if (params.keep_split && fname_out.length() > suffix.length() && fname_out.compare(fname_out.length() - suffix.length(), suffix.length(), suffix) == 0) {
+ fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
+ }
arg_idx++;
if (argc <= arg_idx) {
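To make the output-naming logic above concrete, here is a minimal shell sketch of the same derivation (variable values are illustrative assumptions; the authoritative logic is the C++ above):

    # Sketch of the default output-name rule, assuming an input under models/foo/.
    fname_inp="models/foo/some-model.gguf"
    ftype_str="Q4_K"
    keep_split=1
    fpath="${fname_inp%/*}/"                    # [inp path]
    fname_out="${fpath}ggml-model-${ftype_str}" # extension withheld for now
    if [ "$keep_split" -eq 0 ]; then
        fname_out="${fname_out}.gguf"           # single-file output gets .gguf
    fi
    echo "$fname_out"  # with --keep-split, the shard writer appends its own per-shard suffix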
diff --git a/examples/quantize/test.sh b/examples/quantize/test.sh
new file mode 100644
index 00000000..840f712a
--- /dev/null
+++ b/examples/quantize/test.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+set -eu
+
+if [ $# -lt 1 ]
+then
+ echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
+ echo "example: $0 ../../build/bin ../../tmp"
+ exit 1
+fi
+
+if [ $# -gt 1 ]
+then
+ TMP_DIR=$2
+else
+ TMP_DIR=/tmp
+fi
+
+set -x
+
+SPLIT=$1/gguf-split
+QUANTIZE=$1/quantize
+MAIN=$1/main
+WORK_PATH=$TMP_DIR/quantize
+CUR_DIR=$(pwd)
+
+mkdir -p "$WORK_PATH"
+
+# Clean up in case of previously failed test
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
+
+# 1. Get a model
+(
+ cd $WORK_PATH
+ "$CUR_DIR"/../../scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
+)
+echo PASS
+
+# 2. Split model
+$SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
+echo PASS
+echo
+
+# 3. Requant model with '--keep-split'
+$QUANTIZE --allow-requantize --keep-split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K
+echo PASS
+echo
+
+# 3a. Test that the requantized model loads properly
+$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# 4. Requant model without '--keep-split'
+$QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K
+echo PASS
+echo
+
+# 4a. Test that the requantized model loads properly
+$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# Clean up
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
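For reference, a typical invocation matching the usage message the script prints (the build directory is an assumption about where the binaries were built):

    cd examples/quantize
    ./test.sh ../../build/bin ../../tmp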