author    jiez <373447296@qq.com>    2024-04-25 18:29:35 +0800
committer GitHub <noreply@github.com>    2024-04-25 13:29:35 +0300
commit    1966eb2615242f224bf9ca939db8905ab6a174a0 (patch)
tree      3da33a1b5f816723e195a4936d44c4bef2eaa06a /examples
parent    784e11dea1f5ce9638851b2b0dddb107e2a609c8 (diff)
quantize : add '--keep-split' to quantize model into shards (#6688)
* Implement '--keep-split' to quantize model into several shards
* Add test script
* Update examples/quantize/quantize.cpp
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Split model correctly even if tensor id is out-of-order
* Update llama_model_quantize_params
* Fix preci failures

Co-authored-by: z5269887 <z5269887@unsw.edu.au>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
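A hedged usage sketch of the new flag (the binary path, model names, and three-shard count are illustrative assumptions, not taken from the patch):

    # Quantize a sharded model while keeping the input's shard layout:
    ./quantize --keep-split models/in-00001-of-00003.gguf models/out Q4_K
    # expected result, assuming llama.cpp's usual "-NNNNN-of-NNNNN.gguf" split naming:
    #   models/out-00001-of-00003.gguf ... models/out-00003-of-00003.gguf

    # Without --keep-split, the shards are merged into a single output file:
    ./quantize models/in-00001-of-00003.gguf models/out.gguf Q4_K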
Diffstat (limited to 'examples')
-rw-r--r--  examples/quantize/quantize.cpp  15
-rw-r--r--  examples/quantize/test.sh       65
2 files changed, 78 insertions, 2 deletions
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 64cb6db1..da1850df 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -97,6 +97,7 @@ static void usage(const char * executable) {
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+ printf(" --keep-split: will generate quatized model in the same shards as input");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
@@ -300,6 +301,8 @@ int main(int argc, char ** argv) {
} else {
usage(argv[0]);
}
+ } else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
+ params.keep_split = true;
} else {
usage(argv[0]);
}
@@ -332,20 +335,28 @@ int main(int argc, char ** argv) {
std::string fname_out;
std::string ftype_str;
+ std::string suffix = ".gguf";
if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
std::string fpath;
const size_t pos = fname_inp.find_last_of("/\\");
if (pos != std::string::npos) {
fpath = fname_inp.substr(0, pos + 1);
}
- // export as [inp path]/ggml-model-[ftype].gguf
- fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
+
+ // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
+ fname_out = fpath + "ggml-model-" + ftype_str;
+ if (!params.keep_split) {
+ fname_out += suffix;
+ }
arg_idx++;
if (ftype_str == "COPY") {
params.only_copy = true;
}
} else {
fname_out = argv[arg_idx];
+ if (params.keep_split && fname_out.length() > suffix.length() && fname_out.compare(fname_out.length() - suffix.length(), suffix.length(), suffix) == 0) {
+ fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
+ }
arg_idx++;
if (argc <= arg_idx) {
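To make the output-naming logic above concrete, here is a minimal shell sketch of the same derivation (variable values are illustrative assumptions; the authoritative logic is the C++ above):

    # Sketch of the default output-name rule, assuming an input under models/foo/.
    fname_inp="models/foo/some-model.gguf"
    ftype_str="Q4_K"
    keep_split=1
    fpath="${fname_inp%/*}/"                    # [inp path]
    fname_out="${fpath}ggml-model-${ftype_str}" # extension withheld for now
    if [ "$keep_split" -eq 0 ]; then
        fname_out="${fname_out}.gguf"           # single-file output gets .gguf
    fi
    echo "$fname_out"  # with --keep-split, the shard writer appends its own per-shard suffix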
diff --git a/examples/quantize/test.sh b/examples/quantize/test.sh
new file mode 100644
index 00000000..840f712a
--- /dev/null
+++ b/examples/quantize/test.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+set -eu
+
+if [ $# -lt 1 ]
+then
+ echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
+ echo "example: $0 ../../build/bin ../../tmp"
+ exit 1
+fi
+
+if [ $# -gt 1 ]
+then
+ TMP_DIR=$2
+else
+ TMP_DIR=/tmp
+fi
+
+set -x
+
+SPLIT=$1/gguf-split
+QUANTIZE=$1/quantize
+MAIN=$1/main
+WORK_PATH=$TMP_DIR/quantize
+CUR_DIR=$(pwd)
+
+mkdir -p "$WORK_PATH"
+
+# Clean up in case of previously failed test
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
+
+# 1. Get a model
+(
+ cd $WORK_PATH
+ "$CUR_DIR"/../../scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
+)
+echo PASS
+
+# 2. Split model
+$SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
+echo PASS
+echo
+
+# 3. Requant model with '--keep-split'
+$QUANTIZE --allow-requantize --keep-split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K
+echo PASS
+echo
+
+# 3a. Test that the requantized model loads properly
+$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# 4. Requant model without '--keep-split'
+$QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K
+echo PASS
+echo
+
+# 4a. Test that the requantized model loads properly
+$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# Clean up
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
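For reference, a typical invocation matching the usage message the script prints (the build directory is an assumption about where the binaries were built):

    cd examples/quantize
    ./test.sh ../../build/bin ../../tmp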