diff options
author | jiez <373447296@qq.com> | 2024-04-25 18:29:35 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-25 13:29:35 +0300 |
commit | 1966eb2615242f224bf9ca939db8905ab6a174a0 (patch) | |
tree | 3da33a1b5f816723e195a4936d44c4bef2eaa06a /examples/quantize/quantize.cpp | |
parent | 784e11dea1f5ce9638851b2b0dddb107e2a609c8 (diff) |
quantize : add '--keep-split' to quantize model into shards (#6688)
* Implement '--keep-split' to quantize model into several shards
* Add test script
* Update examples/quantize/quantize.cpp
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Split model correctly even if tensor id is out-of-order
* Update llama_model_quantize_params
* Fix preci failures
---------
Co-authored-by: z5269887 <z5269887@unsw.edu.au>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'examples/quantize/quantize.cpp')
-rw-r--r-- | examples/quantize/quantize.cpp | 15 |
1 files changed, 13 insertions, 2 deletions
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 64cb6db1..da1850df 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -97,6 +97,7 @@ static void usage(const char * executable) { printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); + printf(" --keep-split: will generate quatized model in the same shards as input"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); printf("Note: --include-weights and --exclude-weights cannot be used together\n"); @@ -300,6 +301,8 @@ int main(int argc, char ** argv) { } else { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--keep-split")) { + params.keep_split = true; } else { usage(argv[0]); } @@ -332,20 +335,28 @@ int main(int argc, char ** argv) { std::string fname_out; std::string ftype_str; + std::string suffix = ".gguf"; if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) { std::string fpath; const size_t pos = fname_inp.find_last_of("/\\"); if (pos != std::string::npos) { fpath = fname_inp.substr(0, pos + 1); } - // export as [inp path]/ggml-model-[ftype].gguf - fname_out = fpath + "ggml-model-" + ftype_str + ".gguf"; + + // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting + fname_out = fpath + "ggml-model-" + ftype_str; + if (!params.keep_split) { + fname_out += suffix; + } arg_idx++; if (ftype_str == "COPY") { params.only_copy = true; } } else { fname_out = argv[arg_idx]; + if (params.keep_split && fname_out.find(suffix) != std::string::npos) { + fname_out = fname_out.substr(0, fname_out.length() - suffix.length()); + } arg_idx++; if (argc <= arg_idx) { |