summaryrefslogtreecommitdiff
path: root/examples/quantize/quantize.cpp
diff options
context:
space:
mode:
authorKerfuffle <44031344+KerfuffleV2@users.noreply.github.com>2023-09-01 08:02:48 -0600
committerGitHub <noreply@github.com>2023-09-01 08:02:48 -0600
commit5d6f19f16b2173afe2d5c6aee2f5c9fc31038eba (patch)
tree5e4ea604f958f75f7c63e2ecdd315b934c0c661b /examples/quantize/quantize.cpp
parent0d5893668625456c94bbadfddc53fc69cd51c223 (diff)
Allow quantize to only copy tensors, some other improvements (#2931)
* Allow quantize tool to only copy tensors to allow repackaging models. * Slightly better logic when requantizing. * Change help message to go to `stdout`.
Diffstat (limited to 'examples/quantize/quantize.cpp')
-rw-r--r--examples/quantize/quantize.cpp24
1 file changed, 19 insertions(+), 5 deletions(-)
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index df9a214f..c174be06 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -35,6 +35,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
+ // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
+ { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
};
@@ -71,12 +73,17 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
//
void usage(const char * executable) {
- fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
- fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
- fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
- fprintf(stderr, "\nAllowed quantization types:\n");
+ printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+ printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
+ printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
+ printf("\nAllowed quantization types:\n");
for (auto & it : QUANT_OPTIONS) {
- printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
+ if (it.name != "COPY") {
+ printf(" %2d or ", it.ftype);
+ } else {
+ printf(" ");
+ }
+ printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
}
exit(1);
}
@@ -121,6 +128,9 @@ int main(int argc, char ** argv) {
// export as [inp path]/ggml-model-[ftype].gguf
fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
arg_idx++;
+ if (ftype_str == "COPY") {
+ params.only_copy = true;
+ }
}
else {
fname_out = argv[arg_idx];
@@ -133,6 +143,10 @@ int main(int argc, char ** argv) {
if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
return 1;
+ } else {
+ if (ftype_str == "COPY") {
+ params.only_copy = true;
+ }
}
arg_idx++;
}