author    Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>  2023-09-01 08:02:48 -0600
committer GitHub <noreply@github.com>  2023-09-01 08:02:48 -0600
commit    5d6f19f16b2173afe2d5c6aee2f5c9fc31038eba (patch)
tree      5e4ea604f958f75f7c63e2ecdd315b934c0c661b /llama.h
parent    0d5893668625456c94bbadfddc53fc69cd51c223 (diff)
Allow quantize to only copy tensors, some other improvements (#2931)
* Allow quantize tool to only copy tensors to allow repackaging models.
* Slightly better logic when requantizing.
* Change help message to go to `stdout`.
Diffstat (limited to 'llama.h')
-rw-r--r--  llama.h  1
1 file changed, 1 insertion, 0 deletions
diff --git a/llama.h b/llama.h
index 6e5e1df6..422f2852 100644
--- a/llama.h
+++ b/llama.h
@@ -164,6 +164,7 @@ extern "C" {
enum llama_ftype ftype; // quantize to this llama_ftype
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
+ bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
} llama_model_quantize_params;
// grammar types
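
For context, a minimal sketch of how the new `only_copy` flag might be used through the C API declared in llama.h. The `llama_model_quantize_default_params()`, `llama_model_quantize()`, `llama_backend_init()` and `llama_backend_free()` calls exist in this header at the time of the commit; the file names below are hypothetical and the exact backend-init step is an assumption about typical usage, not part of this change.

    #include <stdbool.h>
    #include <stdio.h>
    #include "llama.h"

    int main(void) {
        // Initialize the llama.cpp backend (numa = false), as the quantize tool typically does.
        llama_backend_init(false);

        // Start from the library defaults, then request a pure copy.
        // With only_copy set, ftype, allow_requantize and quantize_output_tensor
        // are ignored and the tensors are repackaged unchanged.
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.only_copy = true;

        // Hypothetical input/output file names, for illustration only.
        const char * fname_inp = "model-f16.gguf";
        const char * fname_out = "model-copy.gguf";

        int ret = llama_model_quantize(fname_inp, fname_out, &params);
        if (ret != 0) {
            fprintf(stderr, "copy-only quantize failed: %d\n", ret);
        }

        llama_backend_free();
        return ret;
    }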