From 1d0331c12a2f2a6296b471232bd4e66fbf06e6a1 Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Fri, 22 Mar 2024 19:47:14 +0100
Subject: quantize: options for output and token embedding tensors qtype (#6239)

* quantize: be able to specify the output tensor type

* quantize: be able to specify the token embedding tensor type

---------

Co-authored-by: Iwan Kawrakow
---
 llama.h | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'llama.h')

diff --git a/llama.h b/llama.h
index 7e8ac4b6..74f0e56d 100644
--- a/llama.h
+++ b/llama.h
@@ -275,13 +275,15 @@ extern "C" {
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int32_t nthread;             // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype ftype;      // quantize to this llama_ftype
-        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
-        bool quantize_output_tensor; // quantize output.weight
-        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                   // quantize all tensors to the default type
-        void * imatrix;              // pointer to importance matrix data
+        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;              // quantize to this llama_ftype
+        enum ggml_type output_tensor_type;   // output tensor type
+        enum ggml_type token_embedding_type; // token embeddings tensor type
+        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;         // quantize output.weight
+        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                           // quantize all tensors to the default type
+        void * imatrix;                      // pointer to importance matrix data
     } llama_model_quantize_params;
 
     // grammar types
-- 
cgit v1.2.3
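
The two new enum ggml_type fields let a caller pin output.weight and the token
embeddings to a specific quantization type instead of the one implied by ftype.
A minimal caller sketch, assuming the rest of the llama.h API at this commit
(llama_model_quantize_default_params() and llama_model_quantize() as declared
there); the file names and the Q6_K/Q8_0 overrides below are illustrative
choices, not part of the patch:

    /* quantize_example.c -- hypothetical caller, not part of this commit */
    #include "llama.h"   /* also pulls in ggml.h for the GGML_TYPE_* enums */
    #include <stdio.h>

    int main(void) {
        llama_model_quantize_params params = llama_model_quantize_default_params();

        params.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M; /* base type for most tensors */
        params.output_tensor_type   = GGML_TYPE_Q6_K;            /* override for output.weight */
        params.token_embedding_type = GGML_TYPE_Q8_0;            /* override for token embeddings */

        /* input/output paths are placeholders; returns 0 on success */
        if (llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params) != 0) {
            fprintf(stderr, "quantization failed\n");
            return 1;
        }
        return 0;
    }

Assuming the default-params initializer leaves both new fields unset (the llama.h
hunk alone does not show it), omitting the two override lines should keep the
pre-patch behavior, with both tensors quantized according to ftype.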