summary refs log tree commit diff
path: root/llama.h
diff options
context:
space:
mode:
Diffstat (limited to 'llama.h')
-rw-r--r-- llama.h 16
1 files changed, 9 insertions, 7 deletions
diff --git a/llama.h b/llama.h
index 7e8ac4b6..74f0e56d 100644
--- a/llama.h
+++ b/llama.h
@@ -275,13 +275,15 @@ extern "C" {
// model quantization parameters
typedef struct llama_model_quantize_params {
- int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
- enum llama_ftype ftype; // quantize to this llama_ftype
- bool allow_requantize; // allow quantizing non-f32/f16 tensors
- bool quantize_output_tensor; // quantize output.weight
- bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
- bool pure; // quantize all tensors to the default type
- void * imatrix; // pointer to importance matrix data
+ int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+ enum llama_ftype ftype; // quantize to this llama_ftype
+ enum ggml_type output_tensor_type; // output tensor type
+ enum ggml_type token_embedding_type; // token embeddings tensor type
+ bool allow_requantize; // allow quantizing non-f32/f16 tensors
+ bool quantize_output_tensor; // quantize output.weight
+ bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+ bool pure; // quantize all tensors to the default type
+ void * imatrix; // pointer to importance matrix data
} llama_model_quantize_params;
// grammar types