author | Kawrakow <48489457+ikawrakow@users.noreply.github.com> | 2024-03-22 19:47:14 +0100
---|---|---
committer | GitHub <noreply@github.com> | 2024-03-22 20:47:14 +0200
commit | 1d0331c12a2f2a6296b471232bd4e66fbf06e6a1 (patch) |
tree | 4417697e55b3a70c97c6655b37491a485a3b9797 /llama.h |
parent | dba1af612926cbd4ebe2d876277af1e3305177e0 (diff) |
quantize: options for output and token embedding tensors qtype (#6239)
* quantize: be able to specify the output tensor type
* quantize: be able to specify the token embedding tensor type
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'llama.h')
-rw-r--r-- | llama.h | 16 |
1 file changed, 9 insertions, 7 deletions
```diff
@@ -275,13 +275,15 @@ extern "C" {
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int32_t nthread;             // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype ftype;      // quantize to this llama_ftype
-        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
-        bool quantize_output_tensor; // quantize output.weight
-        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                   // quantize all tensors to the default type
-        void * imatrix;              // pointer to importance matrix data
+        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;              // quantize to this llama_ftype
+        enum ggml_type output_tensor_type;   // output tensor type
+        enum ggml_type token_embedding_type; // token embeddings tensor type
+        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;         // quantize output.weight
+        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                           // quantize all tensors to the default type
+        void * imatrix;                      // pointer to importance matrix data
     } llama_model_quantize_params;
 
     // grammar types
```
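A minimal sketch of how the two new fields might be used through the existing `llama_model_quantize()` API. The file names and the particular `ggml_type` values chosen here are illustrative, not taken from the commit; `llama_model_quantize_default_params()` and `llama_model_quantize()` are part of the public llama.h API.

```c
// Sketch (not from the commit itself): quantize a model while overriding
// the types of the output and token-embedding tensors via the new fields.
#include "llama.h"

int main(void) {
    llama_model_quantize_params params = llama_model_quantize_default_params();

    params.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M; // base type for most tensors
    params.output_tensor_type   = GGML_TYPE_Q6_K;            // keep output.weight at higher precision
    params.token_embedding_type = GGML_TYPE_Q4_K;            // explicit type for token embeddings

    // Hypothetical file names; returns 0 on success.
    return (int) llama_model_quantize("model-f16.gguf", "model-Q4_K_M.gguf", &params);
}
```

Leaving the two new fields at their defaults should preserve the previous behavior, i.e. the per-tensor types implied by `ftype` alone; the default value appears to act as an "unspecified" sentinel rather than forcing a concrete type.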