author | Nexes the Elder <124105151+Nexesenex@users.noreply.github.com> | 2024-10-18 09:48:15 +0200
committer | GitHub <noreply@github.com> | 2024-10-18 09:48:15 +0200
commit | 03cabe15401adba617c672ce8708a3501be8f15c (patch)
tree | eda129c338de73c9c8e21a9b70eb9dc02480b3ad /include/llama.h
parent | 76b97c80645362ac65a2e33043fd8d46bdaf8c56 (diff)
CLI - Specify GGML_TYPE to quantize for the main tensors. (#91)
To complement token_embd.weight and output.weight:
attn_v.weight
attn_k.weight
attn_q.weight
attn_output.weight
attn_qkv.weight
ffn_gate.weight
ffn_down.weight
ffn_up.weight
Diffstat (limited to 'include/llama.h')
-rw-r--r-- | include/llama.h | 8
1 file changed, 8 insertions, 0 deletions
diff --git a/include/llama.h b/include/llama.h
index 133c2f0e..b2906693 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -361,6 +361,14 @@ extern "C" {
         enum llama_ftype ftype;              // quantize to this llama_ftype
         enum ggml_type output_tensor_type;   // output tensor type
         enum ggml_type token_embedding_type; // token embeddings tensor type
+        enum ggml_type attn_q_type;          // attention query tensor type
+        enum ggml_type attn_k_type;          // attention key tensor type
+        enum ggml_type attn_v_type;          // attention value tensor type
+        enum ggml_type attn_qkv_type;        // attention query-key-value tensor type
+        enum ggml_type attn_output_type;     // attention output tensor type
+        enum ggml_type ffn_gate_type;        // feedforward network gate type
+        enum ggml_type ffn_down_type;        // feedforward network down type
+        enum ggml_type ffn_up_type;          // feedforward network up type
         bool allow_requantize;               // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor;         // quantize output.weight
         bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
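
Usage sketch (not part of this commit): the new fields can be set through the existing C quantization API, alongside the base ftype. The example below assumes the stock llama_model_quantize() and llama_model_quantize_default_params() entry points; the model paths and the concrete GGML_TYPE_* choices are illustrative, not recommendations from the commit.

// Sketch: override the quantization type of individual tensor classes
// via the fields added in this commit; everything not overridden keeps
// the mixture implied by params.ftype.
#include <stdint.h>
#include <stdio.h>

#include "llama.h"

int main(void) {
    struct llama_model_quantize_params params = llama_model_quantize_default_params();

    params.ftype         = LLAMA_FTYPE_MOSTLY_Q4_K_M; // base mixture for all other tensors
    params.attn_v_type   = GGML_TYPE_Q6_K;            // e.g. keep attn_v.weight at higher precision
    params.attn_k_type   = GGML_TYPE_Q4_K;
    params.ffn_down_type = GGML_TYPE_Q5_K;

    // Input/output paths are placeholders.
    uint32_t rc = llama_model_quantize("model-f16.gguf", "model-custom.gguf", &params);
    if (rc != 0) {
        fprintf(stderr, "llama_model_quantize failed: %u\n", rc);
        return 1;
    }
    return 0;
}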