author     Nexes the Elder <124105151+Nexesenex@users.noreply.github.com>  2024-10-18 09:48:15 +0200
committer  GitHub <noreply@github.com>  2024-10-18 09:48:15 +0200
commit     03cabe15401adba617c672ce8708a3501be8f15c (patch)
tree       eda129c338de73c9c8e21a9b70eb9dc02480b3ad /include
parent     76b97c80645362ac65a2e33043fd8d46bdaf8c56 (diff)
CLI - Specify GGML_TYPE to quantize for the main tensors. (#91)
To complement token_embd.weight and output.weight: attn_v.weight, attn_k.weight, attn_q.weight, attn_output.weight, attn_qkv.weight, ffn_gate, ffn_down, and ffn_up.
Diffstat (limited to 'include')
-rw-r--r--  include/llama.h | 8 ++++++++
1 file changed, 8 insertions(+), 0 deletions(-)
diff --git a/include/llama.h b/include/llama.h
index 133c2f0e..b2906693 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -361,6 +361,14 @@ extern "C" {
enum llama_ftype ftype; // quantize to this llama_ftype
enum ggml_type output_tensor_type; // output tensor type
enum ggml_type token_embedding_type; // token embeddings tensor type
+ enum ggml_type attn_q_type; // attention query tensor type
+ enum ggml_type attn_k_type; // attention key tensor type
+ enum ggml_type attn_v_type; // attention value tensor type
+ enum ggml_type attn_qkv_type; // attention query-key-value tensor type
+ enum ggml_type attn_output_type; // attention output tensor type
+ enum ggml_type ffn_gate_type; // feedforward network gate type
+ enum ggml_type ffn_down_type; // feedforward network down type
+ enum ggml_type ffn_up_type; // feedforward network up type
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
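For context, a minimal sketch of how a caller might use the new per-tensor overrides through the C API. The struct fields are taken from the diff above; llama_model_quantize_default_params() and llama_model_quantize() are the standard llama.cpp entry points, and it is assumed (following the existing output_tensor_type / token_embedding_type convention) that fields left at their default of GGML_TYPE_COUNT fall back to the type implied by ftype. File names and the chosen GGML_TYPE values are illustrative only.

/* Sketch: quantize a model while pinning selected tensor classes to a type. */
#include "llama.h"

int main(void) {
    struct llama_model_quantize_params params = llama_model_quantize_default_params();

    params.ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M;  // base recipe for all other tensors

    // New fields from this patch: override the type for specific tensor classes.
    params.attn_v_type   = GGML_TYPE_Q6_K;     // keep attention values at higher precision
    params.attn_k_type   = GGML_TYPE_Q5_K;
    params.ffn_down_type = GGML_TYPE_Q5_K;

    // Fields left untouched (assumed GGML_TYPE_COUNT, as with output_tensor_type)
    // keep the type implied by params.ftype.
    return llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params) == 0 ? 0 : 1;
}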