author | Nexes the Elder <124105151+Nexesenex@users.noreply.github.com> | 2024-10-18 09:48:15 +0200
committer | GitHub <noreply@github.com> | 2024-10-18 09:48:15 +0200
commit | 03cabe15401adba617c672ce8708a3501be8f15c (patch)
tree | eda129c338de73c9c8e21a9b70eb9dc02480b3ad /include/llama.h
parent | 76b97c80645362ac65a2e33043fd8d46bdaf8c56 (diff)
CLI - Specify GGML_TYPE to quantize for the main tensors. (#91)
To complement token_embd.weight and output.weight:
attn_v.weight
attn_k.weight
attn_q.weight
attn_output.weight
attn_qkv.weight
ffn_gate.weight
ffn_down.weight
ffn_up.weight
Diffstat (limited to 'include/llama.h')
-rw-r--r-- | include/llama.h | 8
1 file changed, 8 insertions, 0 deletions
diff --git a/include/llama.h b/include/llama.h
index 133c2f0e..b2906693 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -361,6 +361,14 @@ extern "C" {
         enum llama_ftype ftype;              // quantize to this llama_ftype
         enum ggml_type output_tensor_type;   // output tensor type
         enum ggml_type token_embedding_type; // token embeddings tensor type
+        enum ggml_type attn_q_type;          // attention query tensor type
+        enum ggml_type attn_k_type;          // attention key tensor type
+        enum ggml_type attn_v_type;          // attention value tensor type
+        enum ggml_type attn_qkv_type;        // attention query-key-value tensor type
+        enum ggml_type attn_output_type;     // attention output tensor type
+        enum ggml_type ffn_gate_type;        // feedforward network gate type
+        enum ggml_type ffn_down_type;        // feedforward network down type
+        enum ggml_type ffn_up_type;          // feedforward network up type
         bool allow_requantize;               // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor;         // quantize output.weight
         bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
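
Usage sketch (not part of this commit): the new fields can be set through the existing C quantization API, alongside the base ftype. The example below assumes the stock llama_model_quantize() and llama_model_quantize_default_params() entry points; the model paths and the concrete GGML_TYPE_* choices are illustrative, not recommendations from the commit.

// Sketch: override the quantization type of individual tensor classes
// via the fields added in this commit; everything not overridden keeps
// the mixture implied by params.ftype.
#include <stdint.h>
#include <stdio.h>

#include "llama.h"

int main(void) {
    struct llama_model_quantize_params params = llama_model_quantize_default_params();

    params.ftype         = LLAMA_FTYPE_MOSTLY_Q4_K_M; // base mixture for all other tensors
    params.attn_v_type   = GGML_TYPE_Q6_K;            // e.g. keep attn_v.weight at higher precision
    params.attn_k_type   = GGML_TYPE_Q4_K;
    params.ffn_down_type = GGML_TYPE_Q5_K;

    // Input/output paths are placeholders.
    uint32_t rc = llama_model_quantize("model-f16.gguf", "model-custom.gguf", &params);
    if (rc != 0) {
        fprintf(stderr, "llama_model_quantize failed: %u\n", rc);
        return 1;
    }
    return 0;
}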