From 03cabe15401adba617c672ce8708a3501be8f15c Mon Sep 17 00:00:00 2001
From: Nexes the Elder <124105151+Nexesenex@users.noreply.github.com>
Date: Fri, 18 Oct 2024 09:48:15 +0200
Subject: CLI - Specify GGML_TYPE to quantize for the main tensors. (#91)

To complement token_embd.weight and output.weight:
attn_v.weight
attn_k.weight
attn_q.weight
attn_output.weight
attn_qkv.weight
ffn_gate
ffn_down
ffn_up
---
 include/llama.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/llama.h b/include/llama.h
index 133c2f0e..b2906693 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -361,6 +361,14 @@ extern "C" {
         enum llama_ftype ftype;              // quantize to this llama_ftype
         enum ggml_type output_tensor_type;   // output tensor type
         enum ggml_type token_embedding_type; // token embeddings tensor type
+        enum ggml_type attn_q_type;          // attention query tensor type
+        enum ggml_type attn_k_type;          // attention key tensor type
+        enum ggml_type attn_v_type;          // attention value tensor type
+        enum ggml_type attn_qkv_type;        // attention query-key-value tensor type
+        enum ggml_type attn_output_type;     // attention output tensor type
+        enum ggml_type ffn_gate_type;        // feedforward network gate type
+        enum ggml_type ffn_down_type;        // feedforward network down type
+        enum ggml_type ffn_up_type;          // feedforward network up type
         bool allow_requantize;               // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor;         // quantize output.weight
         bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
--
cgit v1.2.3
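
A minimal usage sketch (not part of the patch): setting a few of the new
per-tensor fields from C before calling llama_model_quantize(). The concrete
ggml types and file names are illustrative only, and it is an assumption here
that any *_type field left at its default falls back to the base ftype.

    // Sketch, not part of the patch: per-tensor type overrides on top of a
    // base ftype. Type choices and file names are illustrative only.
    #include "llama.h"

    int main(void) {
        llama_model_quantize_params params = llama_model_quantize_default_params();

        params.ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M;   // base quantization scheme

        // Per-tensor overrides introduced by this patch; fields left at
        // their defaults are assumed to follow the base ftype.
        params.attn_v_type   = GGML_TYPE_Q6_K;  // keep attention V at higher precision
        params.attn_k_type   = GGML_TYPE_Q4_K;
        params.ffn_down_type = GGML_TYPE_Q5_K;

        // Returns 0 on success.
        return (int) llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params);
    }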