From 03cabe15401adba617c672ce8708a3501be8f15c Mon Sep 17 00:00:00 2001
From: Nexes the Elder <124105151+Nexesenex@users.noreply.github.com>
Date: Fri, 18 Oct 2024 09:48:15 +0200
Subject: CLI - Specify GGML_TYPE to quantize for the main tensors. (#91)

To complement token_embd.weight and output.weight:
attn_v.weight
attn_k.weight
attn_q.weight
attn_output.weight
attn_qkv.weight
ffn_gate
ffn_down
ffn_up
---
 include/llama.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/llama.h b/include/llama.h
index 133c2f0e..b2906693 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -361,6 +361,14 @@ extern "C" {
         enum llama_ftype ftype;              // quantize to this llama_ftype
         enum ggml_type output_tensor_type;   // output tensor type
         enum ggml_type token_embedding_type; // token embeddings tensor type
+        enum ggml_type attn_q_type;          // attention query tensor type
+        enum ggml_type attn_k_type;          // attention key tensor type
+        enum ggml_type attn_v_type;          // attention value tensor type
+        enum ggml_type attn_qkv_type;        // attention query-key-value tensor type
+        enum ggml_type attn_output_type;     // attention output tensor type
+        enum ggml_type ffn_gate_type;        // feedforward network gate type
+        enum ggml_type ffn_down_type;        // feedforward network down type
+        enum ggml_type ffn_up_type;          // feedforward network up type
         bool allow_requantize;               // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor;         // quantize output.weight
         bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
--
cgit v1.2.3
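
A minimal usage sketch (not part of the patch): setting a few of the new
per-tensor fields from C before calling llama_model_quantize(). The concrete
ggml types and file names are illustrative only, and it is an assumption here
that any *_type field left at its default falls back to the base ftype.

    // Sketch, not part of the patch: per-tensor type overrides on top of a
    // base ftype. Type choices and file names are illustrative only.
    #include "llama.h"

    int main(void) {
        llama_model_quantize_params params = llama_model_quantize_default_params();

        params.ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M;   // base quantization scheme

        // Per-tensor overrides introduced by this patch; fields left at
        // their defaults are assumed to follow the base ftype.
        params.attn_v_type   = GGML_TYPE_Q6_K;  // keep attention V at higher precision
        params.attn_k_type   = GGML_TYPE_Q4_K;
        params.ffn_down_type = GGML_TYPE_Q5_K;

        // Returns 0 on success.
        return (int) llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params);
    }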