Honor the attn_output type specified on the command line for low-bit quants as well

author: Iwan Kawrakow <iwan.kawrakow@gmail.com> 2025-02-20 17:42:07 +0200
committer: Iwan Kawrakow <iwan.kawrakow@gmail.com> 2025-02-20 17:42:07 +0200
commit: 4b45b82e67d9362e7522e5c7107e9d99219e0432 (patch)
tree: 726d7d4f6ec3ce53a14171d3db7d8a9531c22f75 /src
parent: a45da7bfbf75503fe9e5a2f675db7825afdc6310 (diff)
1 file changed, 2 insertions, 1 deletion
diff --git a/src/llama.cpp b/src/llama.cpp
index 0257a0a3..28e887ee 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16339,7 +16339,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             ++qs.i_ffn_down;
         }
         else if (name.find("attn_output.weight") != std::string::npos) {
-            if (qs.model.hparams.n_expert >= 4) {
+            if (qs.params->attn_output_type < GGML_TYPE_COUNT) new_type = qs.params->attn_output_type;
+            else if (qs.model.hparams.n_expert >= 4) {
                 new_type = GGML_TYPE_Q5_K;
             } else {
                 if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_K;
author	Iwan Kawrakow <iwan.kawrakow@gmail.com>	2025-02-20 17:42:07 +0200
committer	Iwan Kawrakow <iwan.kawrakow@gmail.com>	2025-02-20 17:42:07 +0200
commit	4b45b82e67d9362e7522e5c7107e9d99219e0432 (patch)
tree	726d7d4f6ec3ce53a14171d3db7d8a9531c22f75 /src
parent	a45da7bfbf75503fe9e5a2f675db7825afdc6310 (diff)