author     Nexes the Elder <124105151+Nexesenex@users.noreply.github.com>    2025-07-13 11:28:18 +0200
committer  GitHub <noreply@github.com>                                       2025-07-13 11:28:18 +0200
commit     e2b1a5e1fcb3ad55eae03c58c986a21e842ff7a4 (patch)
tree       c556fff01272b4a42b015c0f7cabe1a4fe452631
parent     b5ddec9516c837a40f97e3bb5e96ccebdd30d4f5 (diff)
Fix attn_v conditionality (#604)
To retain compatibility with https://github.com/ikawrakow/ik_llama.cpp/pull/91, we need "else if" rather than "if"; otherwise the MoE and 70B conditions take precedence over the quant type specified on the CLI.
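
A minimal sketch of the precedence issue (pick_broken, pick_fixed and their parameters are illustrative stand-ins, not the actual llama.cpp code; the real logic is in llama_tensor_get_type in the diff below):

    #include "ggml.h" // ggml_type, GGML_TYPE_* constants

    // Before the fix (plain "if"): both overrides run after the CLI check,
    // so the MoE/70B rules silently replace the type the user asked for.
    static ggml_type pick_broken(ggml_type cli_type, bool is_moe, bool is_70b, ggml_type new_type) {
        if (cli_type < GGML_TYPE_COUNT) new_type = cli_type; // user-specified quant
        if (is_moe) new_type = GGML_TYPE_Q8_0;               // clobbers the CLI choice
        if (is_70b && new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K; // may clobber it again
        return new_type;
    }

    // After the fix ("else if"): the chain is mutually exclusive, so the CLI
    // override always wins and the MoE/70B bumps apply only as fallbacks.
    static ggml_type pick_fixed(ggml_type cli_type, bool is_moe, bool is_70b, ggml_type new_type) {
        if (cli_type < GGML_TYPE_COUNT) new_type = cli_type;
        else if (is_moe) new_type = GGML_TYPE_Q8_0;
        else if (is_70b && new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
        return new_type;
    }
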
-rw-r--r--  src/llama.cpp  24
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index 8e6c66d3..5777689e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -19447,6 +19447,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (qs.params->attn_v_type < GGML_TYPE_COUNT) new_type = qs.params->attn_v_type;
+        else if (qs.model.hparams.n_expert >= 4) {
+            // for the 4-8-expert model, bumping this to Q8_0 trades just ~128MB
+            // TODO: explore better strategies
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (qs.model.type == MODEL_70B) {
+            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+            // nearly negligible increase in model size by quantizing this tensor with more bits:
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+            if (new_type == GGML_TYPE_IQ3_K) new_type = GGML_TYPE_IQ5_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
             new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
@@ -19531,18 +19543,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             if (qs.model.hparams.n_vocab >= 127999 && (qs.model.type == MODEL_8B || qs.model.type == MODEL_70B))
                 new_type = GGML_TYPE_IQ6_K;
         }
-        if (qs.model.type == MODEL_70B) {
-            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
-            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
-            // nearly negligible increase in model size by quantizing this tensor with more bits:
-            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
-            if (new_type == GGML_TYPE_IQ3_K) new_type = GGML_TYPE_IQ5_K;
-        }
-        if (qs.model.hparams.n_expert >= 4) {
-            // for the 4-8-expert model, bumping this to Q8_0 trades just ~128MB
-            // TODO: explore better strategies
-            new_type = GGML_TYPE_Q8_0;
-        }
         else if (qs.model.hparams.n_gqa() >= 4) {
             if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
             else if (new_type == GGML_TYPE_Q2_K_R4 || new_type == GGML_TYPE_IQ3_XXS_R4) new_type = GGML_TYPE_IQ3_K_R4;