iq4_k: use iq5_k also when n_gqa = 2 (#23)

This improves the size vs. quality balance for Gemma-2 models.

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
author: Kawrakow <48489457+ikawrakow@users.noreply.github.com> 2024-08-20 17:15:06 +0300
committer: GitHub <noreply@github.com> 2024-08-20 17:15:06 +0300
commit: a325745000114a43c1546323f91720db503ed0a9 (patch)
tree: bc695b57bba4136725e3cfc8c88b7779593ba909
parent: a73702d93b1007b2f528432c3db20c7aa5206352 (diff)
1 file changed, 4 insertions, 1 deletion
diff --git a/src/llama.cpp b/src/llama.cpp
index ba18a37c..17253f7a 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15702,9 +15702,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_K) && qs.model.hparams.n_gqa() >= 4) {
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
             new_type = GGML_TYPE_Q5_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_K && qs.model.hparams.n_gqa() >= 2) {
+            new_type = GGML_TYPE_IQ5_K;
+        }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                 use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
author	Kawrakow <48489457+ikawrakow@users.noreply.github.com>	2024-08-20 17:15:06 +0300
committer	GitHub <noreply@github.com>	2024-08-20 17:15:06 +0300
commit	a325745000114a43c1546323f91720db503ed0a9 (patch)
tree	bc695b57bba4136725e3cfc8c88b7779593ba909
parent	a73702d93b1007b2f528432c3db20c7aa5206352 (diff)