From a325745000114a43c1546323f91720db503ed0a9 Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Tue, 20 Aug 2024 17:15:06 +0300
Subject: iq4_k: use iq5_k also when n_gqa = 2 (#23)

This improves size vs quality balance for Gemma-2 models.

Co-authored-by: Iwan Kawrakow
---
 src/llama.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'src/llama.cpp')

diff --git a/src/llama.cpp b/src/llama.cpp
index ba18a37c..17253f7a 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15702,9 +15702,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_K) && qs.model.hparams.n_gqa() >= 4) {
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
             new_type = GGML_TYPE_Q5_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_K && qs.model.hparams.n_gqa() >= 2) {
+            new_type = GGML_TYPE_IQ5_K;
+        }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                 use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-- 
cgit v1.2.3