Better sub-3-bit quantization mixes with a qkv tensor (#64)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
author: Kawrakow <iwankawrakow@gmail.com> 2024-09-28 08:17:19 +0300
committer: GitHub <noreply@github.com> 2024-09-28 08:17:19 +0300
commit: 1f61e91862dd0b077ccb60459f3cc03f364ee279 (patch)
tree: dea91fadd7bb37077f04e10858732ac3903b632d /src/llama.cpp
parent: 6dec4af4b6e65eb72e646a6f8b10d77c9d306281 (diff)
1 file changed, 6 insertions, 2 deletions
diff --git a/src/llama.cpp b/src/llama.cpp
index df57c071..2cca5099 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15665,6 +15665,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
             new_type = GGML_TYPE_Q4_K;
         }
+        else if (name.find("attn_qkv.weight") != std::string::npos) {
+            new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_K;
+        }
         else if (name.find("ffn_down") != std::string::npos) {
             if (qs.i_ffn_down < qs.n_ffn_down/8) {
                 new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
@@ -15770,7 +15773,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
                     (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
-            new_type = GGML_TYPE_Q4_K;
+            new_type = GGML_TYPE_IQ4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
             new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
@@ -15821,9 +15824,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
     }
     else if (name.find("attn_qkv.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
             new_type = GGML_TYPE_Q4_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_IQ4_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     }
author	Kawrakow <iwankawrakow@gmail.com>	2024-09-28 08:17:19 +0300
committer	GitHub <noreply@github.com>	2024-09-28 08:17:19 +0300
commit	1f61e91862dd0b077ccb60459f3cc03f364ee279 (patch)
tree	dea91fadd7bb37077f04e10858732ac3903b632d /src/llama.cpp
parent	6dec4af4b6e65eb72e646a6f8b10d77c9d306281 (diff)