summary | refs | log | tree | commit | diff
path: root/src/llama.cpp
diff options
context:
space:
mode:
author: Kawrakow <iwankawrakow@gmail.com> 2024-09-28 08:17:19 +0300
committer: GitHub <noreply@github.com> 2024-09-28 08:17:19 +0300
commit: 1f61e91862dd0b077ccb60459f3cc03f364ee279 (patch)
tree: dea91fadd7bb37077f04e10858732ac3903b632d /src/llama.cpp
parent: 6dec4af4b6e65eb72e646a6f8b10d77c9d306281 (diff)
Better sub-3-bit quantization mixes with a qkv tensor (#64)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'src/llama.cpp')
-rw-r--r-- src/llama.cpp 8
1 files changed, 6 insertions, 2 deletions
diff --git a/src/llama.cpp b/src/llama.cpp
index df57c071..2cca5099 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15665,6 +15665,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
new_type = GGML_TYPE_Q4_K;
}
+ else if (name.find("attn_qkv.weight") != std::string::npos) {
+ new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_K;
+ }
else if (name.find("ffn_down") != std::string::npos) {
if (qs.i_ffn_down < qs.n_ffn_down/8) {
new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
@@ -15770,7 +15773,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
(qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
- new_type = GGML_TYPE_Q4_K;
+ new_type = GGML_TYPE_IQ4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
@@ -15821,9 +15824,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
}
}
else if (name.find("attn_qkv.weight") != std::string::npos) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
new_type = GGML_TYPE_Q4_K;
}
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_IQ4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
}