summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- src/llama.cpp 8
1 files changed, 8 insertions, 0 deletions
diff --git a/src/llama.cpp b/src/llama.cpp
index cae91619..0aaedb96 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15756,6 +15756,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) {
+ if (qs.model.hparams.n_vocab >= 127999 && (qs.model.type == MODEL_8B || qs.model.type == MODEL_70B))
+ new_type = GGML_TYPE_Q6_K;
+ }
if (qs.model.type == MODEL_70B) {
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
@@ -15796,6 +15800,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ2_S;
}
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) {
+ if (qs.model.hparams.n_vocab >= 127999 && (qs.model.type == MODEL_8B || qs.model.type == MODEL_70B))
+ new_type = GGML_TYPE_Q4_K;
+ }
} else if (name.find("ffn_down") != std::string::npos) {
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
int i_layer = info.first, n_layer = info.second;