diff options
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 43 |
1 files changed, 34 insertions, 9 deletions
@@ -4776,7 +4776,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { int nx = tensor->ne[0]; - if (nx % QK_K == 0) { + if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) { + new_type = GGML_TYPE_Q8_0; + } + else if (new_type != GGML_TYPE_Q8_0) { new_type = GGML_TYPE_Q6_K; } } else if (name.find("attn_v.weight") != std::string::npos) { @@ -4800,17 +4803,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } else if (name.find("ffn_down.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K + : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K + : GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { + new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { + if (model.arch == LLM_ARCH_FALCON) { + new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K : + use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + } else { + if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; + } + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) { + new_type = GGML_TYPE_Q5_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && - use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K; ++i_feed_forward_w2; } else if (name.find("attn_output.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + if (model.arch != LLM_ARCH_FALCON) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + } else { + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; + } + } + else if (name.find("attn_qkv.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; } else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; |