From 334a835a1ccc8106a5fa355683a965efb1bfa24b Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Tue, 16 Jan 2024 19:51:26 +0200 Subject: ggml : importance matrix support for legacy quants (#4969) * imatrix: adding support for legacy quants * imatrix: guard Q4_0/Q5_0 against ffn_down craziness --------- Co-authored-by: Iwan Kawrakow --- llama.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'llama.cpp') diff --git a/llama.cpp b/llama.cpp index 46c4d11c..765d20dd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8374,6 +8374,8 @@ struct quantize_state_internal { int n_k_quantized = 0; int n_fallback = 0; + bool has_imatrix = false; + quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params) : model(model) , params(params) @@ -8546,6 +8548,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) { new_type = GGML_TYPE_Q5_K; } + else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0) + && qs.has_imatrix && i_layer < n_layer/8) { + // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0. + // We only do it when an imatrix is provided because a) we want to make sure that one can always get the + // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix. + new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; + } ++qs.i_feed_forward_w2; } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) { @@ -8669,6 +8678,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s imatrix_data = static_cast>*>(params->imatrix); if (imatrix_data) { LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size())); + qs.has_imatrix = true; } } -- cgit v1.2.3