summary refs log tree commit diff
path: root/llama.cpp
diff options
context:
space:
mode:
author Kawrakow <48489457+ikawrakow@users.noreply.github.com> 2024-01-16 19:51:26 +0200
committer GitHub <noreply@github.com> 2024-01-16 19:51:26 +0200
commit 334a835a1ccc8106a5fa355683a965efb1bfa24b (patch)
tree 83172d25be464c4f041dd43eb56592c372b2c784 /llama.cpp
parent 4feb4b33eeb1756e46084a4db9230b279af1a480 (diff)
ggml : importance matrix support for legacy quants (#4969)
* imatrix: adding support for legacy quants * imatrix: guard Q4_0/Q5_0 against ffn_down craziness --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'llama.cpp')
-rw-r--r-- llama.cpp | 10
1 files changed, 10 insertions, 0 deletions
diff --git a/llama.cpp b/llama.cpp
index 46c4d11c..765d20dd 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8374,6 +8374,8 @@ struct quantize_state_internal {
int n_k_quantized = 0;
int n_fallback = 0;
+ bool has_imatrix = false;
+
quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
: model(model)
, params(params)
@@ -8546,6 +8548,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
new_type = GGML_TYPE_Q5_K;
}
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
+ && qs.has_imatrix && i_layer < n_layer/8) {
+ // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
+ // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
+ // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
+ new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
+ }
++qs.i_feed_forward_w2;
} else if (name.find("attn_output.weight") != std::string::npos) {
if (arch != LLM_ARCH_FALCON) {
@@ -8669,6 +8678,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
if (imatrix_data) {
LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
+ qs.has_imatrix = true;
}
}