summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--examples/quantize/quantize.cpp1
-rw-r--r--llama.cpp24
-rw-r--r--llama.h1
3 files changed, 17 insertions, 9 deletions
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index d27ea5e9..f878f691 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -18,6 +18,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
+ { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
diff --git a/llama.cpp b/llama.cpp
index bd219d49..d39ff94c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2586,7 +2586,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
// K-quants
- case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K";
+ case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
+ case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
@@ -8955,10 +8956,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
// TODO: explore better strategies
new_type = GGML_TYPE_Q8_0;
}
- } else if (name.find("ffn_down.weight") != std::string::npos) {
+ } else if (name.find("ffn_down") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+ if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
+ }
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
- new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+ new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
: arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
: GGML_TYPE_Q3_K;
}
@@ -8967,14 +8971,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
if (arch == LLM_ARCH_FALCON) {
- new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+ new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
} else {
if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
}
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
new_type = GGML_TYPE_Q5_K;
}
++qs.i_feed_forward_w2;
@@ -8992,9 +8996,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
}
- else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
- }
+ // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
+ //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
+ // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+ //}
// This can be used to reduce the size of the Q5_K_S model.
// The associated PPL increase is fully in line with the size reduction
//else {
@@ -9043,6 +9048,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// K-quants
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -9101,7 +9107,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
++qs.n_attention_wv;
}
- else if (name.find("ffn_down.weight") != std::string::npos) {
+ else if (name.find("ffn_down") != std::string::npos) {
++qs.n_feed_forward_w2;
}
}
diff --git a/llama.h b/llama.h
index 6fde113f..43d41b8f 100644
--- a/llama.h
+++ b/llama.h
@@ -105,6 +105,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};