summaryrefslogtreecommitdiff
path: root/llama.cpp
diff options
context:
space:
mode:
authorKawrakow <48489457+ikawrakow@users.noreply.github.com>2023-08-27 15:19:59 +0300
committerGitHub <noreply@github.com>2023-08-27 15:19:59 +0300
commita6d1189fdd4c1ab4ba23f9d777f8950901dcffb2 (patch)
treed9a96c3adbf57aad406bfb5bd304765615933060 /llama.cpp
parentc48c5bb0b06385f6c708339188d2aaf2bc278477 (diff)
k_quants tuning for Falcon-7b (#2816)
* Make ggml-cuda.cu build with QK_K = 64 Using LLAMA_CUDA_FORCE_DMMV = ON and -nommq it runs and produces a meaningful result. * k_quants tuning for Falcon-7b --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'llama.cpp')
-rw-r--r--llama.cpp43
1 files changed, 34 insertions, 9 deletions
diff --git a/llama.cpp b/llama.cpp
index df103a6e..e9868f5d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4776,7 +4776,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
int nx = tensor->ne[0];
- if (nx % QK_K == 0) {
+ if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+ new_type = GGML_TYPE_Q8_0;
+ }
+ else if (new_type != GGML_TYPE_Q8_0) {
new_type = GGML_TYPE_Q6_K;
}
} else if (name.find("attn_v.weight") != std::string::npos) {
@@ -4800,17 +4803,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} else if (name.find("ffn_down.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
- new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+ new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+ : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+ : GGML_TYPE_Q3_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+ new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+ if (model.arch == LLM_ARCH_FALCON) {
+ new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+ use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+ } else {
+ if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+ }
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
+ new_type = GGML_TYPE_Q5_K;
}
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
- else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
- use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K;
++i_feed_forward_w2;
} else if (name.find("attn_output.weight") != std::string::npos) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+ if (model.arch != LLM_ARCH_FALCON) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+ } else {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+ }
+ }
+ else if (name.find("attn_qkv.weight") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
}
else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;