author     Kawrakow <48489457+ikawrakow@users.noreply.github.com>   2024-01-14 10:53:39 +0200
committer  GitHub <noreply@github.com>                              2024-01-14 10:53:39 +0200
commit     a128c38de862431f1aae9ccc40b792fbc1b8b682 (patch)
tree       2946ef20e083b883c325fed2bc0a11d1ca84166d
parent     5f5fe1bd608fa2ed42af97b5f2ea31be6625fc48 (diff)
Fix ffn_down quantization mix for MoE models (#4927)
* Fix ffn_down quantization mix for MoE models

  In #4872 I did not consider the part where every third tensor is quantized with more bits. For MoE models this leads to tensors of the same layer being quantized with a different number of bits, which is not considered a possibility in the inference implementation (it is assumed all experts use the same quantization).

* Fix the fix

* Review suggestion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
-rw-r--r--  llama.cpp  34
1 file changed, 26 insertions(+), 8 deletions(-)
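
To illustrate the issue described in the commit message: a minimal, self-contained C++ sketch of the problem. This is not code from llama.cpp; use_more_bits() here is a hypothetical stand-in mimicking the "every third tensor gets more bits" rule. With n_expert > 1, indexing that rule by a running per-tensor counter lets experts of the same layer fall on different sides of the boundary, so a single layer ends up mixing bit widths.

// Illustrative sketch only; use_more_bits() is a stand-in heuristic, not the llama.cpp implementation.
#include <cstdio>

static bool use_more_bits(int i, int n) {
    // stand-in rule: first 1/8, last 1/8, and every third index in between get more bits
    return i < n/8 || i >= 7*n/8 || (i - n/8) % 3 == 2;
}

int main() {
    const int n_layer = 4, n_expert = 8;
    for (int layer = 0; layer < n_layer; ++layer) {
        const bool per_layer = use_more_bits(layer, n_layer);           // fixed: index by layer
        for (int expert = 0; expert < n_expert; ++expert) {
            const int i_tensor = layer * n_expert + expert;             // old: running tensor counter
            const bool per_tensor = use_more_bits(i_tensor, n_layer * n_expert);
            if (per_tensor != per_layer) {
                printf("layer %d, expert %d: counter-based rule disagrees with layer-based rule\n", layer, expert);
            }
        }
    }
    return 0;
}
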
diff --git a/llama.cpp b/llama.cpp
index 51e9bdae..b1d6015e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8480,13 +8480,31 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
new_type = GGML_TYPE_Q8_0;
}
} else if (name.find("ffn_down") != std::string::npos) {
+ const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
+ int i_layer, n_layer;
+ if (n_expert == 1) {
+ i_layer = qs.i_feed_forward_w2;
+ n_layer = qs.n_feed_forward_w2;
+ } else {
+ // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally
+ // sprinkled randomly throughout the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
+ // for getting the current layer as I initially thought, and we need to resort to parsing the
+ // tensor name.
+ n_layer = qs.n_feed_forward_w2 / n_expert;
+ if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
+ throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
+ }
+ if (i_layer < 0 || i_layer >= n_layer) {
+ throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer));
+ }
+ }
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
- if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
+ if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
- new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
- : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
+ new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
+ : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
: GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
@@ -8494,14 +8512,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
if (arch == LLM_ARCH_FALCON) {
- new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
- use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+ new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
+ use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
} else {
- if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+ if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
}
}
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
new_type = GGML_TYPE_Q5_K;
}
++qs.i_feed_forward_w2;
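
For reference, a minimal standalone sketch of the name-based layer lookup added in the patch above. The tensor names below are illustrative examples in the usual "blk.<N>." naming style, not taken from an actual model file.

// Standalone sketch of the sscanf-based layer extraction used in the patch above.
#include <cstdio>
#include <stdexcept>
#include <string>

static int layer_from_name(const std::string & name, int n_layer) {
    int i_layer = -1;
    if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
        throw std::runtime_error("Failed to determine layer for tensor " + name);
    }
    if (i_layer < 0 || i_layer >= n_layer) {
        throw std::runtime_error("Bad layer for tensor " + name);
    }
    return i_layer;
}

int main() {
    // Illustrative tensor names (the ".3" expert suffix is hypothetical):
    printf("%d\n", layer_from_name("blk.17.ffn_down.3.weight", 32)); // prints 17
    printf("%d\n", layer_from_name("blk.0.ffn_down.weight", 32));    // prints 0
    return 0;
}
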