author     Paul Tsochantaris <ptsochantaris@icloud.com>    2024-02-09 10:48:06 +0000
committer  GitHub <noreply@github.com>                     2024-02-09 12:48:06 +0200
commit     e5ca3937c685d6e012ac4db40555d6ec100ff03c (patch)
tree       9fd9668a2b9774a5602839c6e69a185ad0f44c8c
parent     e4124c24775f2cb5b3d7acc93bf9dc5471c172ef (diff)
llama : do not cap thread count when MoE on CPU (#5419)
* Not capping thread count when MoE inference is running on CPU
* Whitespace
-rw-r--r--    llama.cpp    4
1 file changed, 3 insertions, 1 deletion
diff --git a/llama.cpp b/llama.cpp
index db7d1c1c..0566b087 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7285,7 +7285,9 @@ static int llama_decode_internal(
     // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
     // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
     // with the BLAS calls. need a better solution
-    if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+    // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
+    // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
+    if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
         n_threads = std::min(4, n_threads);
     }
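
For illustration only (not part of the commit): a minimal standalone C++ sketch of the thread-capping heuristic this patch changes. The helper pick_n_threads and the plain boolean/int parameters are hypothetical stand-ins for ggml_cpu_has_blas(), ggml_cpu_has_gpublas(), and hparams.n_expert in llama.cpp.

// Standalone sketch of the thread-capping heuristic, assuming hypothetical
// stand-ins for the llama.cpp capability checks and model hyperparameters.
#include <algorithm>
#include <cstdio>

static int pick_n_threads(int n_threads, int n_tokens, int n_expert,
                          bool cpu_has_blas, bool cpu_has_gpublas) {
    // For large batches on a CPU BLAS build, cap at 4 threads so the helper
    // threads do not interfere with the BLAS calls. Skip the cap for MoE
    // models (n_expert > 0), where BLAS is not involved and more threads help.
    if (n_tokens >= 32 && n_expert == 0 && cpu_has_blas && !cpu_has_gpublas) {
        n_threads = std::min(4, n_threads);
    }
    return n_threads;
}

int main() {
    // Dense model, 512-token batch, CPU BLAS: capped to 4 threads.
    std::printf("%d\n", pick_n_threads(8, 512, 0, true, false));
    // MoE model (e.g. n_expert == 8), same batch: keeps all 8 threads.
    std::printf("%d\n", pick_n_threads(8, 512, 8, true, false));
    return 0;
}

With these inputs a dense model on a BLAS-enabled CPU build is still capped to 4 threads for large batches, while an MoE model keeps the full thread count, which is the behaviour the patch describes.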