summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorKawrakow <iwankawrakow@gmail.com>2025-03-02 13:47:38 +0200
committerGitHub <noreply@github.com>2025-03-02 13:47:38 +0200
commita89adaa78f505675be7be6180f419b4b0158c15a (patch)
treead82fa3ad44f66f37885bdf0d0d025166eff9535 /include
parentef9a3d17b52bb5f6d55f7ef7e05e41e22f2ad81d (diff)
SER - Smart Expert Reduction (#239)
* A better way to measure the cost of ggml_barrier
* Smart expert selection
* Add ser option to llama-bench
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'include')
-rw-r--r-- include/llama.h | 2
1 file changed, 2 insertions(+), 0 deletions(-)
diff --git a/include/llama.h b/include/llama.h
index bb43aebc..38a12744 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -386,6 +386,8 @@ extern "C" {
int mla_attn; // whether to use MLA attention [EXPERIMENTAL]
int attn_max_batch; // maximum batch size for attention computations [EXPERIMENTAL]
bool fused_moe_up_gate; // whether to use fused MoE up/down op [EXPERIMENTAL]
+ int min_experts;
+ float thresh_experts;
// Abort callback
// if it returns true, execution of llama_decode() will be aborted