author    Kawrakow <iwankawrakow@gmail.com>    2025-02-23 14:31:11 +0200
committer GitHub <noreply@github.com>          2025-02-23 14:31:11 +0200
commit    ac1d259b93eccfa7371c6b00c5749400ff2b2aea (patch)
tree      fe8bb34c9dcbea805595c5087f00b188bb89fc05 /ggml/include/ggml.h
parent    46bf73a37f1aabe6f0b40365b0c7b2ba831905f5 (diff)
Fused MoE ffn_up and ffn_gate (#229)
* Fusing MoE up * unary(gate)

* Fusing MoE up * unary(gate): CUDA

  We get ~13% speedup for PP-512 and ~2% for TG-128 for DeepSeek-Lite

* On CUDA also fuse MoE down * (up * unary(gate)) in case the MUL_MAT_ID op
  for the down experts is the next op in the graph.

* Command line option to enable fused MoE up*unary(gate)

* Add fmoe option to llama-bench

* Adding forgotten gelu, relu, silu on ARM

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
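For orientation, the unfused graph pattern this commit replaces looks roughly like the sketch below. This is an illustration only: the tensor names (ffn_up_exps, ffn_gate_exps, cur, selected_experts) follow common llama.cpp conventions and do not appear in this commit.

    // Unfused MoE FFN: two MUL_MAT_ID ops, a unary activation, and an
    // element-wise multiply, each a separate op in the compute graph.
    struct ggml_tensor * up   = ggml_mul_mat_id(ctx, ffn_up_exps,   cur, selected_experts);
    struct ggml_tensor * gate = ggml_mul_mat_id(ctx, ffn_gate_exps, cur, selected_experts);
    struct ggml_tensor * out  = ggml_mul(ctx, up, ggml_silu(ctx, gate)); // up * unary(gate)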
Diffstat (limited to 'ggml/include/ggml.h')
-rw-r--r--  ggml/include/ggml.h | 10 ++++++++++
1 file changed, 10 insertions(+), 0 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index d2131a15..d12b90d0 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -567,6 +567,7 @@ extern "C" {
         GGML_OP_MUL_MAT,
         GGML_OP_MUL_MAT_ID,
         GGML_OP_OUT_PROD,
+        GGML_OP_MOE_FUSED_UP_GATE,
 
         GGML_OP_SCALE,
         GGML_OP_SET,
@@ -1320,6 +1321,15 @@ extern "C" {
             struct ggml_tensor  * b,
             struct ggml_tensor  * ids);
 
+    // MoE up + gate + unary
+    GGML_API struct ggml_tensor * ggml_moe_up_gate(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * as_up,
+            struct ggml_tensor  * as_gate,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * ids,
+            enum ggml_unary_op    op);
+
     // A: m columns, n rows,
     // B: p columns, n rows,
     // result is m columns, p rows
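A minimal usage sketch against the declaration added above, assuming a SiLU-gated expert FFN. The tensor names (ffn_up_exps, ffn_gate_exps, cur, selected_experts) are hypothetical placeholders, not identifiers from this commit; GGML_UNARY_OP_SILU is the existing ggml unary-op enum value.

    // Fused MoE FFN: a single graph op computes up * silu(gate) for the
    // routed experts, replacing the two MUL_MAT_ID ops plus the separate
    // unary and multiply ops of the unfused pattern.
    struct ggml_tensor * out = ggml_moe_up_gate(ctx,
            ffn_up_exps,         // stacked per-expert up projections
            ffn_gate_exps,       // stacked per-expert gate projections
            cur,                 // input activations
            selected_experts,    // expert ids chosen by the router
            GGML_UNARY_OP_SILU); // unary op applied to the gate branch

Per the commit message, this fusion gives ~13% speedup for PP-512 and ~2% for TG-128 on DeepSeek-Lite, and on CUDA the down-expert MUL_MAT_ID can additionally be fused when it is the next op in the graph.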