From ac1d259b93eccfa7371c6b00c5749400ff2b2aea Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Sun, 23 Feb 2025 14:31:11 +0200
Subject: Fused MoE ffn_up and ffn_gate (#229)

* Fusing MoE up * unary(gate)

* Fusing MoE up * unary(gate): CUDA

We get ~13% speedup for PP-512 and ~2% for TG-128 for DeepSeek-Lite

* On CUDA also fuse MoE down * (up * unary(gate)) in case the MUL_MAT_ID op
  for the down experts is the next op in the graph.

* Command line option to enable fused MoE up*unary(gate)

* Add fmoe option to llama-bench

* Adding forgotten gelu, relu, silu on ARM

---------

Co-authored-by: Iwan Kawrakow
---
 common/common.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'common/common.h')

diff --git a/common/common.h b/common/common.h
index b5b67986..f86a58cb 100644
--- a/common/common.h
+++ b/common/common.h
@@ -175,6 +175,7 @@ struct gpt_params {
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
     bool mla_attn = false; // MLA
+    bool fused_moe_up_gate = false; // fused up*unary(gate) op for MoE models

     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool ignore_eos = false; // ignore generated EOS tokens
--
cgit v1.2.3
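
Note: the hunk above only adds the fused_moe_up_gate flag to gpt_params; the fused kernels themselves live in parts of the patch not shown here. As a rough, self-contained illustration of the idea (not the patch's actual API -- every name and shape below is made up for the example), the C++ sketch computes one expert's up * silu(gate) both ways: unfused, as two separate matrix-vector products followed by an element-wise silu-and-multiply, and fused, where a single pass over the input row produces both dot products and applies the activation immediately, so the activations are streamed once instead of twice.

// fused_moe_sketch.cpp -- illustrative only; names and shapes are not the patch's API.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static float silu(float x) { return x / (1.0f + std::exp(-x)); }

// Unfused: two separate mat-vec products, then up * silu(gate).
static std::vector<float> ffn_up_gate_unfused(const std::vector<float> & W_up,
                                              const std::vector<float> & W_gate,
                                              const std::vector<float> & x,
                                              int n_ff, int n_embd) {
    std::vector<float> up(n_ff), gate(n_ff), out(n_ff);
    for (int i = 0; i < n_ff; ++i) {
        float su = 0.0f, sg = 0.0f;
        for (int j = 0; j < n_embd; ++j) su += W_up  [i*n_embd + j] * x[j];
        for (int j = 0; j < n_embd; ++j) sg += W_gate[i*n_embd + j] * x[j];
        up[i] = su; gate[i] = sg;
    }
    for (int i = 0; i < n_ff; ++i) out[i] = up[i] * silu(gate[i]);
    return out;
}

// Fused: one pass over x per output row computes both dot products and
// applies up * silu(gate) right away -- the idea behind fused_moe_up_gate.
static std::vector<float> ffn_up_gate_fused(const std::vector<float> & W_up,
                                            const std::vector<float> & W_gate,
                                            const std::vector<float> & x,
                                            int n_ff, int n_embd) {
    std::vector<float> out(n_ff);
    for (int i = 0; i < n_ff; ++i) {
        float su = 0.0f, sg = 0.0f;
        for (int j = 0; j < n_embd; ++j) {
            const float xj = x[j];          // x is loaded once for both products
            su += W_up  [i*n_embd + j] * xj;
            sg += W_gate[i*n_embd + j] * xj;
        }
        out[i] = su * silu(sg);
    }
    return out;
}

int main() {
    const int n_embd = 8, n_ff = 16;
    std::vector<float> W_up(n_ff*n_embd), W_gate(n_ff*n_embd), x(n_embd);
    for (size_t k = 0; k < W_up.size();   ++k) W_up[k]   = 0.01f * (int(k % 7) - 3);
    for (size_t k = 0; k < W_gate.size(); ++k) W_gate[k] = 0.02f * (int(k % 5) - 2);
    for (size_t k = 0; k < x.size();      ++k) x[k]      = 0.1f  * int(k + 1);

    const auto a = ffn_up_gate_unfused(W_up, W_gate, x, n_ff, n_embd);
    const auto b = ffn_up_gate_fused  (W_up, W_gate, x, n_ff, n_embd);
    float max_diff = 0.0f;
    for (int i = 0; i < n_ff; ++i) max_diff = std::max(max_diff, std::fabs(a[i] - b[i]));
    std::printf("max |unfused - fused| = %g\n", max_diff);
    return 0;
}

In the real patch this happens per selected expert inside the MUL_MAT_ID path, and on CUDA the down projection can additionally be pulled into the same fused op when it is the next node in the graph; the new fused_moe_up_gate flag (exposed via the command-line option and the fmoe option for llama-bench mentioned in the log) selects that path.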