summary refs log tree commit diff
path: root/src/llama.cpp
diff options
context:
space:
mode:
authorKawrakow <iwankawrakow@gmail.com>2024-10-26 18:23:54 +0200
committerGitHub <noreply@github.com>2024-10-26 18:23:54 +0200
commit5ad6439486e5bfdd8e34213a36beb56b74842bbe (patch)
tree0c7a9b077b232e663ae26f41fd1beed158aabc91 /src/llama.cpp
parent2e5f6db5de85de5cac416c93ae9ff02731498798 (diff)
Use fused mul - unary op also for MoE models (#111)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'src/llama.cpp')
-rw-r--r--src/llama.cpp34
1 file changed, 18 insertions(+), 16 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index 1384123a..a55254c0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8326,22 +8326,24 @@ static struct ggml_tensor * llm_build_moe_ffn(
ggml_tensor * gate = llm_build_lora_mm_id(lctx, ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
cb(gate, "ffn_moe_gate", il);
- switch (type_op) {
- case LLM_FFN_SILU:
- {
- gate = ggml_silu(ctx, gate);
- cb(gate, "ffn_moe_silu", il);
- } break;
- case LLM_FFN_GELU:
- {
- gate = ggml_gelu(ctx, gate);
- cb(gate, "ffn_moe_gelu", il);
- } break;
- default:
- GGML_ABORT("fatal error");
- }
-
- ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
+ // This is equivalent to the commented out code below
+ ggml_tensor * par = ggml_fused_mul_unary(ctx, gate, up, type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
+
+ //switch (type_op) {
+ // case LLM_FFN_SILU:
+ // {
+ // gate = ggml_silu(ctx, gate);
+ // cb(gate, "ffn_moe_silu", il);
+ // } break;
+ // case LLM_FFN_GELU:
+ // {
+ // gate = ggml_gelu(ctx, gate);
+ // cb(gate, "ffn_moe_gelu", il);
+ // } break;
+ // default:
+ // GGML_ABORT("fatal error");
+ //}
+ //ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
cb(par, "ffn_moe_gate_par", il);
ggml_tensor * experts = llm_build_lora_mm_id(lctx, ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]