summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/llama.cpp 34
1 files changed, 18 insertions, 16 deletions
diff --git a/src/llama.cpp b/src/llama.cpp
index 1384123a..a55254c0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8326,22 +8326,24 @@ static struct ggml_tensor * llm_build_moe_ffn(
ggml_tensor * gate = llm_build_lora_mm_id(lctx, ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
cb(gate, "ffn_moe_gate", il);
- switch (type_op) {
- case LLM_FFN_SILU:
- {
- gate = ggml_silu(ctx, gate);
- cb(gate, "ffn_moe_silu", il);
- } break;
- case LLM_FFN_GELU:
- {
- gate = ggml_gelu(ctx, gate);
- cb(gate, "ffn_moe_gelu", il);
- } break;
- default:
- GGML_ABORT("fatal error");
- }
-
- ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
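+ // Fused replacement for the commented-out switch + ggml_mul below. NOTE(review): not strictly equivalent - any type_op other than LLM_FFN_SILU is treated as GELU (the old default case aborted) and the ffn_moe_silu/ffn_moe_gelu callbacks are no longer issued; confirm this is intended.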
+ ggml_tensor * par = ggml_fused_mul_unary(ctx, gate, up, type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
+
+ //switch (type_op) {
+ // case LLM_FFN_SILU:
+ // {
+ // gate = ggml_silu(ctx, gate);
+ // cb(gate, "ffn_moe_silu", il);
+ // } break;
+ // case LLM_FFN_GELU:
+ // {
+ // gate = ggml_gelu(ctx, gate);
+ // cb(gate, "ffn_moe_gelu", il);
+ // } break;
+ // default:
+ // GGML_ABORT("fatal error");
+ //}
+ //ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
cb(par, "ffn_moe_gate_par", il);
ggml_tensor * experts = llm_build_lora_mm_id(lctx, ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]