From 737514fd814d944f8ce965620293a16e5e8a285d Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Sat, 28 Sep 2024 13:37:25 +0300
Subject: Adding SWIGLU unary op (#65)

* Adding GGML_UNARY_OP_SWIGLU

This commit implements the ggml op and CPU compute forward.
I see ~3-4% speedup of PP-512 for Phi-3.5-mini.

* GGML_UNARY_OP_SWIGLU: CUDA implementation

I observe ~12% speedup for PP-512(Phi-3.5-mini).

* GGML_UNARY_OP_SWIGLU: Metal implementation

We get ~2% speedup for PP-512(Phi-3.5-mini).

* GGML_UNARY_OP_SWIGLU: minor improvement on Metal

* GGML_UNARY_OP_SWIGLU: cleanup

---------

Co-authored-by: Iwan Kawrakow
---
 src/llama.cpp | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'src/llama.cpp')

diff --git a/src/llama.cpp b/src/llama.cpp
index 2cca5099..d52590a6 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8111,16 +8111,8 @@ static struct ggml_tensor * llm_build_ffn(
             } break;
         case LLM_FFN_SWIGLU:
             {
-                // Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
-                int64_t split_point = cur->ne[0] / 2;
-                struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], 0));
-                struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
-
-                x0 = ggml_silu(ctx, x0);
-                cb(cur, "ffn_silu", il);
-
-                cur = ggml_mul(ctx, x0, x1);
-                cb(cur, "ffn_mul", il);
+                cur = ggml_swiglu(ctx, cur);
+                cb(cur, "ffn_swiglu", il);
             } break;
     }

--
cgit v1.2.3
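
Editor's note: for readers unfamiliar with the op, below is a minimal C sketch of the per-row computation that the fused ggml_swiglu call performs, reconstructed from the removed view/silu/mul sequence in the diff. The function name swiglu_row_ref and the flat row layout are illustrative assumptions; this is not the actual CPU, CUDA, or Metal kernel added by the commit.

    /*
     * Reference sketch (not part of the patch): SwiGLU over one row.
     * The input row holds 2*n_half values; the first half is the
     * SiLU-gated part (x0 in the removed code), the second half is
     * the linear part (x1), and the output has n_half values.
     */
    #include <math.h>
    #include <stdint.h>

    static void swiglu_row_ref(float * dst, const float * x, int64_t n_half) {
        for (int64_t i = 0; i < n_half; ++i) {
            const float g = x[i];                 // gated half (offset 0)
            const float u = x[i + n_half];        // linear half (offset split_point)
            dst[i] = g / (1.0f + expf(-g)) * u;   // silu(g) * u
        }
    }

The removed graph built two ggml_cont copies plus separate silu and mul nodes, whereas the fused op can do the split, SiLU, and multiply in a single pass, which is presumably where the reported PP-512 gains come from.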