Adding SWIGLU unary op (#65)

* Adding GGML_UNARY_OP_SWIGLU

  This commit implements the ggml op and CPU compute forward.
  I see ~3-4% speedup of PP-512 for Phi-3.5-mini.

* GGML_UNARY_OP_SWIGLU: CUDA implementation

  I observe ~12% speedup for PP-512(Phi-3.5-mini).

* GGML_UNARY_OP_SWIGLU: Metal implementation

  We get ~2% speedup for PP-512(Phi-3.5-mini).

* GGML_UNARY_OP_SWIGLU: minor improvement on Metal

* GGML_UNARY_OP_SWIGLU: cleanup

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
author: Kawrakow <iwankawrakow@gmail.com> 2024-09-28 13:37:25 +0300
committer: GitHub <noreply@github.com> 2024-09-28 13:37:25 +0300
commit: 737514fd814d944f8ce965620293a16e5e8a285d (patch)
tree: 4b4b79eec0d1cbcc413dd3c6991b6d57439edd86 /src/llama.cpp
parent: 1f61e91862dd0b077ccb60459f3cc03f364ee279 (diff)
1 files changed, 2 insertions, 10 deletions
diff --git a/src/llama.cpp b/src/llama.cpp
index 2cca5099..d52590a6 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8111,16 +8111,8 @@ static struct ggml_tensor * llm_build_ffn(
             } break;
         case LLM_FFN_SWIGLU:
             {
-                // Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
-                int64_t split_point = cur->ne[0] / 2;
-                struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], 0));
-                struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
-
-                x0 = ggml_silu(ctx, x0);
-                cb(cur, "ffn_silu", il);
-
-                cur = ggml_mul(ctx, x0, x1);
-                cb(cur, "ffn_mul", il);
+                cur = ggml_swiglu(ctx, cur);
+                cb(cur, "ffn_swiglu", il);
             } break;
     }
author	Kawrakow <iwankawrakow@gmail.com>	2024-09-28 13:37:25 +0300
committer	GitHub <noreply@github.com>	2024-09-28 13:37:25 +0300
commit	737514fd814d944f8ce965620293a16e5e8a285d (patch)
tree	4b4b79eec0d1cbcc413dd3c6991b6d57439edd86 /src/llama.cpp
parent	1f61e91862dd0b077ccb60459f3cc03f364ee279 (diff)