diff options
Diffstat (limited to 'src/llama.cpp')
-rw-r--r-- | src/llama.cpp | 48 |
1 files changed, 23 insertions, 25 deletions
diff --git a/src/llama.cpp b/src/llama.cpp index 11a7060c..a1821d2d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9870,6 +9870,28 @@ llm_expert_gating_func_type gating_op, cb(cur, "ffn_moe_weighted", il); } +#ifdef GGML_USE_VULKAN + // aggregate experts + ggml_tensor * moe_out = nullptr; + //ggml_tensor * first_expert = nullptr; + for (int i = 0; i < n_expert_used; ++i) { + ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens, + experts->nb[2], i*experts->nb[1]); + + if (i == 0) { + moe_out = cur_expert; + } else { + moe_out = ggml_add(ctx, moe_out, cur_expert); + } + } + + if (n_expert_used == 1) { + // avoid returning a non-contiguous tensor + moe_out = ggml_cont(ctx, moe_out); + } + + return moe_out; +#else if (n_expert_used == 1) { return ggml_cont(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0)); } @@ -9878,32 +9900,8 @@ llm_expert_gating_func_type gating_op, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], experts->nb[1])); } return ggml_multi_add(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0), n_expert_used); +#endif - //// aggregate experts - //ggml_tensor * moe_out = nullptr; - ////ggml_tensor * first_expert = nullptr; - //for (int i = 0; i < n_expert_used; ++i) { - // ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens, - // experts->nb[2], i*experts->nb[1]); - - // if (i == 0) { - // moe_out = cur_expert; - // //first_expert = cur_expert; - // //printf("%s: %d: %d x %d x %d x %d | %d x %d x %d x %d\n", __func__, ggml_is_contiguous(first_expert), - // // (int)cur_expert->ne[0], (int)cur_expert->ne[1], (int)cur_expert->ne[2], (int)cur_expert->ne[3], - // // (int)cur_expert->nb[0], (int)cur_expert->nb[1], (int)cur_expert->nb[2], (int)cur_expert->nb[3]); - // } else { - // moe_out = ggml_add(ctx, moe_out, cur_expert); - // //printf("%s: %d %d\n", __func__, ggml_is_contiguous(cur_expert), ggml_are_same_shape(cur_expert, first_expert)); - // } - //} - - //if (n_expert_used == 1) { - // // avoid returning a non-contiguous tensor - // moe_out = ggml_cont(ctx, moe_out); - //} - - //return moe_out; } static struct ggml_tensor * llm_build_kqv( |