author    slaren <slarengh@gmail.com>    2024-04-18 15:18:48 +0200
committer GitHub <noreply@github.com>    2024-04-18 15:18:48 +0200
commit    0d56246f4b9764158525d894b96606f6163c53a8
tree      43e57dfbbde67b701020fc3e2ac885e846925d26 /llama.cpp
parent    03c0946d73c63ea73e1d85015b7088298443d438
ggml : group all experts in a single ggml_mul_mat_id (#6505)
* ggml : group all experts in a single ggml_mul_mat_id
cuda : improve mmid row copy
* cuda : fix bin bcast with non-cont src0
* test-backend-ops : only run all mul mat tests for base types
* llama : disable moe offloading with SYCL
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
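
The key API change: previously each used expert required its own ggml_mul_mat_id call taking an explicit expert index, while the new llm_build_moe_ffn helper evaluates all selected experts with one call per expert weight tensor. A minimal sketch of the before/after call pattern, with shapes taken from the comments in the diff below (ctx, cur, up_exps, selected_experts, n_embd and n_tokens assumed set up as in llm_build_moe_ffn):

    // before: one matrix multiply per used expert, selected by index i
    //     ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, up_exps, selected_experts, i, cur);
    // after: broadcast the input to [n_embd, 1, n_tokens]; the ids tensor from
    // ggml_top_k ([n_expert_used, n_tokens]) picks the experts in a single call
    cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
    ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]

Because the expert outputs now land in one 3D tensor, the weighted aggregation also collapses: a single ggml_mul by the routing weights, followed by summing n_expert_used 2D views of the [n_embd, n_expert_used, n_tokens] result.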
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  223
1 file changed, 138 insertions, 85 deletions
@@ -4495,6 +4495,13 @@ static bool llm_load_tensors(
     auto & hparams = model.hparams;

+#ifdef GGML_USE_SYCL
+    // disable MoE with SYCL until mul_mat_id is updated
+    if (hparams.n_expert > 0) {
+        n_gpu_layers = 0;
+    }
+#endif
+
     model.split_mode = split_mode;
     model.main_gpu = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
@@ -6099,6 +6106,100 @@ static struct ggml_tensor * llm_build_ffn(
     return cur;
 }

+static struct ggml_tensor * llm_build_moe_ffn(
+        struct ggml_context * ctx,
+         struct ggml_tensor * cur,
+         struct ggml_tensor * gate_inp,
+         struct ggml_tensor * up_exps,
+         struct ggml_tensor * gate_exps,
+         struct ggml_tensor * down_exps,
+                    int64_t   n_expert,
+                    int64_t   n_expert_used,
+            llm_ffn_op_type   type_op,
+                       bool   norm_w,
+         const llm_build_cb & cb,
+                        int   il) {
+    int64_t n_embd = cur->ne[0];
+    int64_t n_tokens = cur->ne[1];
+
+    ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens]
+    cb(logits, "ffn_moe_logits", il);
+
+    ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
+    cb(probs, "ffn_moe_probs", il);
+
+    // select experts
+    ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
+    cb(selected_experts->src[0], "ffn_moe_argsort", il);
+    cb(selected_experts, "ffn_moe_topk", il);
+
+    ggml_tensor * weights = ggml_get_rows(ctx,
+            ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
+    cb(weights, "ffn_moe_weights", il);
+
+    if (norm_w) {
+        weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);
+
+        ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
+        cb(weights_sum, "ffn_moe_weights_sum", il);
+
+        weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
+        cb(weights, "ffn_moe_weights_norm", il);
+
+        weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
+    }
+
+    cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
+    ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+    cb(up, "ffn_moe_up", il);
+
+    ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+    cb(gate, "ffn_moe_gate", il);
+
+    switch (type_op) {
+        case LLM_FFN_SILU:
+            {
+                gate = ggml_silu(ctx, gate);
+                cb(gate, "ffn_moe_silu", il);
+            } break;
+        case LLM_FFN_GELU:
+            {
+                gate = ggml_gelu(ctx, gate);
+                cb(gate, "ffn_moe_gelu", il);
+            } break;
+        default:
+            GGML_ASSERT(false);
+    }
+
+    ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
+    cb(par, "ffn_moe_gate_par", il);
+
+    ggml_tensor * experts = ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    cb(experts, "ffn_moe_down", il);
+
+    experts = ggml_mul(ctx, experts, weights);
+
+    // aggregate experts
+    ggml_tensor * moe_out = nullptr;
+    for (int i = 0; i < n_expert_used; ++i) {
+        ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
+                experts->nb[2], i*experts->nb[1]);
+
+        if (i == 0) {
+            moe_out = cur_expert;
+        } else {
+            moe_out = ggml_add(ctx, moe_out, cur_expert);
+        }
+    }
+
+    if (n_expert_used == 1) {
+        // avoid returning a non-contiguous tensor
+        moe_out = ggml_cont(ctx, moe_out);
+    }
+
+    return moe_out;
+}
+
 // if max_alibi_bias > 0 then apply ALiBi
 static struct ggml_tensor * llm_build_kqv(
         struct ggml_context * ctx,
@@ -6642,7 +6743,15 @@ struct llm_build_context {
                     LLM_NORM_RMS, cb, il);
             cb(cur, "ffn_norm", il);

-            cur = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, true, il);
+            cur = llm_build_moe_ffn(ctx0, cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    cb, il);
+            cb(cur, "ffn_moe_out", il);
         }

         cur = ggml_add(ctx0, cur, ffn_inp);
@@ -6674,80 +6783,6 @@ struct llm_build_context {
         return gf;
     }

-    // REVIEW: will be replaced by https://github.com/ggerganov/llama.cpp/pull/6505
-    ggml_tensor * build_moe_ffn(ggml_tensor * cur, int32_t n_tokens, llm_ffn_op_type type_op, bool norm_w, int il) {
-        ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
-        cb(logits, "ffn_moe_logits", il);
-
-        ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
-        cb(probs, "ffn_moe_probs", il);
-
-        // select experts
-        ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
-        cb(selected_experts->src[0], "ffn_moe_argsort", il);
-
-        ggml_tensor * weights = ggml_get_rows(ctx0,
-                ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
-        cb(weights, "ffn_moe_weights", il);
-
-        weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
-
-        if (norm_w) {
-            ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
-            cb(weights_sum, "ffn_moe_weights_sum", il);
-
-            weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
-            cb(weights, "ffn_moe_weights_norm", il);
-        }
-
-        // compute expert outputs
-        ggml_tensor * moe_out = nullptr;
-
-        for (int i = 0; i < n_expert_used; ++i) {
-            ggml_tensor * cur_expert;
-
-            ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
-            cb(cur_up, "ffn_moe_up", il);
-
-            ggml_tensor * gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
-            cb(gate, "ffn_moe_gate", il);
-
-            switch (type_op) {
-                case LLM_FFN_SILU:
-                    {
-                        gate = ggml_silu(ctx0, gate);
-                        cb(gate, "ffn_moe_silu", il);
-                    } break;
-                case LLM_FFN_GELU:
-                    {
-                        gate = ggml_gelu(ctx0, gate);
-                        cb(gate, "ffn_moe_gelu", il);
-                    } break;
-                default:
-                    GGML_ASSERT(false);
-            }

-            cur_expert = ggml_mul(ctx0, cur_up, gate);
-            cb(cur_expert, "ffn_moe_gate_par", il);
-
-            cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
-            cb(cur_expert, "ffn_moe_down", il);
-
-            cur_expert = ggml_mul(ctx0, cur_expert,
-                    ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
-            cb(cur_expert, "ffn_moe_weighted", il);
-
-            if (i == 0) {
-                moe_out = cur_expert;
-            } else {
-                moe_out = ggml_add(ctx0, moe_out, cur_expert);
-                cb(moe_out, "ffn_moe_out", il);
-            }
-        }
-
-        return moe_out;
-    }
-
     struct ggml_cgraph * build_baichuan() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@@ -7195,7 +7230,15 @@ struct llm_build_context {
                     LLM_NORM_RMS, cb, il);
             cb(cur, "ffn_norm", il);

-            cur = build_moe_ffn(cur, n_tokens, LLM_FFN_GELU, true, il);
+            cur = llm_build_moe_ffn(ctx0, cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    n_expert, n_expert_used,
+                    LLM_FFN_GELU, true,
+                    cb, il);
+            cb(cur, "ffn_moe_out", il);

             // Grok
             // if layer_out_norm is present then apply it before adding the input
@@ -7207,7 +7250,6 @@ struct llm_build_context {
                 cb(cur, "layer_out_norm", il);
             }

-
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
@@ -7331,7 +7373,15 @@ struct llm_build_context {
                     LLM_NORM, cb, il);
             cb(cur, "attn_out_norm", il);

-            cur = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, true, il);
+            cur = llm_build_moe_ffn(ctx0, cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    cb, il);
+            cb(cur, "ffn_moe_out", il);

             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
@@ -8502,12 +8552,6 @@ struct llm_build_context {
                 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                 cb(Vcur, "Vcur", il);

-                // these nodes are added to the graph together so that they are not reordered
-                // by doing so, the number of splits in the graph is reduced
-                ggml_build_forward_expand(gf, Qcur);
-                ggml_build_forward_expand(gf, Kcur);
-                ggml_build_forward_expand(gf, Vcur);
-
                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -8658,7 +8702,16 @@ struct llm_build_context {
                     LLM_NORM_RMS, cb, il);
             cb(cur, "ffn_norm", il);

-            ggml_tensor * moe_out = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, false, il);
+            ggml_tensor * moe_out =
+                    llm_build_moe_ffn(ctx0, cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, false,
+                        cb, il);
+            cb(cur, "ffn_moe_out", il);

             // FFN shared expert
             {