Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  223
1 file changed, 138 insertions(+), 85 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index 05094fec..18e473c0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4495,6 +4495,13 @@ static bool llm_load_tensors(
auto & hparams = model.hparams;
+#ifdef GGML_USE_SYCL
+ // disable MoE with SYCL until mul_mat_id is updated
+ if (hparams.n_expert > 0) {
+ n_gpu_layers = 0;
+ }
+#endif
+
model.split_mode = split_mode;
model.main_gpu = main_gpu;
model.n_gpu_layers = n_gpu_layers;
@@ -6099,6 +6106,100 @@ static struct ggml_tensor * llm_build_ffn(
return cur;
}
+static struct ggml_tensor * llm_build_moe_ffn(
+ struct ggml_context * ctx,
+ struct ggml_tensor * cur,
+ struct ggml_tensor * gate_inp,
+ struct ggml_tensor * up_exps,
+ struct ggml_tensor * gate_exps,
+ struct ggml_tensor * down_exps,
+ int64_t n_expert,
+ int64_t n_expert_used,
+ llm_ffn_op_type type_op,
+ bool norm_w,
+ const llm_build_cb & cb,
+ int il) {
+ int64_t n_embd = cur->ne[0];
+ int64_t n_tokens = cur->ne[1];
+
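+ // expert router: score each expert for every token with the gating weights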
+ ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens]
+ cb(logits, "ffn_moe_logits", il);
+
+ ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
+ cb(probs, "ffn_moe_probs", il);
+
+ // select experts
+ ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
+ cb(selected_experts, "ffn_moe_topk", il);
+
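+ // gather the probability of each selected expert: probs is reshaped to
+ // [1, n_expert, n_tokens] so that every expert probability becomes its own
+ // row and ggml_get_rows can pick the top-k values per token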
+ ggml_tensor * weights = ggml_get_rows(ctx,
+ ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
+ cb(weights, "ffn_moe_weights", il);
+
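+ // optionally renormalize the selected weights so they sum to 1 per token,
+ // e.g. raw top-2 weights {0.4, 0.1} become {0.8, 0.2} after renormalization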
+ if (norm_w) {
+ weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);
+
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
+ cb(weights_sum, "ffn_moe_weights_sum", il);
+
+ weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
+ cb(weights, "ffn_moe_weights_norm", il);
+
+ weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
+ }
+
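+ // add a broadcast dimension so ggml_mul_mat_id can apply a different expert
+ // matrix to each of the n_expert_used copies of every token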
+ cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
+ ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+ cb(up, "ffn_moe_up", il);
+
+ ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+ cb(gate, "ffn_moe_gate", il);
+
+ switch (type_op) {
+ case LLM_FFN_SILU:
+ {
+ gate = ggml_silu(ctx, gate);
+ cb(gate, "ffn_moe_silu", il);
+ } break;
+ case LLM_FFN_GELU:
+ {
+ gate = ggml_gelu(ctx, gate);
+ cb(gate, "ffn_moe_gelu", il);
+ } break;
+ default:
+ GGML_ASSERT(false);
+ }
+
+ ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
+ cb(par, "ffn_moe_gate_par", il);
+
+ ggml_tensor * experts = ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+ cb(experts, "ffn_moe_down", il);
+
+ experts = ggml_mul(ctx, experts, weights); // [n_embd, n_expert_used, n_tokens]
+
+ // aggregate experts
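+ // each expert contribution is a [n_embd, n_tokens] 2D view into the 3D
+ // result: row stride nb[2] steps over tokens, offset i*nb[1] selects expert i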
+ ggml_tensor * moe_out = nullptr;
+ for (int i = 0; i < n_expert_used; ++i) {
+ ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
+ experts->nb[2], i*experts->nb[1]);
+
+ if (i == 0) {
+ moe_out = cur_expert;
+ } else {
+ moe_out = ggml_add(ctx, moe_out, cur_expert);
+ }
+ }
+
+ if (n_expert_used == 1) {
+ // avoid returning a non-contiguous tensor
+ moe_out = ggml_cont(ctx, moe_out);
+ }
+
+ return moe_out;
+}
+
// if max_alibi_bias > 0 then apply ALiBi
static struct ggml_tensor * llm_build_kqv(
struct ggml_context * ctx,
@@ -6642,7 +6743,15 @@ struct llm_build_context {
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
- cur = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, true, il);
+ cur = llm_build_moe_ffn(ctx0, cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ cb, il);
+ cb(cur, "ffn_moe_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
@@ -6674,80 +6783,6 @@ struct llm_build_context {
return gf;
}
- // REVIEW: will be replaced by https://github.com/ggerganov/llama.cpp/pull/6505
- ggml_tensor * build_moe_ffn(ggml_tensor * cur, int32_t n_tokens, llm_ffn_op_type type_op, bool norm_w, int il) {
- ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
- cb(logits, "ffn_moe_logits", il);
-
- ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
- cb(probs, "ffn_moe_probs", il);
-
- // select experts
- ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
- cb(selected_experts->src[0], "ffn_moe_argsort", il);
-
- ggml_tensor * weights = ggml_get_rows(ctx0,
- ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
- cb(weights, "ffn_moe_weights", il);
-
- weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
-
- if (norm_w) {
- ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
- cb(weights_sum, "ffn_moe_weights_sum", il);
-
- weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
- cb(weights, "ffn_moe_weights_norm", il);
- }
-
- // compute expert outputs
- ggml_tensor * moe_out = nullptr;
-
- for (int i = 0; i < n_expert_used; ++i) {
- ggml_tensor * cur_expert;
-
- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
- cb(cur_up, "ffn_moe_up", il);
-
- ggml_tensor * gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
- cb(gate, "ffn_moe_gate", il);
-
- switch (type_op) {
- case LLM_FFN_SILU:
- {
- gate = ggml_silu(ctx0, gate);
- cb(gate, "ffn_moe_silu", il);
- } break;
- case LLM_FFN_GELU:
- {
- gate = ggml_gelu(ctx0, gate);
- cb(gate, "ffn_moe_gelu", il);
- } break;
- default:
- GGML_ASSERT(false);
- }
-
- cur_expert = ggml_mul(ctx0, cur_up, gate);
- cb(cur_expert, "ffn_moe_gate_par", il);
-
- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
- cb(cur_expert, "ffn_moe_down", il);
-
- cur_expert = ggml_mul(ctx0, cur_expert,
- ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
- cb(cur_expert, "ffn_moe_weighted", il);
-
- if (i == 0) {
- moe_out = cur_expert;
- } else {
- moe_out = ggml_add(ctx0, moe_out, cur_expert);
- cb(moe_out, "ffn_moe_out", il);
- }
- }
-
- return moe_out;
- }
-
struct ggml_cgraph * build_baichuan() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@@ -7195,7 +7230,15 @@ struct llm_build_context {
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
- cur = build_moe_ffn(cur, n_tokens, LLM_FFN_GELU, true, il);
+ cur = llm_build_moe_ffn(ctx0, cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ n_expert, n_expert_used,
+ LLM_FFN_GELU, true,
+ cb, il);
+ cb(cur, "ffn_moe_out", il);
// Grok
// if layer_out_norm is present then apply it before adding the input
@@ -7207,7 +7250,6 @@ struct llm_build_context {
cb(cur, "layer_out_norm", il);
}
-
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
@@ -7331,7 +7373,15 @@ struct llm_build_context {
LLM_NORM, cb, il);
cb(cur, "attn_out_norm", il);
- cur = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, true, il);
+ cur = llm_build_moe_ffn(ctx0, cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ cb, il);
+ cb(cur, "ffn_moe_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
@@ -8502,12 +8552,6 @@ struct llm_build_context {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
- // these nodes are added to the graph together so that they are not reordered
- // by doing so, the number of splits in the graph is reduced
- ggml_build_forward_expand(gf, Qcur);
- ggml_build_forward_expand(gf, Kcur);
- ggml_build_forward_expand(gf, Vcur);
-
Qcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -8658,7 +8702,16 @@ struct llm_build_context {
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
- ggml_tensor * moe_out = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, false, il);
+ ggml_tensor * moe_out =
+ llm_build_moe_ffn(ctx0, cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, false,
+ cb, il);
+ cb(cur, "ffn_moe_out", il);
// FFN shared expert
{