author     Kawrakow <iwankawrakow@gmail.com>  2025-07-03 18:31:48 +0200
committer  GitHub <noreply@github.com>        2025-07-03 18:31:48 +0200
commit     3e024de1dae45d17110a7dfe02cadea2eb111f51 (patch)
tree       165e0c366aa64f7de40c5cbf013b8a7674b19e7f
parent     8a0c38f496f60bb0f627521823de604ce10fdc16 (diff)
Vulkan: Disable multi-add for now (#581)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
-rw-r--r--  ggml/src/ggml-vulkan.cpp | 44
-rw-r--r--  src/llama.cpp            | 48
2 files changed, 63 insertions, 29 deletions
diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index 7c25f3b5..e8b5403c 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -5520,6 +5520,9 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
     const uint64_t nei0 = ids->ne[0];
     const uint64_t nei1 = ids->ne[1];
+    if (nei0*nei1 > 4096) {
+        fprintf(stderr, "%s: nei0 = %d, nei1 = %d\n", __func__, (int)nei0, (int)nei1);
+    }
     GGML_ASSERT(nei0 * nei1 <= 4096);

     const uint32_t nbi1 = ids->nb[1];
@@ -5915,7 +5918,30 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx
     if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
         ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
     } else {
-        ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
+        // Split based on number of ids, to fit in shared memory
+        const uint32_t nei0 = (uint32_t)src2->ne[0];
+        const uint32_t nei1 = (uint32_t)src2->ne[1];
+
+        GGML_ASSERT(nei0 <= 4096);
+        const uint32_t split_size = std::min(nei1, 4096u / nei0);
+
+        ggml_tensor src1_copy = *src1;
+        ggml_tensor src2_copy = *src2;
+        ggml_tensor dst_copy = *dst;
+
+        for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
+            const uint32_t n_tokens = std::min(split_size, nei1 - token_start);
+
+            src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2];
+            src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1];
+            dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2];
+
+            src1_copy.ne[2] = n_tokens;
+            src2_copy.ne[1] = n_tokens;
+            dst_copy.ne[2] = n_tokens;
+
+            ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
+        }
     }
 }

@@ -9510,9 +9536,15 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
             ggml_type src0_type = op->src[0]->type;
             ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
             const vk_device& device = ctx->device;
-            if (op->op == GGML_OP_MUL_MAT_ID && !device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) {
-                // If there's not enough shared memory for row_ids and the result tile, fallback to CPU
-                return false;
+            if (op->op == GGML_OP_MUL_MAT_ID) {
+                if (!device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) {
+                    // If there's not enough shared memory for row_ids and the result tile, fallback to CPU
+                    return false;
+                }
+                // Check against size of shared memory variable
+                if (op->src[2]->ne[0] > 4096) {
+                    return false;
+                }
             }
             switch (src0_type) {
                 case GGML_TYPE_F32:
@@ -9580,6 +9612,10 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                 default:
                     return false;
             }
+            if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
+                // different head sizes of K and V are not supported yet
+                return false;
+            }
             if (op->src[0]->type != GGML_TYPE_F32) {
                 return false;
            }
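The core of the ggml-vulkan.cpp change is the chunking arithmetic: ggml_vk_mul_mat_id_q_f16 keeps the expert-id matrix (nei0 x nei1) in shared memory, which holds at most 4096 entries, so the call is now split along the token dimension into chunks of at most 4096/nei0 tokens. The following standalone sketch illustrates only that arithmetic; the nei0/nei1 values are made up for illustration, and only the split_size formula and loop shape come from the patch.

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t nei0 = 8;     // ids per token (e.g. n_expert_used); assumed value
    const uint32_t nei1 = 1000;  // number of tokens; assumed value

    // Same formula as the patch: the largest token count whose id block fits in 4096 entries.
    const uint32_t split_size = std::min(nei1, 4096u / nei0);

    for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
        const uint32_t n_tokens = std::min(split_size, nei1 - token_start);
        std::printf("chunk: tokens [%u, %u), ids = %u (<= 4096)\n",
                    token_start, token_start + n_tokens, nei0 * n_tokens);
    }
    return 0;
}

Every chunk satisfies nei0 * n_tokens <= 4096, which is exactly the GGML_ASSERT inside ggml_vk_mul_mat_id_q_f16; the new supports_op check (op->src[2]->ne[0] > 4096) rejects the one case the split cannot fix, namely a single token already carrying more than 4096 ids.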
diff --git a/src/llama.cpp b/src/llama.cpp
index 11a7060c..a1821d2d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -9870,6 +9870,28 @@ llm_expert_gating_func_type gating_op,
         cb(cur, "ffn_moe_weighted", il);
     }

+#ifdef GGML_USE_VULKAN
+    // aggregate experts
+    ggml_tensor * moe_out = nullptr;
+    //ggml_tensor * first_expert = nullptr;
+    for (int i = 0; i < n_expert_used; ++i) {
+        ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
+                experts->nb[2], i*experts->nb[1]);
+
+        if (i == 0) {
+            moe_out = cur_expert;
+        } else {
+            moe_out = ggml_add(ctx, moe_out, cur_expert);
+        }
+    }
+
+    if (n_expert_used == 1) {
+        // avoid returning a non-contiguous tensor
+        moe_out = ggml_cont(ctx, moe_out);
+    }
+
+    return moe_out;
+#else
     if (n_expert_used == 1) {
         return ggml_cont(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0));
     }
@@ -9878,32 +9900,8 @@ llm_expert_gating_func_type gating_op,
                 ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], experts->nb[1]));
     }
     return ggml_multi_add(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0), n_expert_used);
+#endif

-    //// aggregate experts
-    //ggml_tensor * moe_out = nullptr;
-    ////ggml_tensor * first_expert = nullptr;
-    //for (int i = 0; i < n_expert_used; ++i) {
-    //    ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
-    //            experts->nb[2], i*experts->nb[1]);
-
-    //    if (i == 0) {
-    //        moe_out = cur_expert;
-    //        //first_expert = cur_expert;
-    //        //printf("%s: %d: %d x %d x %d x %d | %d x %d x %d x %d\n", __func__, ggml_is_contiguous(first_expert),
-    //        //    (int)cur_expert->ne[0], (int)cur_expert->ne[1], (int)cur_expert->ne[2], (int)cur_expert->ne[3],
-    //        //    (int)cur_expert->nb[0], (int)cur_expert->nb[1], (int)cur_expert->nb[2], (int)cur_expert->nb[3]);
-    //    } else {
-    //        moe_out = ggml_add(ctx, moe_out, cur_expert);
-    //        //printf("%s: %d %d\n", __func__, ggml_is_contiguous(cur_expert), ggml_are_same_shape(cur_expert, first_expert));
-    //    }
-    //}
-
-    //if (n_expert_used == 1) {
-    //    // avoid returning a non-contiguous tensor
-    //    moe_out = ggml_cont(ctx, moe_out);
-    //}
-
-    //return moe_out;
 }

 static struct ggml_tensor * llm_build_kqv(
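For reference, the two expert-aggregation paths that the #ifdef above selects between can be read side by side in one helper. This is a sketch, not code from the repo: it assumes the ggml C API as used in this fork (ggml_view_2d, ggml_add, ggml_cont, and the fork's fused ggml_multi_add with the same arguments as in the diff), and the aggregate_experts helper name is invented for illustration.

#include "ggml.h"

// Sketch: sum the per-expert outputs stored in `experts` ([n_embd, n_expert_used, n_tokens]).
// use_multi_add == false mirrors the Vulkan path this commit enables;
// use_multi_add == true mirrors the default fused path kept under #else.
static struct ggml_tensor * aggregate_experts(
        struct ggml_context * ctx,
        struct ggml_tensor  * experts,
        int64_t n_embd, int64_t n_tokens, int n_expert_used,
        bool use_multi_add) {
    if (use_multi_add) {
        // one fused op over n_expert_used consecutive 2d views
        return ggml_multi_add(ctx,
                ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0),
                n_expert_used);
    }
    // chain of binary adds, one graph node per extra expert
    struct ggml_tensor * moe_out = NULL;
    for (int i = 0; i < n_expert_used; ++i) {
        struct ggml_tensor * cur = ggml_view_2d(ctx, experts, n_embd, n_tokens,
                experts->nb[2], i*experts->nb[1]);
        moe_out = i == 0 ? cur : ggml_add(ctx, moe_out, cur);
    }
    if (n_expert_used == 1) {
        moe_out = ggml_cont(ctx, moe_out); // avoid returning a non-contiguous view
    }
    return moe_out;
}

The chained form builds n_expert_used - 1 add nodes instead of a single fused multi-add node; under GGML_USE_VULKAN the commit accepts that extra graph overhead so the Vulkan backend does not have to execute the multi-add kernel for now, as the commit title says.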