| author | slaren <slarengh@gmail.com> | 2024-04-03 15:07:05 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-04-03 16:07:05 +0300 |
| commit | 08a0c0206075556e82aca0feafad530dcc5f1426 (patch) | |
| tree | 3937cd263076c548ba25348253dcec6d355b8fef /tests/test-backend-ops.cpp | |
| parent | 52604860f93063ef98863921da697576af1c7665 (diff) | |
ggml : mul_mat_id use the same tensor for all the experts (#6387)
* ggml : update mul_mat_id to use the same tensor for all the experts
* update cuda
* minor
* update metal
* update test-backend-ops
* fix cuda
* Update ggml-metal.m
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* update convert.py
* update convert-hf-to-gguf.py
* update convert.py for mixtral hf models
* Update convert-hf-to-gguf.py
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* cuda : support non-pow-2 number of experts
* allow quantize to work for split and merged experts models in the same way
* cleanup + disable mmap automatically with split tensors models
* update imatrix
* test-backend-ops : test qwen argsort
* update grok model loading
* llama : add merged experts tensors to the grok tensor map
* minor
* gguf : bump version
* fix quantizing of merged experts
* convert-hf-to-gguf.py : update grok (untested)
* make linter happy
* cuda/argsort : use shared memory instead of pool memory
* convert : fix grok tensor names
* metal : add support for non-pow-2 argsort
* llama : more loader cleanup, better error checking
* cuda : fix warning
* llama : still use mmap for loading old models, but copy the data to a host buffer
* add review note
* llama : remove ffn tensor counting + add sanity check
ggml-ci
* convert : fix handling of n_experts == None
ggml-ci
* imatrix : fix ncall counters
* llama : produce error if imatrix size does not match
* quantize : terminate on errors + trace logs
ggml-ci
* metal : pad shared memory to 16 bytes
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
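
The core API change can be summarized before reading the diff: `ggml_mul_mat_id` no longer takes an array of per-expert 2D matrices plus a count, but a single 3D tensor whose third dimension indexes the experts. Below is a condensed before/after sketch based on `test_mul_mat_id::build_graph` in the diff further down; it assumes a live `ggml_context * ctx` and the test's `type_a`/`type_b`, `k`, `m`, `n`, `n_mats`, and `id` variables, and is not meant to compile on its own.

```cpp
// Condensed from test_mul_mat_id::build_graph in the diff below; assumes a
// live ggml_context `ctx` and the test's type_a/type_b, k, m, n, n_mats, id.

// Before this commit: one 2D tensor per expert, passed as an array plus a count.
//   std::vector<ggml_tensor *> mats(n_mats);
//   for (int i = 0; i < n_mats; i++) {
//       mats[i] = ggml_new_tensor_2d(ctx, type_a, k, m);
//   }
//   ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), n_mats, ids, id, b);

// After this commit: all experts stacked into a single 3D tensor [k, m, n_mats].
ggml_tensor * mats = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
ggml_tensor * ids  = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
ggml_tensor * b    = ggml_new_tensor_2d(ctx, type_b, k, n);
ggml_tensor * out  = ggml_mul_mat_id(ctx, mats, ids, id, b);
```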
Diffstat (limited to 'tests/test-backend-ops.cpp')
-rw-r--r-- | tests/test-backend-ops.cpp | 99 |
1 file changed, 3 insertions, 96 deletions
```diff
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 5dfea566..51b3487b 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -979,17 +979,13 @@ struct test_mul_mat_id : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         // C^T = A * B^T: (k, m) * (k, n) => (m, n)
-        std::vector<ggml_tensor *> mats;
-        for (int i = 0; i < n_mats; i++) {
-            ggml_tensor * a = ggml_new_tensor_2d(ctx, type_a, k, m);
-            mats.push_back(a);
-        }
+        ggml_tensor * mats = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
         ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
         if (v) {
             ids = ggml_view_2d(ctx, ids, n_mats/2, ids->ne[1], ids->nb[1], 0);
         }
         ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n);
-        ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), n_mats, ids, v ? id/2 : id, b);
+        ggml_tensor * out = ggml_mul_mat_id(ctx, mats, ids, v ? id/2 : id, b);
         return out;
     }
 
@@ -1477,91 +1473,6 @@ struct test_leaky_relu : public test_case {
     }
 };
 
-// Mixtral MOE
-struct test_moe : public test_case {
-    const int n_experts;
-    const int n_experts_per_tok;
-    const int n_tokens;
-    const int n_embd;
-    const int n_ff;
-
-    std::string op_desc(ggml_tensor * t) override {
-        return "MOE";
-
-        GGML_UNUSED(t);
-    }
-
-    std::string vars() override {
-        return VARS_TO_STR5(n_experts, n_experts_per_tok, n_tokens, n_embd, n_ff);
-    }
-
-    test_moe(int n_experts = 8, int n_experts_per_tok = 2, int n_tokens = 1, int n_embd = 4096, int n_ff = 14336)
-        : n_experts(n_experts), n_experts_per_tok(n_experts_per_tok), n_tokens(n_tokens), n_embd(n_embd), n_ff(n_ff) {
-    }
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * ffn_gate_inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_experts);
-
-        std::vector<ggml_tensor *> ffn_up_exp(n_experts);
-        std::vector<ggml_tensor *> ffn_gate_exp(n_experts);
-        std::vector<ggml_tensor *> ffn_down_exp(n_experts);
-
-        for (int i = 0; i < n_experts; ++i) {
-            ffn_up_exp[i]   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
-            ffn_gate_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
-            ffn_down_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
-        }
-
-        ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
-
-        ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur);
-        ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, nullptr, 1.0f/sqrtf(n_embd), 0.0f);
-
-        // select experts
-        ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok);
-
-        ggml_tensor * weights = ggml_get_rows(ctx,
-                ggml_reshape_3d(ctx, probs, 1, n_experts, n_tokens), selected_experts);
-
-        weights = ggml_reshape_2d(ctx, weights, n_experts_per_tok, n_tokens);
-
-        ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights);
-
-        weights = ggml_div(ctx, weights, weights_sum);
-
-        // compute expert outputs
-        ggml_tensor * moe_out = nullptr;
-
-        for (int i = 0; i < n_experts_per_tok; ++i) {
-            ggml_tensor * cur_expert;
-
-            ggml_tensor * cur_up = ggml_mul_mat_id(ctx, ffn_up_exp.data(), n_experts, selected_experts, i, cur);
-
-            ggml_tensor * cur_gate = ggml_mul_mat_id(ctx, ffn_gate_exp.data(), n_experts, selected_experts, i, cur);
-
-            cur_gate = ggml_silu(ctx, cur_gate);
-
-            cur_expert = ggml_mul(ctx, cur_up, cur_gate);
-
-            cur_expert = ggml_mul_mat_id(ctx, ffn_down_exp.data(), n_experts, selected_experts, i, cur_expert);
-
-            cur_expert = ggml_mul(ctx, cur_expert,
-                    ggml_view_2d(ctx, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
-
-            if (i == 0) {
-                moe_out = cur_expert;
-            } else {
-                moe_out = ggml_add(ctx, moe_out, cur_expert);
-            }
-        }
-
-        cur = moe_out;
-
-        return cur;
-    }
-};
-
-
 enum llm_norm_type {
     LLM_NORM,
     LLM_NORM_RMS,
@@ -2169,6 +2080,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
         test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
         test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
+        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen
     }
 
     test_cases.emplace_back(new test_sum_rows());
@@ -2182,11 +2094,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
 
     // these tests are disabled to save execution time, but they can be handy for debugging
 #if 0
-#if !defined(__SANITIZE_THREAD__)
-    // FIXME: these tests use too much memory with thread sanitizer
-    test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
-    //test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
-#endif
     test_cases.emplace_back(new test_llama(1));
     test_cases.emplace_back(new test_llama(2));
     test_cases.emplace_back(new test_falcon(1));
```
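
For orientation, here is a minimal sketch of how the removed `test_moe` graph maps onto the merged-experts layout. The helper name `build_moe_ffn_sketch` and the merged `ffn_up_exps`/`ffn_gate_exps`/`ffn_down_exps` parameters and shapes are illustrative assumptions, not the in-tree llama.cpp code; the graph structure is taken from the deleted test case, with the `ggml_mul_mat_id` signature introduced by this commit.

```cpp
#include "ggml.h"

#include <cmath>

// Illustrative sketch only: each expert projection is stored as one 3D tensor
// with the expert index as the third dimension, matching the updated
// test_mul_mat_id above. The function name and the merged *_exps shapes are
// assumptions for illustration; the graph structure follows the deleted
// test_moe case.
static ggml_tensor * build_moe_ffn_sketch(
        ggml_context * ctx,
        ggml_tensor  * cur,            // [n_embd, n_tokens] input activations
        ggml_tensor  * ffn_gate_inp,   // [n_embd, n_experts] router weights
        ggml_tensor  * ffn_up_exps,    // [n_embd, n_ff, n_experts] merged experts
        ggml_tensor  * ffn_gate_exps,  // [n_embd, n_ff, n_experts] merged experts
        ggml_tensor  * ffn_down_exps,  // [n_ff, n_embd, n_experts] merged experts
        int n_experts, int n_experts_per_tok, int n_tokens, int n_embd) {
    // router: logits -> probabilities -> top-k expert ids per token
    ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur);
    ggml_tensor * probs  = ggml_soft_max_ext(ctx, logits, nullptr, nullptr, 1.0f/sqrtf(n_embd), 0.0f);
    ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok);

    // gather and renormalize the weights of the selected experts
    ggml_tensor * weights = ggml_get_rows(ctx,
            ggml_reshape_3d(ctx, probs, 1, n_experts, n_tokens), selected_experts);
    weights = ggml_reshape_2d(ctx, weights, n_experts_per_tok, n_tokens);
    weights = ggml_div(ctx, weights, ggml_sum_rows(ctx, weights));

    // accumulate the weighted expert outputs
    ggml_tensor * moe_out = nullptr;
    for (int i = 0; i < n_experts_per_tok; ++i) {
        // one merged tensor per projection instead of an array of 2D tensors
        ggml_tensor * cur_up   = ggml_mul_mat_id(ctx, ffn_up_exps,   selected_experts, i, cur);
        ggml_tensor * cur_gate = ggml_mul_mat_id(ctx, ffn_gate_exps, selected_experts, i, cur);
        ggml_tensor * cur_expert = ggml_mul(ctx, cur_up, ggml_silu(ctx, cur_gate));
        cur_expert = ggml_mul_mat_id(ctx, ffn_down_exps, selected_experts, i, cur_expert);
        cur_expert = ggml_mul(ctx, cur_expert,
                ggml_view_2d(ctx, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
        moe_out = i == 0 ? cur_expert : ggml_add(ctx, moe_out, cur_expert);
    }
    return moe_out;
}
```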