ggml : group all experts in a single ggml_mul_mat_id (#6505)

* ggml : group all experts in a single ggml_mul_mat_id
  cuda : improve mmid row copy
* cuda : fix bin bcast with non-cont src0
* test-backend-ops : only run all mul mat tests for base types
* llama : disable moe offloading with SYCL

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
author: slaren <slarengh@gmail.com> 2024-04-18 15:18:48 +0200
committer: GitHub <noreply@github.com> 2024-04-18 15:18:48 +0200
commit: 0d56246f4b9764158525d894b96606f6163c53a8 (patch)
tree: 43e57dfbbde67b701020fc3e2ac885e846925d26 /ggml.h
parent: 03c0946d73c63ea73e1d85015b7088298443d438 (diff)
1 files changed, 2 insertions, 4 deletions
diff --git a/ggml.h b/ggml.h
index e9ed8eee..4d1d77fe 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1161,13 +1161,11 @@ extern "C" {
             enum ggml_prec       prec);
 
     // indirect matrix multiplication
-    //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
     GGML_API struct ggml_tensor * ggml_mul_mat_id(
             struct ggml_context * ctx,
             struct ggml_tensor  * as,
-            struct ggml_tensor  * ids,
-            int                   id,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * ids);
 
     // A: m columns, n rows,
     // B: p columns, n rows,
author	slaren <slarengh@gmail.com>	2024-04-18 15:18:48 +0200
committer	GitHub <noreply@github.com>	2024-04-18 15:18:48 +0200
commit	0d56246f4b9764158525d894b96606f6163c53a8 (patch)
tree	43e57dfbbde67b701020fc3e2ac885e846925d26 /ggml.h
parent	03c0946d73c63ea73e1d85015b7088298443d438 (diff)