metal : utilize max shared memory for mul_mat_id (#7935)

author: Georgi Gerganov <ggerganov@gmail.com> 2024-06-14 17:14:09 +0300
committer: GitHub <noreply@github.com> 2024-06-14 17:14:09 +0300
commit: 66ef1ceedf983773c8ceb4d925285d41d4e50e2a (patch)
tree: 5730d00a2043129897aecdc1a9e1649a625f1349
parent: e65bbf606c61f49dc06c7ac060cd5ba7ae446025 (diff)
1 file changed, 2 insertions, 1 deletion
diff --git a/ggml-metal.m b/ggml-metal.m
index ec9e9530..f894274c 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1862,9 +1862,10 @@ static enum ggml_status ggml_metal_graph_compute(
                         // ne21 = n_rows
                         const int dst_rows = ne20*ne21;
                         const int dst_rows_min = n_as;
+                        const int dst_rows_max = (ctx->device.maxThreadgroupMemoryLength - 32 - 8192)/4;
 
                         // max size of the rowids array in the kernel shared buffer
-                        GGML_ASSERT(dst_rows <= 2048);
+                        GGML_ASSERT(dst_rows <= dst_rows_max);
 
                         // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                         // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
author	Georgi Gerganov <ggerganov@gmail.com>	2024-06-14 17:14:09 +0300
committer	GitHub <noreply@github.com>	2024-06-14 17:14:09 +0300
commit	66ef1ceedf983773c8ceb4d925285d41d4e50e2a (patch)
tree	5730d00a2043129897aecdc1a9e1649a625f1349
parent	e65bbf606c61f49dc06c7ac060cd5ba7ae446025 (diff)