diff options
author | Shouzheng Liu <lshzh.hi@gmail.com> | 2023-08-22 02:18:40 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-08-22 09:18:40 +0300 |
commit | 14b1d7e6f720dee41ce5a826376df738096d9033 (patch) | |
tree | 79499b8acade548547c21298889485d0c1764634 | |
parent | 226255b44ef2c2794bfac48d101d35a9c2dbb965 (diff) |
metal : add missing barriers for mul-mat (#2699)
-rw-r--r-- | ggml-metal.metal | 5 |
1 files changed, 3 insertions, 2 deletions
diff --git a/ggml-metal.metal b/ggml-metal.metal index 88d48f6c..ce3541f4 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0, //load data and store to threadgroup memory half4x4 temp_a; dequantize_func(x, il, temp_a); + threadgroup_barrier(mem_flags::mem_threadgroup); #pragma unroll(16) for (int i = 0; i < 16; i++) { *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \ @@ -1895,14 +1896,14 @@ kernel void kernel_mul_mm(device const uchar * src0, } } else { // block is smaller than 64x32, we should avoid writing data outside of the matrix + threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup float *temp_str = ((threadgroup float *)shared_memory) \ + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; for (int i = 0; i < 8; i++) { - threadgroup_barrier(mem_flags::mem_device); simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M); } - threadgroup_barrier(mem_flags::mem_device); + threadgroup_barrier(mem_flags::mem_threadgroup); device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0; if (sgitg==0) { for (int i = 0; i < n_rows; i++) { |