diff options
| author | Georgi Gerganov <ggerganov@gmail.com> | 2024-05-28 11:04:19 +0300 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-05-28 11:04:19 +0300 |
| commit | 0548a4187f2e53b8fc6d9ff0f4c71988f708ff42 (patch) | |
| tree | 35ae0e19ecc36169939620b2702fd853c8e8c116 /ggml-metal.metal | |
| parent | 9335b969e86a222e247adacedf814d8abfff8847 (diff) | |
ggml : generalize GGML_OP_CONCAT (#7563)
* ggml : generalize GGML_OP_CONCAT (WIP)
ggml-ci
* tests : add dim != 2 tests
* metal : generalize concat kernel
* tests : naming
* cuda : generalize concat kernel
ggml-ci
* sycl : add warning and assert
* ggml : fix op params handling
* metal : bugfix kernel
ggml-ci
* ggml : reimplement CPU and Metal
* cuda : add asserts
ggml-ci
* ggml : fix ptrs
ggml-ci
Diffstat (limited to 'ggml-metal.metal')
| -rw-r--r-- | ggml-metal.metal | 29 |
1 files changed, 14 insertions, 15 deletions
```diff
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 174086b5..b16f2b7e 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -3366,31 +3366,30 @@ kernel void kernel_concat(
         constant uint64_t & nb1,
         constant uint64_t & nb2,
         constant uint64_t & nb3,
+        constant int32_t & dim,
         uint3 tgpig[[threadgroup_position_in_grid]],
         uint3 tpitg[[thread_position_in_threadgroup]],
         uint3 ntg[[threads_per_threadgroup]]) {
 
-    const int64_t i03 = tgpig.z;
-    const int64_t i02 = tgpig.y;
-    const int64_t i01 = tgpig.x;
+    const int64_t i3 = tgpig.z;
+    const int64_t i2 = tgpig.y;
+    const int64_t i1 = tgpig.x;
 
-    const int64_t i13 = i03 % ne13;
-    const int64_t i12 = i02 % ne12;
-    const int64_t i11 = i01 % ne11;
+    int64_t o[4] = {0, 0, 0, 0};
+    o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
 
-    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + tpitg.x*nb00;
-    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
-    device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + tpitg.x*nb0;
+    device const float * x;
 
     for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        if (i02 < ne02) {
-            ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0];
-            src0_ptr += ntg.x*nb00;
+        if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+            x = (device const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
         } else {
-            ((device float *)dst_ptr)[0] = ((device float *)src1_ptr)[0];
-            src1_ptr += ntg.x*nb10;
+            x = (device const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
         }
-        dst_ptr += ntg.x*nb0;
+
+        device float * y = (device float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+        *y = *x;
     }
 }
 
```
