diff options
| author | Kawrakow <iwankawrakow@gmail.com> | 2025-05-14 07:29:28 +0300 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-05-14 07:29:28 +0300 |
| commit | b90d6ede2eca3fc48d716868269be5e0e15d00f9 (patch) | |
| tree | 204dd7312f9c56d00579b0e3e214b8e79c0fadfd /ggml/src/ggml-cuda | |
| parent | 13740622e973b78ae662bbb785c2fc5926a324eb (diff) | |
Fix SER (CUDA) (#416)
* Fixing SER bugs
* Cleanup
* This seems to fix it.
* This seems to work
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src/ggml-cuda')
| -rw-r--r-- | ggml/src/ggml-cuda/mmvq.cu | 29 |
1 files changed, 15 insertions, 14 deletions
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index bc26cce4..c6b6ef72 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -150,20 +150,21 @@ static __global__ void mul_mat_vec_q(
     char * cdst = (char *)dst + i2*nb2;
     int i02 = ids_data ? *(const int *)(ids_data + i2*ids_nb0) : i2;
     if (i02 < 0) {
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
-        constexpr int rows_per_cuda_block = 1;
-#else
-        constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
-        const int row0 = rows_per_cuda_block*blockIdx.x;
-        if (threadIdx.y == 0) {
-            dst = (float *)cdst;
-            for (int j = 0; j < ncols_y; ++j) {
-                if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
-                    dst[j*nrows_dst + row0 + threadIdx.x] = 0;
-                }
-            }
-        }
+        // We clear the buffer via cudaMemset instead
+//#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
+//        constexpr int rows_per_cuda_block = 1;
+//#else
+//        constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
+//#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
+//        const int row0 = rows_per_cuda_block*blockIdx.x;
+//        if (threadIdx.y == 0) {
+//            dst = (float *)cdst;
+//            for (int j = 0; j < ncols_y; ++j) {
+//                if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
+//                    dst[j*nrows_dst + row0 + threadIdx.x] = 0;
+//                }
+//            }
+//        }
         return;
     }
     const char * cx = (const char *)vx + i02*nb02;
