summary | refs | log | tree | commit | diff
path: root/ggml/src/ggml-cuda
diff options
context:
space:
mode:
author: Kawrakow <iwankawrakow@gmail.com> 2025-05-14 07:29:28 +0300
committer: GitHub <noreply@github.com> 2025-05-14 07:29:28 +0300
commit: b90d6ede2eca3fc48d716868269be5e0e15d00f9 (patch)
tree: 204dd7312f9c56d00579b0e3e214b8e79c0fadfd /ggml/src/ggml-cuda
parent: 13740622e973b78ae662bbb785c2fc5926a324eb (diff)
Fix SER (CUDA) (#416)
* Fixing SER bugs
* Cleanup
* This seems to fix it.
* This seems to work

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src/ggml-cuda')
-rw-r--r-- ggml/src/ggml-cuda/mmvq.cu | 29
1 files changed, 15 insertions, 14 deletions
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index bc26cce4..c6b6ef72 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -150,20 +150,21 @@ static __global__ void mul_mat_vec_q(
char * cdst = (char *)dst + i2*nb2;
int i02 = ids_data ? *(const int *)(ids_data + i2*ids_nb0) : i2;
if (i02 < 0) {
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
- constexpr int rows_per_cuda_block = 1;
-#else
- constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
- const int row0 = rows_per_cuda_block*blockIdx.x;
- if (threadIdx.y == 0) {
- dst = (float *)cdst;
- for (int j = 0; j < ncols_y; ++j) {
- if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
- dst[j*nrows_dst + row0 + threadIdx.x] = 0;
- }
- }
- }
+ // We clear the buffer via cudaMemset instead
+//#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
+// constexpr int rows_per_cuda_block = 1;
+//#else
+// constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
+//#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
+// const int row0 = rows_per_cuda_block*blockIdx.x;
+// if (threadIdx.y == 0) {
+// dst = (float *)cdst;
+// for (int j = 0; j < ncols_y; ++j) {
+// if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
+// dst[j*nrows_dst + row0 + threadIdx.x] = 0;
+// }
+// }
+// }
return;
}
const char * cx = (const char *)vx + i02*nb02;