Bug fixes from mainline (#439)

* Add __syncthreads() to the new FA kernel
* Clearing padding

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
author: Kawrakow <iwankawrakow@gmail.com> 2025-05-20 17:03:14 +0300
committer: GitHub <noreply@github.com> 2025-05-20 17:03:14 +0300
commit: a2b5057a0c9a2758830b6f841bb22150d2511bb1 (patch)
tree: 9b8fbd96c20d55386ec7e4e7681a03f2111d3fc1 /ggml/src/ggml-cuda.cu
parent: 2ec2229f2e9847d4e96bd7f163201810c8f8299a (diff)
1 files changed, 8 insertions, 2 deletions
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index cf17cbc1..9c8c91f4 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2101,13 +2101,19 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
 
+    // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
+    // But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
+    // Therefore, in such cases use cuBLAS.
+    const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
+        && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
+
     bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;
-    bool          use_mul_mat_vec_q =  ggml_is_quantized(src0->type)
+    bool          use_mul_mat_vec_q =  ggml_is_quantized(src0->type) && !bad_padding_clear
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
-    bool              use_mul_mat_q =  ggml_is_quantized(src0->type)
+    bool              use_mul_mat_q =  ggml_is_quantized(src0->type) && !bad_padding_clear
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
 
     // if mmvq is available it's a better choice than dmmv:
author	Kawrakow <iwankawrakow@gmail.com>	2025-05-20 17:03:14 +0300
committer	GitHub <noreply@github.com>	2025-05-20 17:03:14 +0300
commit	a2b5057a0c9a2758830b6f841bb22150d2511bb1 (patch)
tree	9b8fbd96c20d55386ec7e4e7681a03f2111d3fc1 /ggml/src/ggml-cuda.cu
parent	2ec2229f2e9847d4e96bd7f163201810c8f8299a (diff)