diff options
author | Kawrakow <iwankawrakow@gmail.com> | 2025-05-20 17:03:14 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-05-20 17:03:14 +0300 |
commit | a2b5057a0c9a2758830b6f841bb22150d2511bb1 (patch) | |
tree | 9b8fbd96c20d55386ec7e4e7681a03f2111d3fc1 /ggml/src/ggml-cuda.cu | |
parent | 2ec2229f2e9847d4e96bd7f163201810c8f8299a (diff) |
Bug fixes from mainline (#439)
* Add __syncthreads() to the new FA kernel
* Clearing padding
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src/ggml-cuda.cu')
-rw-r--r-- | ggml/src/ggml-cuda.cu | 10 |
1 files changed, 8 insertions, 2 deletions
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index cf17cbc1..9c8c91f4 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -2101,13 +2101,19 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer); + // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q. + // But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data. + // Therefore, in such cases use cuBLAS. + const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE + && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src; + bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1; - bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) + bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE; - bool use_mul_mat_q = ggml_is_quantized(src0->type) + bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32; // if mmvq is available it's a better choice than dmmv: |