summary refs log tree commit diff
path: root/ggml-cuda.cu
diff options
context:
space:
mode:
authorslaren <slarengh@gmail.com>2024-01-26 18:59:43 +0100
committerGitHub <noreply@github.com>2024-01-26 18:59:43 +0100
commit62fead3ea0a30c8d424f4a8373fa14165c7c707f (patch)
treef3ef9ffe08008df108a099fc5828646c052e2a67 /ggml-cuda.cu
parent15b4538ff29b280a395a1406d711497d8eaa2564 (diff)
cuda : fix tensor size calculation for non-split buffer (#5145)
Diffstat (limited to 'ggml-cuda.cu')
-rw-r--r--ggml-cuda.cu19
1 files changed, 5 insertions, 14 deletions
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 05e5d18a..0d599e20 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -9790,8 +9790,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
// TODO: mmq/mmv support
#endif
- const int64_t nb11 = src1->nb[1];
- const int64_t nb1 = dst->nb[1];
+ const size_t nb11 = src1->nb[1];
+ const size_t nb1 = dst->nb[1];
const struct ggml_tensor * ids = src0;
const int32_t id = ((int32_t *) dst->op_params)[0];
@@ -10304,15 +10304,11 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
if (ggml_is_quantized(tensor->type)) {
// initialize padding to 0 to avoid possible NaN values
- int64_t row_low = 0;
- int64_t row_high = ggml_nrows(tensor);
- int64_t nrows_split = row_high - row_low;
-
- size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+ size_t original_size = ggml_nbytes(tensor);
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
if (padded_size > original_size && tensor->view_src == nullptr) {
- CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
+ CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
}
}
}
@@ -10415,12 +10411,7 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend
}
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
- int64_t row_low = 0;
- int64_t row_high = ggml_nrows(tensor);
- int64_t nrows_split = row_high - row_low;
-
- size_t size = ggml_nbytes_split(tensor, nrows_split);
-
+ size_t size = ggml_nbytes(tensor);
int64_t ne0 = tensor->ne[0];
if (ggml_is_quantized(tensor->type)) {