CUDA: stream-k decomposition for MMQ (#8018)

* CUDA: stream-k decomposition for MMQ
* fix undefined memory reads for small matrices
author: Johannes Gäßler <johannesg@5d6.de> 2024-06-20 14:39:21 +0200
committer: GitHub <noreply@github.com> 2024-06-20 14:39:21 +0200
commit: d50f8897a797a5a03f31228d1b5a7b8130ee1bc2 (patch)
tree: 9ee91b29378e35ff8f7b5071308c12d429f316f0 /ggml-cuda.cu
parent: 2075a66a96cc1b04eabec7cf4b3051193d6f719e (diff)
1 file changed, 1 insertion, 1 deletion
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index b8298ab2..f914efd7 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -635,7 +635,7 @@ static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> &
         }
 
         const int cc = ggml_cuda_info().devices[id].cc;
-        row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
+        row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc));
     }
     return row_rounding;
 }
author	Johannes Gäßler <johannesg@5d6.de>	2024-06-20 14:39:21 +0200
committer	GitHub <noreply@github.com>	2024-06-20 14:39:21 +0200
commit	d50f8897a797a5a03f31228d1b5a7b8130ee1bc2 (patch)
tree	9ee91b29378e35ff8f7b5071308c12d429f316f0 /ggml-cuda.cu
parent	2075a66a96cc1b04eabec7cf4b3051193d6f719e (diff)