|  |  |  |
|---|---|---|
| author | Johannes Gäßler <johannesg@5d6.de> | 2024-06-05 16:53:00 +0200 |
| committer | GitHub <noreply@github.com> | 2024-06-05 16:53:00 +0200 |
| commit | 7d1a378b8fb266782d9248538a661405aad80768 (patch) | |
| tree | 7ce459a4c5a85e75f75825772124aedc3bb54b7f /ggml-cuda.cu | |
| parent | 2b3389677a833cee0880226533a1768b1a9508d2 (diff) | |
CUDA: refactor mmq, dmmv, mmvq (#7716)
* CUDA: refactor mmq, dmmv, mmvq
* fix out-of-bounds write
* struct for qk, qr, qi
* fix cmake build
* mmq_type_traits
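
The "struct for qk, qr, qi" and "mmq_type_traits" bullets point at bundling the per-quantization-type constants (block size qk, values packed per quant byte qr, 32-bit ints of quant data per block qi) into compile-time traits, so templated kernels can look them up by type instead of repeating switch statements. The sketch below only illustrates that traits pattern; the struct and tag names are hypothetical, and the numbers shown are the standard Q4_0/Q8_0 block parameters, not this commit's actual definitions.

```cpp
#include <cstdio>

// Hypothetical tag types standing in for ggml's quantization formats.
struct tag_q4_0 {};
struct tag_q8_0 {};

// Illustrative traits bundle: qk = values per block, qr = values packed per
// quant byte, qi = 32-bit ints of quant data per block.
template <typename T> struct example_type_traits;

template <> struct example_type_traits<tag_q4_0> {
    static constexpr int qk = 32; // 32 values per block
    static constexpr int qr = 2;  // two 4-bit values per byte
    static constexpr int qi = 4;  // 16 bytes of quants = 4 ints
};

template <> struct example_type_traits<tag_q8_0> {
    static constexpr int qk = 32; // 32 values per block
    static constexpr int qr = 1;  // one 8-bit value per byte
    static constexpr int qi = 8;  // 32 bytes of quants = 8 ints
};

// A kernel-like template can read the constants at compile time.
template <typename T> void print_traits(const char * name) {
    using tr = example_type_traits<T>;
    printf("%s: qk=%d qr=%d qi=%d\n", name, tr::qk, tr::qr, tr::qi);
}

int main() {
    print_traits<tag_q4_0>("q4_0");
    print_traits<tag_q8_0>("q8_0");
    return 0;
}
```

Keeping the constants in one specialization per type is what removes the kind of per-type switch duplication that the diff below deletes from get_row_rounding.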
Diffstat (limited to 'ggml-cuda.cu')
|  |  |  |
|---|---|---|
| -rw-r--r-- | ggml-cuda.cu | 84 |

1 file changed, 9 insertions, 75 deletions
```diff
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index c81c6a0d..dad8a9e2 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -633,88 +633,22 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
 
 // cuda split buffer
 
-static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
-    int64_t min_compute_capability = INT_MAX;
-    int64_t max_compute_capability = INT_MIN;
+static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
+    int64_t row_rounding = 0;
     for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
-        if (tensor_split[id] < (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
-            if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
-                min_compute_capability = ggml_cuda_info().devices[id].cc;
-            }
-            if (max_compute_capability < ggml_cuda_info().devices[id].cc) {
-                max_compute_capability = ggml_cuda_info().devices[id].cc;
-            }
+        if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
+            continue;
         }
-    }
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    switch(type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
-        case GGML_TYPE_Q3_K:
-            return min_compute_capability < CC_RDNA2 ? 128 : 64;
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
-        default:
-            GGML_ASSERT(false);
-    }
-#else
-    switch(type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-            return max_compute_capability >= CC_VOLTA ? 128 : 64;
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-            return max_compute_capability >= CC_VOLTA ? 128 : 64;
-        case GGML_TYPE_Q6_K:
-            return 64;
-        default:
-            GGML_ASSERT(false);
+
+        const int cc = ggml_cuda_info().devices[id].cc;
+        row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
     }
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    return row_rounding;
 }
 
 static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
     const int64_t nrows = ggml_nrows(tensor);
-    const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
+    const int64_t rounding = get_row_rounding(tensor_split);
 
     *row_low = id == 0 ? 0 : nrows*tensor_split[id];
     *row_low -= *row_low % rounding;
@@ -1499,7 +1433,7 @@ static void ggml_cuda_op_mul_mat(
     // for multi GPU, get the row boundaries from tensor split
     // and round to mul_mat_q tile sizes
     if (split) {
-        const int64_t rounding = get_row_rounding(src0->type, tensor_split);
+        const int64_t rounding = get_row_rounding(tensor_split);
 
         if (id != 0) {
             dev[id].row_low = ne01*tensor_split[id];
```
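
For intuition about what the rounding value is used for: the surviving context lines in the first hunk show get_row_split snapping each device's starting row down to a multiple of it. The standalone sketch below reproduces just that arithmetic with made-up numbers (the row count, split fractions, and rounding of 128 are all hypothetical); it is not ggml code.

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical inputs: 10000 rows split roughly 40%/60% across two
    // devices, rounded to a tile height of 128 rows.
    const int64_t nrows    = 10000;
    const int64_t rounding = 128;
    const std::array<float, 2> tensor_split = {0.0f, 0.4f};

    for (int id = 0; id < 2; ++id) {
        // Same arithmetic as get_row_split in the diff: take the row at the
        // split fraction, then snap down to a multiple of the rounding.
        int64_t row_low = id == 0 ? 0 : (int64_t)(nrows*tensor_split[id]);
        row_low -= row_low % rounding;
        printf("device %d: row_low = %lld\n", id, (long long)row_low);
    }
    return 0;
}
```

The key change visible in the diff is that the rounding now depends only on the devices' compute capabilities (via the mmq tile sizes) rather than on the tensor type, which is why the type parameter could be dropped from both call sites.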