author    | Kawrakow <iwankawrakow@gmail.com> | 2025-02-27 16:40:49 +0200
committer | GitHub <noreply@github.com> | 2025-02-27 16:40:49 +0200
commit    | b762db7c9264199c2d0f66e7d63e3b4884f3fc0c (patch)
tree      | 01cc16988a4d21b4c1df367df23f4fd53e6b58a0 /ggml/src
parent    | 51029edfdf286df76f9268fc87b9514291b2fe42 (diff)
Option to use MLA without a transposed cache (#235)
The `-mla` command line option changes from a bool to an int (see the sketch below):
mla = 0: use standard attention
mla = 1: use MLA with transposed cache
mla > 1: use MLA without transposed cache
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
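As a rough illustration of how an int-valued `-mla` setting could map onto the three modes above, here is a minimal C++ sketch; `AttnPath` and `select_attn_path` are made-up names for this example, not the actual ik_llama.cpp implementation:

```cpp
// Illustrative only: the enum and helper are hypothetical, not the real
// ik_llama.cpp handling of the -mla option.
#include <cstdio>

enum class AttnPath { Standard, MLATransposedCache, MLANoTransposedCache };

static AttnPath select_attn_path(int mla) {
    if (mla == 0) return AttnPath::Standard;            // -mla 0: standard attention
    if (mla == 1) return AttnPath::MLATransposedCache;  // -mla 1: MLA with transposed cache
    return AttnPath::MLANoTransposedCache;              // -mla 2, 3, ...: MLA without transposed cache
}

int main() {
    const int modes[] = {0, 1, 2, 3};
    for (int mla : modes) {
        std::printf("mla=%d -> path %d\n", mla, static_cast<int>(select_attn_path(mla)));
    }
    return 0;
}
```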
Diffstat (limited to 'ggml/src')
-rw-r--r-- | ggml/src/ggml-cuda/mmvq.cu | 26 |
1 file changed, 4 insertions, 22 deletions
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index 2a520d68..e17e77a3 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -158,11 +158,6 @@ static void mul_mat_vec_q_cuda(
     int64_t nwarps = 1;
     int64_t rows_per_cuda_block = 1;

-    //if (ne2 > 1) {
-    //    printf("%s: ncols_x = %d, nrows_x = %d, nrows_y = %d, ncols_y = %d nrows_dst = %d, ne2 = %d nb02 = %zu, nb12 = %zu, nb2 = %zu\n", __func__,
-    //            ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, ne2, nb02, nb12, nb2);
-    //}
-
     if (ggml_cuda_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
         switch(ncols_y) {
             case 1:
@@ -382,9 +377,8 @@ static void mul_mat_vec_iq3_s_q8_1_cuda(
     mul_mat_vec_q_cuda<GGML_TYPE_IQ3_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, ne2, nb02, nb12, nb2, stream);
 }

-namespace {
-void ggml_cuda_op_mul_mat_vec_q_impl(ggml_backend_cuda_context & ctx, ggml_type type,
-    const int64_t ne00, const int64_t ne10, const int64_t ne0, const int64_t ne2,
+static void ggml_cuda_op_mul_mat_vec_q_impl(ggml_backend_cuda_context & ctx, ggml_type type,
+    const int64_t ne00, const int64_t ne0, const int64_t ne2,
     const int64_t nb02, const int64_t nb12, const int64_t nb2,
     const char * src0_dd_i, const char * src1_ddq_i, float * dst_dd_i,
     const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
@@ -496,7 +490,6 @@ void ggml_cuda_op_mul_mat_vec_q_impl(ggml_backend_cuda_context & ctx, ggml_type
         }
     }
 }
-}

 void ggml_cuda_op_mul_mat_vec_q_3D(
     ggml_backend_cuda_context & ctx,
@@ -505,8 +498,6 @@ void ggml_cuda_op_mul_mat_vec_q_3D(
     const int64_t src1_padded_row_size, cudaStream_t stream) {

     const int64_t ne00 = src0->ne[0];
-    const int64_t row_diff = row_high - row_low;
-
     const int64_t ne10 = src1->ne[0];
     GGML_ASSERT(ne10 % QK8_1 == 0);
     GGML_ASSERT(src0->ne[3] == 1 && src1->ne[3] == 1 && dst->ne[3] == 1);
@@ -516,13 +507,10 @@ void ggml_cuda_op_mul_mat_vec_q_3D(

     int id = ggml_cuda_get_device();

-    // the main device has a larger memory buffer to hold the results from all GPUs
-    // nrows_dst == nrows of the matrix that the kernel writes into
-    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
     const int64_t src1_row_size = ggml_row_size(GGML_TYPE_Q8_1, src1_padded_row_size);

     ggml_cuda_op_mul_mat_vec_q_impl(ctx, src0->type,
-        ne00, ne10, ne0, dst->ne[2],
+        ne00, ne0, dst->ne[2],
         src0->nb[2], src1_row_size, dst->nb[2],
         src0_dd_i, src1_ddq_i, dst_dd_i,
         row_low, row_high, src1_ncols,
@@ -538,8 +526,6 @@ void ggml_cuda_op_mul_mat_vec_q(
     const int64_t src1_padded_row_size, cudaStream_t stream) {

     const int64_t ne00 = src0->ne[0];
-    const int64_t row_diff = row_high - row_low;
-
     const int64_t ne10 = src1->ne[0];
     GGML_ASSERT(ne10 % QK8_1 == 0);

@@ -547,12 +533,8 @@ void ggml_cuda_op_mul_mat_vec_q(

     int id = ggml_cuda_get_device();

-    // the main device has a larger memory buffer to hold the results from all GPUs
-    // nrows_dst == nrows of the matrix that the kernel writes into
-    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
-
     ggml_cuda_op_mul_mat_vec_q_impl(ctx, src0->type,
-        ne00, ne10, ne0, 1, 0, 0, 0,
+        ne00, ne0, 1, 0, 0, 0,
         src0_dd_i, src1_ddq_i, dst_dd_i, row_low, row_high, src1_ncols,
         src1_padded_row_size, stream);
 }
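The changes in this file are mostly mechanical: the dead debug printf, the `ne10` parameter, and the caller-side `row_diff`/`nrows_dst` computations are dropped, and `ggml_cuda_op_mul_mat_vec_q_impl` moves from an anonymous namespace to `static`, which gives the same internal linkage. A minimal stand-alone sketch of that equivalence (illustrative names, not code from this commit):

```cpp
// Illustrative only: shows that an anonymous namespace and `static` both
// restrict a function to the current translation unit.
#include <cstdio>

namespace {
int helper_in_anon_namespace(int x) { return x + 1; }  // internal linkage
}

static int helper_static(int x) { return x + 1; }       // internal linkage as well

int main() {
    std::printf("%d %d\n", helper_in_anon_namespace(41), helper_static(41));
    return 0;
}
```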