From 5dc9dd7152dedc6046b646855585bd070c91e8c8 Mon Sep 17 00:00:00 2001 From: Carolinabanana <140120812+Carolinabanana@users.noreply.github.com> Date: Tue, 9 Apr 2024 09:16:13 +0100 Subject: llama : add Command R Plus support (#6491) * Add Command R Plus GGUF * Add Command R Plus GGUF * Loading works up to LayerNorm2D * Export new tensors in 1D so they are not quantized. * Fix embedding layer based on Noeda's example * Whitespace * Add line * Fix unexpected tokens on MPS. Re-add F16 fix. ((Noeda) * dranger003: Fix block index overflow in CUDA dequantizing. * Reverted blocked multiplication code as it still has issues and could affect other Llama arches * export norms as f32 * fix overflow issues during quant and other cleanup * Type convention Co-authored-by: Georgi Gerganov * dranger003: Fix more int overflow during quant. --------- Co-authored-by: S Co-authored-by: S Co-authored-by: slaren Co-authored-by: Georgi Gerganov --- ggml-cuda.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'ggml-cuda.cu') diff --git a/ggml-cuda.cu b/ggml-cuda.cu index ce28cb55..bff8ad9d 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1225,7 +1225,7 @@ static void ggml_cuda_op_mul_mat_cublas( // the main device has a larger memory buffer to hold the results from all GPUs // ldc == nrows of the matrix that cuBLAS writes into - int ldc = id == ctx.device ? ne0 : row_diff; + int64_t ldc = id == ctx.device ? ne0 : row_diff; const int compute_capability = ggml_cuda_info().devices[id].cc; @@ -1377,8 +1377,8 @@ static void ggml_cuda_op_mul_mat( const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + const int64_t nb2 = dst->nb[2]; + const int64_t nb3 = dst->nb[3]; GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer)); GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer)); -- cgit v1.2.3