From a48e16324770bb829406d06e11be1df0c8a3b517 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Mon, 10 Mar 2025 16:19:09 +0200
Subject: DeepSeek imatrix stuff (#250)

* This gives us ~20% TG speedup for DeepSeek on CUDA

* Slightly better

* Also do it for plain (not fused) mul_mat_id

* Guard against numerical precision issues for MLA on CUDA

* imatrix: wv_b <-> wkv_b

---------

Co-authored-by: Iwan Kawrakow
---
 ggml/src/ggml-cuda/cpy.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'ggml/src/ggml-cuda')

diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 0b269a86..fabe8843 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -556,7 +556,7 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q6_0) {
         return (void*) cpy_f32_q;
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        return (void*) cpy_f32_f16;
+        return (void*) cpy_f32_f16;
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
         return (void*) cpy_f32_f16;
     } else {
-- 
cgit v1.2.3