diff options
author | Kawrakow <iwankawrakow@gmail.com> | 2025-03-10 16:19:09 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-03-10 16:19:09 +0200 |
commit | a48e16324770bb829406d06e11be1df0c8a3b517 (patch) | |
tree | 1f0ef5e1fd55c35acac40cca85cadc8606dd0673 /ggml/src | |
parent | 699c9cb7f63dd8431bce91b86e10efb41255f6c1 (diff) |
DeepSeek imatrix stuff (#250)
* This gives us ~20% TG speedup for DeepSeek on CUDA
* Slightly better
* Also do it for plain (not fused) mul_mat_id
* Guard against numerical precision issues for MLA on CUDA
* imatrix: wv_b <-> wkv_b
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src')
-rw-r--r-- | ggml/src/ggml-cuda/cpy.cu | 2 |
-rw-r--r-- | ggml/src/ggml.c | 2 |
2 files changed, 2 insertions, 2 deletions
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 0b269a86..fabe8843 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -556,7 +556,7 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q6_0) {
         return (void*) cpy_f32_q<cpy_blck_f32_q6_0, QK6_0>;
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
+        return (void*) cpy_f32_f16<cpy_1_f16_f16>;
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
         return (void*) cpy_f32_f16<cpy_1_f16_f32>;
     } else {
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 4089e9b7..88820438 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -10468,7 +10468,7 @@ static void ggml_compute_forward_dup_bytes(
     if (ggml_is_contiguous(dst)) {
         size_t id = 0;
         char * dst_ptr = (char *) dst->data;
-        const size_t rs = ne00 * type_size;
+        const size_t rs = ggml_row_size(src0->type, ne00); //ne00 * type_size;
 
         if (nb00 == type_size) {
             // src0 is contigous on first dimension, copy by rows