From dcb2ed48268e421baf25adc00d602dad0f415564 Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Sun, 4 Jun 2023 08:12:05 +0200
Subject: OpenCL: Fix duplication of layers in VRAM and RAM, add GPU mul kernel (#1653)

* Use events instead of clFinish, where possible
* OpenCL: Don't load gpu layers into RAM, add mul_f32 kernel
* Reduce queueing overhead for contiguous tensors by using single mul kernel call
* Adapt to #1612 cl_mem malloc changes
* Reduce code duplication between cuda and opencl branches
* Improve implementation
---
 ggml.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'ggml.c')

diff --git a/ggml.c b/ggml.c
index 4cd0d721..91552c94 100644
--- a/ggml.c
+++ b/ggml.c
@@ -8134,6 +8134,13 @@ static void ggml_compute_forward_mul_f32(
         }
         return;
     }
+#elif defined(GGML_USE_CLBLAST)
+    if (src1->backend == GGML_BACKEND_CL) {
+        if (ith == 0) {
+            ggml_cl_mul(src0, src1, dst);
+        }
+        return;
+    }
 #endif
 
     const int64_t nr = ggml_nrows(src0);
-- 
cgit v1.2.3