From dcb2ed48268e421baf25adc00d602dad0f415564 Mon Sep 17 00:00:00 2001
From: 0cc4m <picard12@live.de>
Date: Sun, 4 Jun 2023 08:12:05 +0200
Subject: OpenCL: Fix duplication of layers in VRAM and RAM, add GPU mul kernel
 (#1653)

* Use events instead of clFinish, where possible

* OpenCL: Don't load GPU layers into RAM, add mul_f32 kernel

* Reduce queueing overhead for contiguous tensors by using a single mul kernel call

* Adapt to #1612 cl_mem malloc changes

* Reduce code duplication between CUDA and OpenCL branches

* Improve implementation
---
 ggml.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'ggml.c')

diff --git a/ggml.c b/ggml.c
index 4cd0d721..91552c94 100644
--- a/ggml.c
+++ b/ggml.c
@@ -8134,6 +8134,13 @@ static void ggml_compute_forward_mul_f32(
         }
         return;
     }
+#elif defined(GGML_USE_CLBLAST)
+    if (src1->backend == GGML_BACKEND_CL) {
+        if (ith == 0) {
+            ggml_cl_mul(src0, src1, dst);
+        }
+        return;
+    }
 #endif
 
     const int64_t nr = ggml_nrows(src0);
-- 
cgit v1.2.3