From 2bf8d0f7c4cc1235755ad06961ca761e458c5e55 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Mon, 18 Mar 2024 11:03:04 +0100
Subject: backend : offload large batches to GPU (#6083)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* backend : offload large batches to GPU

* fix hip

* code cleanup

* fix CUDA split buffers

* Update ggml-backend-impl.h

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix memset without set_device

* imatrix : remove sched affix from weight names

* sched : add a new split if the current one has too many inputs
reduce max inputs per split
more cleanup

* update backends

ggml-ci

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
---
 ggml-backend-impl.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'ggml-backend-impl.h')

diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h
index e475e20e..f121e1de 100644
--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@@ -103,6 +103,11 @@ extern "C" {
         // check if the backend supports an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
 
+        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
+        // these should be expensive operations with large batch sizes that may benefit from running on this backend
+        // even if the weights have to be copied from the CPU temporarily
+        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
         // (optional) event synchronization
         ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
         void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
-- 
cgit v1.2.3