author    Kawrakow <iwankawrakow@gmail.com>    2025-05-12 07:47:46 +0300
committer GitHub <noreply@github.com>          2025-05-12 07:47:46 +0300
commit    8669c3db2b98f05775292778dd05f424ee0cd250 (patch)
tree      ed5c6a41e81ecd6b6620b748bfd765997663eb4c /ggml/src
parent    504fb890d90ec27e5f4822b7bd772fa94d4d6aac (diff)
GPU offload policy (#405)
* Adding GPU offload policy

* Minor

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src')
-rw-r--r--  ggml/src/ggml-backend.c  30
-rw-r--r--  ggml/src/ggml-cuda.cu     4
2 files changed, 32 insertions, 2 deletions
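
The scheduler API added below lets the calling application control which ops may be offloaded to a higher-priority (typically GPU) backend when their weights live on the lowest-priority backend. A minimal usage sketch, assuming the matching declaration is exported via ggml-backend.h (the header change falls outside this ggml/src-limited diff); configure_offload_policy is a hypothetical helper, not part of the commit:

#include "ggml.h"
#include "ggml-backend.h"

// sched is a ggml_backend_sched_t created with ggml_backend_sched_new().
// By default every op stays eligible for offload (the policy mask is
// initialized to all ones), so a caller only needs to name the exceptions.
static void configure_offload_policy(ggml_backend_sched_t sched) {
    // keep indirect (MoE) matrix multiplications on the backend that holds the weights
    ggml_backend_sched_set_op_offload(sched, GGML_OP_MUL_MAT_ID, false);

    // an out-of-range op value applies the setting to every op at once:
    // ggml_backend_sched_set_op_offload(sched, GGML_OP_COUNT, false);
}
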
diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index fd538f50..410ab9e5 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -1104,9 +1104,34 @@ struct ggml_backend_sched {
char * context_buffer;
size_t context_buffer_size;
+ uint32_t op_offload[(GGML_OP_COUNT + 31)/32];
+
bool debug;
};
+void ggml_backend_sched_set_op_offload(ggml_backend_sched_t sched, enum ggml_op op, bool on_or_off) {
+    int int_op = (int)op;
+    if (!sched) return;
+    if (int_op < 0 || int_op >= (int)GGML_OP_COUNT) {
+        uint32_t mask = on_or_off ? 0xffffffff : 0;
+        for (int i = 0; i < (GGML_OP_COUNT + 31)/32; ++i) sched->op_offload[i] = mask;
+        return;
+    }
+    int i = int_op >> 5;
+    int j = int_op & 31;
+    if (on_or_off) {
+        sched->op_offload[i] |= (1u << j);
+    } else {
+        sched->op_offload[i] &= (~(1u << j));
+    }
+}
+
+static inline bool ggml_backend_sched_offload_enabled(ggml_backend_sched_t sched, enum ggml_op op) {
+    int int_op = (int)op;
+    if (!sched || op < 0 || op >= GGML_OP_COUNT) return false;
+    return sched->op_offload[int_op >> 5] & (1u << (int_op & 31));
+}
+
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
#define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
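
For reference, the policy added above is stored as a packed bitmask, one bit per ggml op, rounded up to whole uint32_t words: op n maps to word n >> 5, bit n & 31. The standalone sketch below is illustrative only (not part of the commit) and uses an arbitrary op index:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t words[4] = {0};   // room for up to 128 ops, mirroring op_offload[]
    int op = 37;               // arbitrary op index for illustration
    words[op >> 5] |= 1u << (op & 31);     // enable: word 1, bit 5
    assert(words[1] == (1u << 5));
    words[op >> 5] &= ~(1u << (op & 31));  // disable again
    assert(words[1] == 0);
    printf("op %d -> word %d, bit %d\n", op, op >> 5, op & 31);
    return 0;
}
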
@@ -1181,6 +1206,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
}
// operations with weights are preferably run on the same backend as the weights
+ bool offload_enabled = ggml_backend_sched_offload_enabled(sched, tensor->op);
for (int i = 0; i < GGML_MAX_SRC; i++) {
const struct ggml_tensor * src = tensor->src[i];
if (src == NULL) {
@@ -1189,7 +1215,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
// check if a backend with higher prio wants to offload the op
- if (src_backend_id == sched->n_backends - 1) {
+ if (offload_enabled && src_backend_id == sched->n_backends - 1) {
for (int b = 0; b < src_backend_id; b++) {
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
SET_CAUSE(tensor, "1.off");
@@ -1888,6 +1914,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
+ for (int i = 0; i < (GGML_OP_COUNT + 31)/32; ++i) sched->op_offload[i] = 0xffffffff;
+
sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
sched->n_backends = n_backends;
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 87f80d0c..ef73ee7d 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -3391,6 +3391,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
struct ggml_tensor * a = op->src[0];
struct ggml_tensor * b = op->op == GGML_OP_MOE_FUSED_UP_GATE ? op->src[2] : op->src[1];
if (op->op == GGML_OP_MOE_FUSED_UP_GATE && a->type != op->src[1]->type) {
+ printf("%s: returning false for GGML_OP_MOE_FUSED_UP_GATE because src0->type != src1->type\n", __func__);
return false;
}
//==================================================================
@@ -3399,6 +3400,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
//}
//==================================================================
if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16 && !ggml_is_quantized(a->type)) {
+ printf("%s: returning false for op %d because (case 1)\n", __func__, (int)op->op);
return false;
}
if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) {
@@ -3621,7 +3623,7 @@ GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const
const int min_batch_size = 32;
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
- (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
+ (op->ne[2] >= min_batch_size && (op->op == GGML_OP_MUL_MAT_ID || op->op == GGML_OP_MOE_FUSED_UP_GATE));
GGML_UNUSED(backend);
}
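
The final hunk widens the CUDA offload heuristic so that the fused MoE up/gate op is judged on the same dimension as GGML_OP_MUL_MAT_ID. Below is an equivalent standalone restatement of the returned expression, for readability only; reading ne[2] as the batch dimension of these MoE-style ops is an interpretation, not something the diff states, and cuda_offload_worthwhile is a hypothetical name:

#include <stdbool.h>
#include "ggml.h"

// Equivalent to the return expression in ggml_backend_cuda_offload_op above.
static bool cuda_offload_worthwhile(const struct ggml_tensor * op) {
    const int min_batch_size = 32;
    // regular ops: offload once ne[1] reaches the minimum batch size,
    // except GET_ROWS, which is never offloaded on this basis
    if (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) {
        return true;
    }
    // MoE-style matmuls are judged on ne[2]; the diff adds
    // GGML_OP_MOE_FUSED_UP_GATE alongside GGML_OP_MUL_MAT_ID here
    return op->ne[2] >= min_batch_size &&
           (op->op == GGML_OP_MUL_MAT_ID || op->op == GGML_OP_MOE_FUSED_UP_GATE);
}
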