author     Kawrakow <iwankawrakow@gmail.com>          2025-05-12 07:47:46 +0300
committer  GitHub <noreply@github.com>                2025-05-12 07:47:46 +0300
commit     8669c3db2b98f05775292778dd05f424ee0cd250 (patch)
tree       ed5c6a41e81ecd6b6620b748bfd765997663eb4c /ggml/src/ggml-backend.c
parent     504fb890d90ec27e5f4822b7bd772fa94d4d6aac (diff)
GPU offload policy (#405)
* Adding GPU offload policy
* Minor
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src/ggml-backend.c')
-rw-r--r--   ggml/src/ggml-backend.c   30
1 file changed, 29 insertions, 1 deletion
diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index fd538f50..410ab9e5 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -1104,9 +1104,34 @@ struct ggml_backend_sched {
     char * context_buffer;
     size_t context_buffer_size;
 
+    uint32_t op_offload[(GGML_OP_COUNT + 31)/32];
+
     bool debug;
 };
 
+void ggml_backend_sched_set_op_offload(ggml_backend_sched_t sched, enum ggml_op op, bool on_or_off) {
+    int int_op = (int)op;
+    if (!sched) return;
+    if (int_op < 0 || int_op >= (int)GGML_OP_COUNT) {
+        uint32_t mask = on_or_off ? 0xffffffff : 0;
+        for (int i = 0; i < (GGML_OP_COUNT + 31)/32; ++i) sched->op_offload[i] = mask;
+        return;
+    }
+    int i = int_op >> 5;
+    int j = int_op & 31;
+    if (on_or_off) {
+        sched->op_offload[i] |= (1u << j);
+    } else {
+        sched->op_offload[i] &= (~(1u << j));
+    }
+}
+
+static inline bool ggml_backend_sched_offload_enabled(ggml_backend_sched_t sched, enum ggml_op op) {
+    int int_op = (int)op;
+    if (!sched || op < 0 || op >= GGML_OP_COUNT) return false;
+    return sched->op_offload[int_op >> 5] & (1u << (int_op & 31));
+}
+
 #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
 #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
 #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
@@ -1181,6 +1206,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     }
 
     // operations with weights are preferably run on the same backend as the weights
+    bool offload_enabled = ggml_backend_sched_offload_enabled(sched, tensor->op);
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
         if (src == NULL) {
@@ -1189,7 +1215,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
             int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
             // check if a backend with higher prio wants to offload the op
-            if (src_backend_id == sched->n_backends - 1) {
+            if (offload_enabled && src_backend_id == sched->n_backends - 1) {
                 for (int b = 0; b < src_backend_id; b++) {
                     if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
                         SET_CAUSE(tensor, "1.off");
@@ -1888,6 +1914,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
 
     struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
 
+    for (int i = 0; i < (GGML_OP_COUNT + 31)/32; ++i) sched->op_offload[i] = 0xffffffff;
+
     sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
     sched->n_backends = n_backends;
    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
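For context, a minimal sketch of how a caller might use the new policy hook. It assumes a scheduler already created with ggml_backend_sched_new(), uses the existing GGML_OP_MUL_MAT and GGML_OP_COUNT values from enum ggml_op, and spells out the prototype by hand because any matching header declaration is outside this diff; it is an illustration of the semantics above, not part of the commit.

```c
#include <stdbool.h>

#include "ggml.h"          // enum ggml_op, GGML_OP_MUL_MAT, GGML_OP_COUNT
#include "ggml-backend.h"  // ggml_backend_sched_t

// Prototype of the function added in this commit; the corresponding header
// change is not part of this diff, so the declaration is repeated here by hand.
void ggml_backend_sched_set_op_offload(ggml_backend_sched_t sched, enum ggml_op op, bool on_or_off);

// Hypothetical helper: allow GPU offload only for matrix multiplications.
static void restrict_offload_to_mul_mat(ggml_backend_sched_t sched) {
    // Any op value outside [0, GGML_OP_COUNT) takes the "all ops" branch and
    // clears (or sets) every word of the op_offload bitmask at once.
    ggml_backend_sched_set_op_offload(sched, GGML_OP_COUNT, false);

    // Re-enable offload for a single op: this sets bit (op & 31) in word
    // (op >> 5), the same indexing used by ggml_backend_sched_offload_enabled().
    ggml_backend_sched_set_op_offload(sched, GGML_OP_MUL_MAT, true);
}
```

Inside the scheduler, the mask is consulted via ggml_backend_sched_offload_enabled() before the existing "higher-priority backend wants to offload" check, so an op whose bit is cleared simply stays on the backend that owns its weights. Since ggml_backend_sched_new() initializes every word of the mask to 0xffffffff, the default behaviour with no calls to the setter is unchanged.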