From 8669c3db2b98f05775292778dd05f424ee0cd250 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Mon, 12 May 2025 07:47:46 +0300
Subject: GPU offload policy (#405)

* Adding GPU offload policy

* Minor

---------

Co-authored-by: Iwan Kawrakow
---
 src/llama.cpp | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index d0f76c49..38a2b299 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -19980,6 +19980,7 @@ struct llama_context_params llama_context_default_params() {
         /*.thresh_experts      =*/ 0.0f,
         /*.abort_callback      =*/ nullptr,
         /*.abort_callback_data =*/ nullptr,
+        /*.offload_policy      =*/ nullptr,
     };
 
     return result;
@@ -20574,6 +20575,19 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
+    if (params.offload_policy) {
+        const std::vector<std::pair<int,int>>& policy = *(const std::vector<std::pair<int,int>>*)params.offload_policy;
+        for (auto [op, on_off] : policy) {
+            if (op < 0 || op >= int(GGML_OP_COUNT)) {
+                LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting offload policy for all ops to %s\n", on_off ? "ON" : "OFF");
+            } else {
+                LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting offload policy for op %s to %s\n",
+                        ggml_op_name(ggml_op(op)), on_off ? "ON" : "OFF");
+            }
+            ggml_backend_sched_set_op_offload(ctx->sched, ggml_op(op), on_off);
+        }
+    }
+
     return ctx;
 }
 
@@ -23222,3 +23236,10 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
     fputs(text, stderr);
     fflush(stderr);
 }
+
+void llama_set_offload_policy(struct llama_context * lctx, int op, bool on_or_off) {
+    if (!lctx || !lctx->sched) return;
+    const char * op_name = op < 0 || op >= int(GGML_OP_COUNT) ? "all ops" : ggml_op_name(ggml_op(op));
+    printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXX offload(%s) = %d\n", op_name, on_or_off);
+    ggml_backend_sched_set_op_offload(lctx->sched, ggml_op(op), on_or_off);
+}
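For context, below is a minimal usage sketch of the two hooks this patch adds. It is not part of the patch itself: the element type of the policy vector (std::pair<int,int>, op index plus on/off flag) is inferred from the cast and structured binding in the diff, the surrounding model/context boilerplate assumes the standard llama.cpp C API, and the model path is a placeholder. As the diff shows, an op index outside [0, GGML_OP_COUNT) means "all ops", and the policy passed at context creation is applied once while the context is built.

// Usage sketch under the assumptions stated above.
#include <utility>
#include <vector>
#include "ggml.h"
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);  // placeholder path

    // Option 1: hand a policy to llama_new_context_with_model() through the new
    // offload_policy pointer (assumed to be a type-erased pointer to this vector type,
    // per the cast in the diff).
    std::vector<std::pair<int,int>> policy = {
        { -1,              0 },   // first turn offload off for all ops ...
        { GGML_OP_MUL_MAT, 1 },   // ... then turn it back on for matrix multiplications
    };
    llama_context_params cparams = llama_context_default_params();
    cparams.offload_policy = &policy;
    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // Option 2: flip the policy for a single op on an existing context.
    llama_set_offload_policy(ctx, GGML_OP_MUL_MAT, false);

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}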