From f2d315b46f7aacc7df4b86bd8acba387b30e11ca Mon Sep 17 00:00:00 2001
From: agray3
Date: Sun, 20 Oct 2024 07:36:16 +0100
Subject: Avoid rebuild of GGML graph for each token (#98)

Introduces caching of GGML graph to avoid unnecessary full rebuild
between each token. KV cache parameters, which change with each token,
are updated directly in cached GGML graph. Can be disabled with
GGML_DISABLE_GRAPH_CACHING environment variable.
---
 ggml/include/ggml-backend.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'ggml/include/ggml-backend.h')

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 5f3f1e28..621620bc 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -232,6 +232,12 @@ extern "C" {
     GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
     GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
 
+    // Utility to query whether cached GGML graph is in use
+    GGML_API bool ggml_use_cached_graph(ggml_backend_sched_t sched);
+
+    // Set whether or not to use GGML graph caching
+    GGML_API void ggml_set_cached_graph(ggml_backend_sched_t sched, bool set_value);
+
 #ifdef __cplusplus
 }
-- 
cgit v1.2.3