diff options
author | agray3 <agray3@users.noreply.github.com> | 2024-10-20 07:36:16 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-10-20 08:36:16 +0200 |
commit | f2d315b46f7aacc7df4b86bd8acba387b30e11ca (patch) | |
tree | 05df4e87b268b03a68f18907c9ee7492f9159518 /ggml/src | |
parent | afbf2ef3e263e5bee3150cf9c422039aa406f10d (diff) |
Avoid rebuild of GGML graph for each token (#98)
Introduces caching of GGML graph to avoid unnecessary full rebuild between each token.
KV cache parameters, which change with each token, are updated directly in cached GGML
graph. Can be disabled with GGML_DISABLE_GRAPH_CACHING environment variable.
Diffstat (limited to 'ggml/src')
-rw-r--r-- | ggml/src/ggml-backend.c | 45 |
1 files changed, 37 insertions, 8 deletions
diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index e1651cc6..76d37f74 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -1040,6 +1040,13 @@ struct ggml_backend_sched_split { struct ggml_cgraph graph; }; +// Object to facilitate GML graph caching +struct ggml_cached_graph { + bool is_active; + ggml_backend_t input_backend; + struct ggml_tensor * input_cpy[GGML_SCHED_MAX_SPLIT_INPUTS]; +}; + struct ggml_backend_sched { bool is_reset; // true if the scheduler has been reset since the last graph split bool is_alloc; @@ -1085,6 +1092,8 @@ struct ggml_backend_sched { size_t context_buffer_size; bool debug; + + struct ggml_cached_graph cached_graph; }; #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor) @@ -1762,6 +1771,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s struct ggml_tensor * input = split->inputs[j]; struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy); + if (!sched->cached_graph.is_active) { + sched->cached_graph.input_backend = input_backend; + sched->cached_graph.input_cpy[j] = input_cpy; + } else { + input_backend = sched->cached_graph.input_backend; + input_cpy = sched->cached_graph.input_cpy[j]; + } + if (input->flags & GGML_TENSOR_FLAG_INPUT) { // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done if (sched->events[split_backend_id][sched->cur_copy] != NULL) { @@ -1893,6 +1910,8 @@ ggml_backend_sched_t ggml_backend_sched_new( ggml_backend_sched_reset(sched); + sched->cached_graph.is_active = false; + return sched; } @@ -1969,16 +1988,16 @@ enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, st } enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - if (!sched->is_reset && !sched->is_alloc) { - ggml_backend_sched_reset(sched); - } - - if (!sched->is_alloc) { - if (!ggml_backend_sched_alloc_graph(sched, graph)) { - return GGML_STATUS_ALLOC_FAILED; + if(!sched->cached_graph.is_active) { + if (!sched->is_reset && !sched->is_alloc) { + ggml_backend_sched_reset(sched); + } + if (!sched->is_alloc) { + if (!ggml_backend_sched_alloc_graph(sched, graph)) { + return GGML_STATUS_ALLOC_FAILED; + } } } - return ggml_backend_sched_compute_splits(sched); } @@ -2243,3 +2262,13 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t return true; } + +bool ggml_use_cached_graph(ggml_backend_sched_t sched) { + return sched->cached_graph.is_active; +} + +void ggml_set_cached_graph(ggml_backend_sched_t sched, bool set_value) { + sched->cached_graph.is_active = set_value; +} + + |