diff options
author | agray3 <agray3@users.noreply.github.com> | 2024-10-20 07:36:16 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-10-20 08:36:16 +0200 |
commit | f2d315b46f7aacc7df4b86bd8acba387b30e11ca (patch) | |
tree | 05df4e87b268b03a68f18907c9ee7492f9159518 /src | |
parent | afbf2ef3e263e5bee3150cf9c422039aa406f10d (diff) |
Avoid rebuild of GGML graph for each token (#98)
Introduces caching of GGML graph to avoid unnecessary full rebuild between each token.
KV cache parameters, which change with each token, are updated directly in cached GGML
graph. Can be disabled with GGML_DISABLE_GRAPH_CACHING environment variable.
Diffstat (limited to 'src')
-rw-r--r-- | src/llama.cpp | 116 |
1 files changed, 111 insertions, 5 deletions
diff --git a/src/llama.cpp b/src/llama.cpp index c950a46d..c5df16e3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8,6 +8,7 @@ #include "ggml.h" #include "ggml-alloc.h" #include "ggml-backend.h" +#include "../ggml/src/ggml-impl.h" #ifdef GGML_USE_RPC # include "ggml-rpc.h" @@ -2659,6 +2660,17 @@ struct llama_model { } }; +// Object used to allow caching of GGML graph between tokens where possible. +struct ggml_cached_graph { + bool is_active = false; + ggml_cgraph * gf; + size_t n; + ggml_backend_t backend_res; + ggml_backend_t backend_embd; + struct ggml_tensor * res; + struct ggml_tensor * embd; +}; + struct llama_context { llama_context(const llama_model & model) : model(model) @@ -2759,6 +2771,8 @@ struct llama_context { struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] + + struct ggml_cached_graph cached_graph; }; struct llama_lora_weight { @@ -14877,11 +14891,44 @@ static int llama_decode_internal( ggml_backend_sched_reset(lctx.sched); ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); - ggml_cgraph * gf = llama_build_graph(lctx, u_batch, false); + ggml_cgraph * gf; + // the output is always the last tensor in the graph + struct ggml_tensor * res; + struct ggml_tensor * embd; + + bool n_has_changed_since_last_token = false; + if(lctx.cached_graph.n != kv_self.n) n_has_changed_since_last_token = true; + lctx.cached_graph.n = kv_self.n; + + // Re-build graph only if graph caching is not possible + if(!ggml_use_cached_graph(lctx.sched) || n_has_changed_since_last_token) { + + gf = llama_build_graph(lctx, u_batch, false); + + // Set whether GGML graph caching is in use within GGML module, based on + // whether caching was activated here during the previous token + ggml_set_cached_graph(lctx.sched,lctx.cached_graph.is_active); + + // Disable future graph caching in presence of env var, + // if there are multiple devices, if batch size is greater than 1, + // or if nsplits is not 2. + // TO DO enable graph caching for these cases + bool disable_cached_ggml_graph = (getenv("GGML_DISABLE_GRAPH_CACHING") != nullptr) + || (llama_get_device_count(model) > 1) + || (ggml_backend_sched_get_n_splits(lctx.sched) != 2); + for (int i = 0 ; i < gf->n_nodes; i++) { + if (gf->nodes[i]->op == GGML_OP_ADD && gf->nodes[i]->src[1] && gf->nodes[i]->src[1]->ne[1] > 1) { + disable_cached_ggml_graph = true; + break; + } + } + + // Set whether graph caching should be used for future tokens + lctx.cached_graph.is_active=!disable_cached_ggml_graph; // the output is always the last tensor in the graph - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2]; + res = gf->nodes[gf->n_nodes - 1]; + embd = gf->nodes[gf->n_nodes - 2]; if (lctx.n_outputs == 0) { // no output @@ -14901,9 +14948,58 @@ static int llama_decode_internal( embd = nullptr; // do not extract embeddings when not needed GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); } + lctx.cached_graph.res = res; + lctx.cached_graph.embd = embd; // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); ggml_backend_sched_alloc_graph(lctx.sched, gf); + } + else { + gf = lctx.cached_graph.gf; + res = lctx.cached_graph.res; + embd = lctx.cached_graph.embd; + } + lctx.cached_graph.gf = gf; + + // Update K and V cache parameters in cached graph. + if(gf != nullptr && gf->nodes != nullptr && ggml_use_cached_graph(lctx.sched)) { + + const struct llama_hparams & hparams = model.hparams; + const int64_t kv_head = kv_self.head; + + for (int i = 0; i < gf->n_nodes; i++) { + ggml_tensor * node = gf->nodes[i]; + if (node->op == GGML_OP_CPY) { + + // K cache + const char* k_prefix = "k_cache_view-"; + if (strncmp(node->src[1]->name, k_prefix, strlen(k_prefix)) == 0) { + int il = atoi(node->src[1]->name + strlen(k_prefix)); // Layer index from name + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + ggml_tensor * tmp_tensor = kv_self.k_l[il]; + size_t tmp_offset = (ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa))*kv_head; + node->src[1]->data = static_cast<char*>(tmp_tensor->data) + tmp_offset; + } + + // V cache + const char* v_prefix = "v_cache_view-"; + if (strncmp(node->src[1]->name, v_prefix, strlen(v_prefix)) == 0) { + int il = atoi(node->src[1]->name + strlen(v_prefix)); // Layer index from name + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + ggml_tensor * tmp_tensor = kv_self.v_l[il]; + size_t tmp_offset; + if (cparams.flash_attn) { + tmp_offset = (kv_head)*ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); + } else { + tmp_offset = (kv_head)*ggml_element_size(kv_self.v_l[il]); + } + node->src[1]->data = static_cast<char*>(tmp_tensor->data) + tmp_offset; + } + + } + } + + } llama_set_inputs(lctx, u_batch); @@ -14927,12 +15023,18 @@ static int llama_decode_internal( // extract logits if (res) { ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res); - GGML_ASSERT(backend_res != nullptr); - GGML_ASSERT(lctx.logits != nullptr); float * logits_out = lctx.logits + n_outputs_prev*n_vocab; const int32_t n_outputs_new = lctx.n_outputs; + if(!ggml_use_cached_graph(lctx.sched)) + lctx.cached_graph.backend_res = backend_res; + else + backend_res = lctx.cached_graph.backend_res; + + GGML_ASSERT(backend_res != nullptr); + GGML_ASSERT(lctx.logits != nullptr); + if (n_outputs_new) { GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs); GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size); @@ -14943,6 +15045,10 @@ static int llama_decode_internal( // extract embeddings if (embd) { ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd); + if(!ggml_use_cached_graph(lctx.sched)) + lctx.cached_graph.backend_embd = backend_embd; + else + backend_embd = lctx.cached_graph.backend_embd; GGML_ASSERT(backend_embd != nullptr); switch (cparams.pooling_type) { |