Avoid rebuild of GGML graph for each token (#98)

Introduces caching of the GGML graph to avoid an unnecessary full rebuild for each token. KV cache parameters, which change with every token, are updated directly in the cached GGML graph. Caching can be disabled by setting the GGML_DISABLE_GRAPH_CACHING environment variable.
author: agray3 <agray3@users.noreply.github.com> 2024-10-20 07:36:16 +0100
committer: GitHub <noreply@github.com> 2024-10-20 08:36:16 +0200
commit: f2d315b46f7aacc7df4b86bd8acba387b30e11ca (patch)
tree: 05df4e87b268b03a68f18907c9ee7492f9159518 /src
parent: afbf2ef3e263e5bee3150cf9c422039aa406f10d (diff)
1 files changed, 111 insertions, 5 deletions
diff --git a/src/llama.cpp b/src/llama.cpp
index c950a46d..c5df16e3 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8,6 +8,7 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
+#include "../ggml/src/ggml-impl.h"
 
 #ifdef GGML_USE_RPC
 #  include "ggml-rpc.h"
@@ -2659,6 +2660,17 @@ struct llama_model {
     }
 };
 
+// Per-context cache of the most recent GGML graph, so it can be reused between tokens where possible.
+struct ggml_cached_graph {
+    bool is_active = false;                  // whether graph caching should be used for future tokens
+    ggml_cgraph * gf = nullptr;              // graph used by the most recent decode
+    size_t n = 0;                            // kv_self.n seen on the previous decode; a change forces a rebuild
+    ggml_backend_t backend_res = nullptr;    // backend of `res`, recorded while the cached graph was not in use
+    ggml_backend_t backend_embd = nullptr;   // backend of `embd`, recorded while the cached graph was not in use
+    struct ggml_tensor * res = nullptr;      // output tensor selected at the last graph build
+    struct ggml_tensor * embd = nullptr;     // embeddings tensor selected at the last graph build (may be null)
+};
+
 struct llama_context {
     llama_context(const llama_model & model)
         : model(model)
@@ -2759,6 +2771,8 @@ struct llama_context {
     struct ggml_tensor * inp_pos_bucket;    // I32 [n_batch|n_kv, n_batch]
     struct ggml_tensor * inp_embd_enc;      // F32 [n_embd, n_outputs_enc]
     struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
+
+    struct ggml_cached_graph cached_graph;
 };
 
 struct llama_lora_weight {
@@ -14877,11 +14891,44 @@ static int llama_decode_internal(
         ggml_backend_sched_reset(lctx.sched);
         ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
 
-        ggml_cgraph * gf = llama_build_graph(lctx, u_batch, false);
+        ggml_cgraph * gf;
+        // the output is always the last tensor in the graph
+        struct ggml_tensor * res;
+        struct ggml_tensor * embd;
+
+        bool n_has_changed_since_last_token = false;
+        if(lctx.cached_graph.n != kv_self.n) n_has_changed_since_last_token = true;
+        lctx.cached_graph.n = kv_self.n;
+
+        // Re-build graph only if graph caching is not possible
+        if(!ggml_use_cached_graph(lctx.sched) || n_has_changed_since_last_token) {
+
+        gf = llama_build_graph(lctx, u_batch, false);
+
+        // Set whether GGML graph caching is in use within GGML module, based on
+        // whether caching was activated here during the previous token
+        ggml_set_cached_graph(lctx.sched,lctx.cached_graph.is_active);
+
+        // Disable future graph caching if the env var is set,
+        // if there are multiple devices, if the batch size is greater than 1,
+        // or if the number of splits is not 2.
+        // TODO: enable graph caching for these cases
+        bool disable_cached_ggml_graph = (getenv("GGML_DISABLE_GRAPH_CACHING") != nullptr)
+            || (llama_get_device_count(model) > 1)
+            || (ggml_backend_sched_get_n_splits(lctx.sched) != 2);
+        for (int i = 0 ; i < gf->n_nodes; i++) {
+            if (gf->nodes[i]->op == GGML_OP_ADD && gf->nodes[i]->src[1] && gf->nodes[i]->src[1]->ne[1] > 1) {
+                disable_cached_ggml_graph = true;
+                break;
+            }
+        }
+
+        // Set whether graph caching should be used for future tokens
+        lctx.cached_graph.is_active=!disable_cached_ggml_graph;
 
         // the output is always the last tensor in the graph
-        struct ggml_tensor * res  = gf->nodes[gf->n_nodes - 1];
-        struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
+        res  = gf->nodes[gf->n_nodes - 1];
+        embd = gf->nodes[gf->n_nodes - 2];
 
         if (lctx.n_outputs == 0) {
             // no output
@@ -14901,9 +14948,58 @@ static int llama_decode_internal(
             embd = nullptr; // do not extract embeddings when not needed
             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
         }
+        lctx.cached_graph.res = res;
+        lctx.cached_graph.embd = embd;
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
         ggml_backend_sched_alloc_graph(lctx.sched, gf);
+        }
+        else {
+            gf = lctx.cached_graph.gf;
+            res = lctx.cached_graph.res;
+            embd = lctx.cached_graph.embd;
+        }
+        lctx.cached_graph.gf = gf;
+
+        // Update K and V cache parameters in cached graph.
+        if(gf != nullptr && gf->nodes != nullptr && ggml_use_cached_graph(lctx.sched)) {
+
+            const struct llama_hparams & hparams = model.hparams;
+            const int64_t kv_head = kv_self.head;
+
+            for (int i = 0; i < gf->n_nodes; i++) {
+                ggml_tensor * node = gf->nodes[i];
+                if (node->op == GGML_OP_CPY) {
+
+                    // K cache
+                    const char* k_prefix = "k_cache_view-";
+                    if (strncmp(node->src[1]->name, k_prefix, strlen(k_prefix)) == 0) {
+                        int il = atoi(node->src[1]->name + strlen(k_prefix)); // Layer index from name
+                        const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+                        ggml_tensor * tmp_tensor =  kv_self.k_l[il];
+                        size_t tmp_offset = (ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa))*kv_head;
+                        node->src[1]->data = static_cast<char*>(tmp_tensor->data) + tmp_offset;
+                    }
+
+                    // V cache
+                    const char* v_prefix = "v_cache_view-";
+                    if (strncmp(node->src[1]->name, v_prefix, strlen(v_prefix)) == 0) {
+                        int il = atoi(node->src[1]->name + strlen(v_prefix)); // Layer index from name
+                        const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+                        ggml_tensor * tmp_tensor = kv_self.v_l[il];
+                        size_t tmp_offset;
+                        if (cparams.flash_attn) {
+                            tmp_offset = (kv_head)*ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
+                        } else {
+                            tmp_offset = (kv_head)*ggml_element_size(kv_self.v_l[il]);
+                        }
+                        node->src[1]->data = static_cast<char*>(tmp_tensor->data) + tmp_offset;
+                    }
+
+                }
+            }
+
+        }
 
         llama_set_inputs(lctx, u_batch);
 
@@ -14927,12 +15023,18 @@ static int llama_decode_internal(
         // extract logits
         if (res) {
             ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
-            GGML_ASSERT(backend_res != nullptr);
-            GGML_ASSERT(lctx.logits != nullptr);
 
             float * logits_out = lctx.logits + n_outputs_prev*n_vocab;
             const int32_t n_outputs_new = lctx.n_outputs;
 
+            if(!ggml_use_cached_graph(lctx.sched))
+                lctx.cached_graph.backend_res = backend_res;
+            else
+                backend_res = lctx.cached_graph.backend_res;
+
+            GGML_ASSERT(backend_res != nullptr);
+            GGML_ASSERT(lctx.logits != nullptr);
+
             if (n_outputs_new) {
                 GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
                 GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size);
@@ -14943,6 +15045,10 @@ static int llama_decode_internal(
         // extract embeddings
         if (embd) {
             ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
+            if(!ggml_use_cached_graph(lctx.sched))
+                lctx.cached_graph.backend_embd = backend_embd;
+            else
+                backend_embd = lctx.cached_graph.backend_embd;
             GGML_ASSERT(backend_embd != nullptr);
 
             switch (cparams.pooling_type) {
author	agray3 <agray3@users.noreply.github.com>	2024-10-20 07:36:16 +0100
committer	GitHub <noreply@github.com>	2024-10-20 08:36:16 +0200
commit	f2d315b46f7aacc7df4b86bd8acba387b30e11ca (patch)
tree	05df4e87b268b03a68f18907c9ee7492f9159518 /src
parent	afbf2ef3e263e5bee3150cf9c422039aa406f10d (diff)