Avoid rebuild of GGML graph for each token (#98)

Introduces caching of the GGML graph to avoid an unnecessary full rebuild for each token. KV cache parameters, which change with each token, are updated directly in the cached GGML graph. Graph caching can be disabled with the GGML_DISABLE_GRAPH_CACHING environment variable.
author: agray3 <agray3@users.noreply.github.com> 2024-10-20 07:36:16 +0100
committer: GitHub <noreply@github.com> 2024-10-20 08:36:16 +0200
commit: f2d315b46f7aacc7df4b86bd8acba387b30e11ca (patch)
tree: 05df4e87b268b03a68f18907c9ee7492f9159518 /ggml/include/ggml.h
parent: afbf2ef3e263e5bee3150cf9c422039aa406f10d (diff)
1 files changed, 7 insertions, 0 deletions
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index a467c297..a99dc6b5 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -597,6 +597,13 @@ extern "C" {
         GGML_TENSOR_FLAG_PARAM  = 4,
     };
 
+    // Flag (used on GGML_OP_CPY nodes) indicating whether the node is associated with the K or V cache
+    enum ggml_kv_cache_flag {
+        GGML_KV_CACHE_FLAG_NONE = 0, // node is not associated with the K or V cache
+        GGML_KV_CACHE_FLAG_K = 1, // node is associated with the K cache
+        GGML_KV_CACHE_FLAG_V = 2 // node is associated with the V cache
+    };
+
     // ggml object
     struct ggml_object {
         size_t offs;
author	agray3 <agray3@users.noreply.github.com>	2024-10-20 07:36:16 +0100
committer	GitHub <noreply@github.com>	2024-10-20 08:36:16 +0200
commit	f2d315b46f7aacc7df4b86bd8acba387b30e11ca (patch)
tree	05df4e87b268b03a68f18907c9ee7492f9159518 /ggml/include/ggml.h
parent	afbf2ef3e263e5bee3150cf9c422039aa406f10d (diff)