author     Georgi Gerganov <ggerganov@gmail.com>  2024-01-17 18:39:41 +0200
committer  GitHub <noreply@github.com>            2024-01-17 18:39:41 +0200
commit     44a1a4a41a4c0b03afaa7d9e06bcbc7cf95aa1e6 (patch)
tree       3c0973be05046780e14ca8048b7dbe1372aa5833 /llama.h
parent     c918fe8dca8fa1c4602427e0a4b88e20046f6c34 (diff)
backend : add eval callback (#4935)
* backend : add eval callback ggml-ci
* backend : group nodes in a single compute when the user doesn't need them
* backend : clean up the implementation ggml-ci
* simple : do not perform tensor data copy if not needed
* simple : fix
* simple : no need for ggml_is_contiguous + fix bool parse
* llama : fix callback placement in llama_context_params
* backend : avoid double-ask callback calls
* simple : restore examples; imatrix will serve as a demo
Diffstat (limited to 'llama.h')
-rw-r--r--  llama.h | 4 ++++
1 file changed, 4 insertions(+), 0 deletions(-)
diff --git a/llama.h b/llama.h
index a570b0d6..e268d7a1 100644
--- a/llama.h
+++ b/llama.h
@@ -2,6 +2,7 @@
 #define LLAMA_H
 
 #include "ggml.h"
+#include "ggml-backend.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
@@ -231,6 +232,9 @@ extern "C" {
         float    yarn_beta_slow;   // YaRN high correction dim
         uint32_t yarn_orig_ctx;    // YaRN original context size
 
+        ggml_backend_sched_eval_callback cb_eval;
+        void * cb_eval_user_data;
+
         enum ggml_type type_k; // data type for K cache
         enum ggml_type type_v; // data type for V cache
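
For context, the two new fields wire llama.cpp into the scheduler eval callback that this same PR introduces in ggml-backend.h. Below is a minimal usage sketch, not part of the commit: the name my_eval_cb is hypothetical, and it assumes the two-phase protocol of ggml_backend_sched_eval_callback (the callback is first invoked with ask == true to check whether a tensor is of interest, then with ask == false once that tensor's data is available; returning false from the data phase aborts the compute). The imatrix example mentioned in the commit message as the demo follows the same pattern.

// Minimal sketch (hypothetical names): observe every GGML_OP_MUL_MAT result
// computed during evaluation by registering an eval callback in
// llama_context_params.

#include <stdio.h>
#include "llama.h"

// Called twice per tensor by the backend scheduler:
//  - ask == true : return whether we want to see this tensor's data
//  - ask == false: the tensor data is available; return true to continue
static bool my_eval_cb(struct ggml_tensor * t, bool ask, void * user_data) {
    int * n_seen = (int *) user_data;
    if (ask) {
        return t->op == GGML_OP_MUL_MAT; // only observe matrix multiplications
    }
    (*n_seen)++;
    fprintf(stderr, "mul_mat #%d: %s [%lld, %lld]\n",
            *n_seen, t->name, (long long) t->ne[0], (long long) t->ne[1]);
    return true; // returning false would abort the graph compute
}

int main(void) {
    int n_seen = 0;
    struct llama_context_params cparams = llama_context_default_params();
    cparams.cb_eval           = my_eval_cb;
    cparams.cb_eval_user_data = &n_seen;
    // ... load a model with llama_load_model_from_file(), create the context
    // with llama_new_context_with_model(model, cparams), then decode as
    // usual; the callback fires during each llama_decode() call ...
    return 0;
}

The ask phase is what allows the scheduler to group nodes the user does not care about into a single compute, which is what the second bullet of the commit message refers to.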