author     Georgi Gerganov <ggerganov@gmail.com>  2024-01-17 18:39:41 +0200
committer  GitHub <noreply@github.com>            2024-01-17 18:39:41 +0200
commit     44a1a4a41a4c0b03afaa7d9e06bcbc7cf95aa1e6 (patch)
tree       3c0973be05046780e14ca8048b7dbe1372aa5833 /llama.h
parent     c918fe8dca8fa1c4602427e0a4b88e20046f6c34 (diff)
backend : add eval callback (#4935)
* backend : add eval callback ggml-ci
* backend : group nodes in a single compute when the user doesn't need them
* backend : clean up the implementation ggml-ci
* simple : do not perform tensor data copy if not needed
* simple : fix
* simple : no need for ggml_is_contiguous + fix bool parse
* llama : fix callback placement in llama_context_params
* backend : avoid double-ask callback calls
* simple : restore examples; imatrix will serve as a demo
Diffstat (limited to 'llama.h')
-rw-r--r--  llama.h | 4 ++++
1 file changed, 4 insertions(+), 0 deletions(-)
diff --git a/llama.h b/llama.h
index a570b0d6..e268d7a1 100644
--- a/llama.h
+++ b/llama.h
@@ -2,6 +2,7 @@
 #define LLAMA_H
 
 #include "ggml.h"
+#include "ggml-backend.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
@@ -231,6 +232,9 @@ extern "C" {
         float    yarn_beta_slow;   // YaRN high correction dim
         uint32_t yarn_orig_ctx;    // YaRN original context size
 
+        ggml_backend_sched_eval_callback cb_eval;
+        void * cb_eval_user_data;
+
         enum ggml_type type_k; // data type for K cache
         enum ggml_type type_v; // data type for V cache
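
For context, the two new fields wire llama.cpp into the scheduler eval callback that this same PR introduces in ggml-backend.h. Below is a minimal usage sketch, not part of the commit: the name my_eval_cb is hypothetical, and it assumes the two-phase protocol of ggml_backend_sched_eval_callback (the callback is first invoked with ask == true to check whether a tensor is of interest, then with ask == false once that tensor's data is available; returning false from the data phase aborts the compute). The imatrix example mentioned in the commit message as the demo follows the same pattern.

// Minimal sketch (hypothetical names): observe every GGML_OP_MUL_MAT result
// computed during evaluation by registering an eval callback in
// llama_context_params.

#include <stdio.h>
#include "llama.h"

// Called twice per tensor by the backend scheduler:
//  - ask == true : return whether we want to see this tensor's data
//  - ask == false: the tensor data is available; return true to continue
static bool my_eval_cb(struct ggml_tensor * t, bool ask, void * user_data) {
    int * n_seen = (int *) user_data;
    if (ask) {
        return t->op == GGML_OP_MUL_MAT; // only observe matrix multiplications
    }
    (*n_seen)++;
    fprintf(stderr, "mul_mat #%d: %s [%lld, %lld]\n",
            *n_seen, t->name, (long long) t->ne[0], (long long) t->ne[1]);
    return true; // returning false would abort the graph compute
}

int main(void) {
    int n_seen = 0;
    struct llama_context_params cparams = llama_context_default_params();
    cparams.cb_eval           = my_eval_cb;
    cparams.cb_eval_user_data = &n_seen;
    // ... load a model with llama_load_model_from_file(), create the context
    // with llama_new_context_with_model(model, cparams), then decode as
    // usual; the callback fires during each llama_decode() call ...
    return 0;
}

The ask phase is what allows the scheduler to group nodes the user does not care about into a single compute, which is what the second bullet of the commit message refers to.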