From e986f94829bae0b9e66b326acbbba179931c84f1 Mon Sep 17 00:00:00 2001
From: Christian Falch <875252+chrfalch@users.noreply.github.com>
Date: Sun, 2 Apr 2023 12:23:04 +0200
Subject: Added API for getting/setting the kv_cache (#685)

The API provides access methods for retrieving the current memory buffer
of the kv_cache and its token count. It also contains a method for
setting the kv_cache from a memory buffer.

This makes it possible to load/save history - maybe support a
--cache-prompt parameter as well?

Co-authored-by: Pavol Rusnak
---
 llama.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/llama.h b/llama.h
index 258de5a9..04e2bf71 100644
--- a/llama.h
+++ b/llama.h
@@ -83,6 +83,23 @@ extern "C" {
             const char * fname_out,
                    int itype);
 
+    // Returns the KV cache that will contain the context for the
+    // ongoing prediction with the model.
+    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
+
+    // Returns the size of the KV cache
+    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+
+    // Returns the number of tokens in the KV cache
+    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+
+    // Sets the KV cache containing the current context for the model
+    LLAMA_API void llama_set_kv_cache(
+            struct llama_context * ctx,
+         const uint8_t * kv_cache,
+                size_t   n_size,
+                   int   n_token_count);
+
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
     // n_past is the number of tokens to use from previous eval calls
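
A minimal usage sketch (not part of the patch) of the new functions: snapshot the
KV cache after evaluating a prompt, then restore it later to resume from that
point. It assumes the rest of the llama.h C API at this revision
(llama_init_from_file, llama_context_default_params, llama_eval, llama_free);
the model path, prompt evaluation, and error handling are placeholders.

    // save/restore of the KV cache around an evaluation (illustrative only)
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include "llama.h"

    int main(void) {
        struct llama_context * ctx =
            llama_init_from_file("model.bin", llama_context_default_params());
        if (ctx == NULL) {
            return 1;
        }

        // ... evaluate some prompt tokens with llama_eval() ...

        // Snapshot the KV cache buffer and its token count.
        const size_t kv_size = llama_get_kv_cache_size(ctx);
        const int    kv_ntok = llama_get_kv_cache_token_count(ctx);

        uint8_t * kv_backup = (uint8_t *) malloc(kv_size);
        if (kv_backup == NULL) {
            llama_free(ctx);
            return 1;
        }
        memcpy(kv_backup, llama_get_kv_cache(ctx), kv_size);

        // ... continue prediction, possibly diverging from the saved state ...

        // Restore the saved context, e.g. to replay a cached prompt
        // without re-evaluating it.
        llama_set_kv_cache(ctx, kv_backup, kv_size, kv_ntok);

        free(kv_backup);
        llama_free(ctx);
        return 0;
    }

Whether the buffer remains valid across different contexts or process restarts is
not specified by this patch; the sketch only restores it into the same context it
was read from.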