summary refs log tree commit diff
path: root/llama.h
diff options
context:
space:
mode:
Diffstat (limited to 'llama.h')
-rw-r--r-- llama.h 57
1 files changed, 54 insertions, 3 deletions
diff --git a/llama.h b/llama.h
index 70e8fda4..1a62058d 100644
--- a/llama.h
+++ b/llama.h
@@ -361,9 +361,60 @@ extern "C" {
// KV cache
//
- // Returns the number of tokens in the KV cache
- LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
- "avoid using this, it will be removed in the future, instead - count the tokens in user code");
+ // Per-cell information exposed through llama_kv_cache_view (see its `cells` member).
+ struct llama_kv_cache_view_cell {
+ // Position stored in this cell, with KV cache shifts already taken into account.
+ // May be negative if the cell is not populated.
+ llama_pos pos;
+ };
+
+ // An updatable view of the KV cache; refresh it with llama_kv_cache_view_update().
+ struct llama_kv_cache_view {
+ // Number of KV cache cells. This will be the same as the context size.
+ int32_t n_cells;
+
+ // Maximum number of sequences that can exist in a cell. It's not an error
+ // if there are more sequences in a cell than this value; however, the
+ // excess sequences will not be visible in cells_sequences.
+ int32_t n_max_seq;
+
+ // Number of tokens in the cache. A cell is counted once per sequence id it
+ // holds: e.g. with two populated cells, the first holding 1 sequence id and
+ // the second holding 2 sequence ids, token_count is 3.
+ int32_t token_count;
+
+ // Number of populated cache cells.
+ int32_t used_cells;
+
+ // Length of the longest run of contiguous empty slots in the cache.
+ int32_t max_contiguous;
+
+ // Index of the first slot in the max_contiguous range. Can be negative
+ // when the cache is full.
+ int32_t max_contiguous_idx;
+
+ // Per-cell information (presumably an array of n_cells entries - TODO confirm).
+ struct llama_kv_cache_view_cell * cells;
+
+ // The sequence ids for each cell. There will be n_max_seq items per cell.
+ llama_seq_id * cells_sequences;
+ };
+
+ // Create an empty KV cache view exposing up to n_max_seq sequence ids per cell. (use only for debugging purposes)
+ LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+
+ // Free a KV cache view. (use only for debugging purposes)
+ LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+
+ // Refresh the given KV cache view with the current state of ctx's KV cache. (use only for debugging purposes)
+ LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+
+ // Returns the number of tokens in the KV cache (slow, use only for debug)
+ // If a KV cell has multiple sequences assigned to it, it is counted once per sequence
+ LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+
+ // Returns the number of used KV cells (i.e. cells that have at least one sequence assigned to them)
+ LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);
// Clear the KV cache
LLAMA_API void llama_kv_cache_clear(