Diffstat (limited to 'common')
-rw-r--r--  common/common.cpp  79
-rw-r--r--  common/common.h    11
2 files changed, 90 insertions, 0 deletions
diff --git a/common/common.cpp b/common/common.cpp
index eec704b9..1dcc235e 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -12,6 +12,7 @@
 #include <regex>
 #include <sstream>
 #include <string>
+#include <unordered_map>
 #include <unordered_set>
 #include <vector>
 #include <cinttypes>
@@ -495,6 +496,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             params.chatml = true;
         } else if (arg == "--infill") {
             params.infill = true;
+        } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
+            params.dump_kv_cache = true;
         } else if (arg == "--multiline-input") {
             params.multiline_input = true;
         } else if (arg == "--simple-io") {
@@ -835,6 +838,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif // GGML_USE_CUBLAS
 #endif
     printf("  --verbose-prompt      print prompt before generation\n");
+    printf("  -dkvc, --dump-kv-cache\n");
+    printf("                        verbose print of the KV cache\n");
     printf("  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
     printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
     printf("  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
@@ -1386,3 +1391,77 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
}
+
+//
+// KV cache utils
+//
+
+void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
+    static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+"; // '.' = empty cell, then one symbol per sequence count, '+' = overflow
+
+    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
+        view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+
+    llama_kv_cache_view_cell * c_curr = view.cells;
+    llama_seq_id * cs_curr = view.cells_sequences;
+
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+        if (i % row_size == 0) {
+            printf("\n%5d: ", i);
+        }
+        int seq_count = 0;
+        for (int j = 0; j < view.n_max_seq; j++) {
+            if (cs_curr[j] >= 0) { seq_count++; }
+        }
+        putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
+    }
+
+    printf("\n=== Done dumping\n");
+}
+
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
+    static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; // legend symbols, one per distinct seq_id
+
+    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
+        view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+
+    std::unordered_map<llama_seq_id, size_t> seqs;
+    llama_kv_cache_view_cell * c_curr = view.cells;
+    llama_seq_id * cs_curr = view.cells_sequences;
+
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+        for (int j = 0; j < view.n_max_seq; j++) {
+            if (cs_curr[j] < 0) { continue; }
+            if (seqs.find(cs_curr[j]) == seqs.end()) {
+                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
+                seqs.emplace(cs_curr[j], seqs.size()); // evaluate size() before insertion; operator[] would leave the order unspecified pre-C++17
+            }
+        }
+        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
+    }
+
+    printf("=== Sequence legend: ");
+    for (const auto & it : seqs) {
+        printf("%zu=%d, ", it.second, it.first);
+    }
+    printf("'+'=other sequence ids");
+
+    c_curr = view.cells;
+    cs_curr = view.cells_sequences;
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+        if (i % row_size == 0) {
+            printf("\n%5d: ", i);
+        }
+        for (int j = 0; j < view.n_max_seq; j++) {
+            if (cs_curr[j] >= 0) {
+                const auto & it = seqs.find(cs_curr[j]);
+                putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
+            } else {
+                putchar('.');
+            }
+        }
+        putchar(' ');
+    }
+
+    printf("\n=== Done dumping\n");
+}
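
For reference, a minimal sketch of how these dump helpers are intended to be driven. The wiring below is an assumption, not part of this diff: the llama_kv_cache_view_init/update/free functions live in llama.h, outside the 'common' path shown here, and `ctx` is presumed to be an initialized llama_context.

    // Assumed usage sketch: build a view, refresh it after decoding, dump, free.
    llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 4 /* n_max_seq */);

    // ... after each llama_decode() call ...
    llama_kv_cache_view_update(ctx, &kvc_view);
    dump_kv_cache_view(kvc_view, 80);      // compact map: one char per cell
    dump_kv_cache_view_seqs(kvc_view, 40); // verbose map: one char per cell/sequence slot

    llama_kv_cache_view_free(&kvc_view);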
diff --git a/common/common.h b/common/common.h
index 88fa13fc..2f6fe48a 100644
--- a/common/common.h
+++ b/common/common.h
@@ -122,6 +122,7 @@ struct gpt_params {
     bool numa              = false; // attempt optimizations that help on some NUMA systems
     bool verbose_prompt    = false; // print prompt tokens before generation
     bool infill            = false; // use infill mode
+    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
 
     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector
@@ -218,3 +219,13 @@ std::string get_sortable_timestamp();
 void dump_non_result_info_yaml(
     FILE * stream, const gpt_params & params, const llama_context * lctx,
     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
+
+//
+// KV cache utils
+//
+
+// Dump the KV cache view with the number of sequences per cell.
+void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
+
+// Dump the KV cache view showing individual sequences in each cell (long output).
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
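
Note that -dkvc only sets params.dump_kv_cache; this diff does not connect the flag to any example program. A hypothetical consumer (the surrounding names are assumed, matching the sketch above) would gate the dump on it:

    // Hypothetical example-side wiring for the new flag (not part of this diff).
    if (params.dump_kv_cache) {
        llama_kv_cache_view_update(ctx, &kvc_view);
        dump_kv_cache_view_seqs(kvc_view, 40);
    }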