llama : refactor k-shift implementation + KV defragmentation (#5691)

* llama : refactor k-shift implementation ggml-ci * llama : rename llama_kv_cache_seq_shift to llama_kv_cache_seq_add * llama : cont k-shift refactoring + normalize type names ggml-ci * minor : fix MPI builds * llama : reuse n_rot from the build context ggml-ci * llama : revert enum name changes from this PR ggml-ci * llama : update llama_rope_type * llama : add comment about rope values * llama : fix build * passkey : apply kv cache updates explicitly ggml-ci * llama : change name to llama_kv_cache_update() * llama : add llama_kv_cache_seq_pos_max() * passkey : fix llama_kv_cache_seq_pos_max() usage * llama : some llama_kv_cell simplifications * llama : add llama_kv_cache_compress (EXPERIMENTAL) * llama : add alternative KV cache merging (EXPERIMENTAL) * llama : add llama_kv_cache_defrag * llama : comments * llama : remove llama_kv_cache_compress will add in a separate PR ggml-ci * llama : defragment via non-overlapping moves * llama : ggml_graph based defrag implementation ggml-ci * llama : switch the loop order in build_defrag * llama : add comments
author: Georgi Gerganov <ggerganov@gmail.com> 2024-02-25 22:12:24 +0200
committer: GitHub <noreply@github.com> 2024-02-25 22:12:24 +0200
commit: bf08e00643fd529f748f0a858fd79f3061e3fa18 (patch)
tree: 0043ee582e83a19c8f1ca6d75d1519038f866e1c /examples/main/main.cpp
parent: f7625019c51ca437a5840576d92362cfa710e4a2 (diff)
1 files changed, 5 insertions, 5 deletions
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 7555dffe..34e84d0d 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -548,8 +548,8 @@ int main(int argc, char ** argv) {
                     LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                             n_past, n_left, n_ctx, params.n_keep, n_discard);
 
-                    llama_kv_cache_seq_rm   (ctx, 0, params.n_keep            , params.n_keep + n_discard);
-                    llama_kv_cache_seq_shift(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+                    llama_kv_cache_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
+                    llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
 
                     n_past -= n_discard;
 
@@ -576,9 +576,9 @@ int main(int argc, char ** argv) {
                     LOG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
                     LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
 
-                    llama_kv_cache_seq_shift(ctx, 0, ga_i,                n_past,              ib*bd);
-                    llama_kv_cache_seq_div  (ctx, 0, ga_i + ib*bd,        ga_i + ib*bd + ga_w, ga_n);
-                    llama_kv_cache_seq_shift(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd,      dd);
+                    llama_kv_cache_seq_add(ctx, 0, ga_i,                n_past,              ib*bd);
+                    llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd,        ga_i + ib*bd + ga_w, ga_n);
+                    llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd,      dd);
 
                     n_past -= bd;
author	Georgi Gerganov <ggerganov@gmail.com>	2024-02-25 22:12:24 +0200
committer	GitHub <noreply@github.com>	2024-02-25 22:12:24 +0200
commit	bf08e00643fd529f748f0a858fd79f3061e3fa18 (patch)
tree	0043ee582e83a19c8f1ca6d75d1519038f866e1c /examples/main/main.cpp
parent	f7625019c51ca437a5840576d92362cfa710e4a2 (diff)