summary | refs | log | tree | commit | diff
path: root/examples/server/server.cpp
diff options
context:
space:
mode:
author: Georgi Gerganov <ggerganov@gmail.com> 2023-10-12 09:29:04 +0300
committer: GitHub <noreply@github.com> 2023-10-12 09:29:04 +0300
commit: 57dd55e2c742bfc50e0f5c6fb95c14118cff44f6 (patch)
tree: 9e59457c5d640bdaf5e7fff4a54856c84a175216 /examples/server/server.cpp
parent: b8fe4b5cc9cb237ca98e5bc51b5d189e3c446d13 (diff)
server : fix kv cache management (#3588)
Diffstat (limited to 'examples/server/server.cpp')
-rw-r--r-- examples/server/server.cpp 10
1 files changed, 7 insertions, 3 deletions
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index d992feee..ee0ababb 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -405,6 +405,7 @@ struct llama_server_context
// compare the evaluated prompt with the new prompt
n_past = common_part(embd, prompt_tokens);
embd = prompt_tokens;
+
if (n_past == num_prompt_tokens)
{
// we have to evaluate at least 1 token to generate logits.
@@ -412,6 +413,9 @@ struct llama_server_context
n_past--;
}
+ // since #3228 we now have to manually manage the KV cache
+ llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
LOG_VERBOSE("prompt ingested", {
{"n_past", n_past},
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
@@ -461,9 +465,6 @@ struct llama_server_context
// compare the evaluated prompt with the new prompt
n_past = common_part(embd, prompt_tokens);
- // since #3228 we now have to manually manage the KV cache
- llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
-
embd = prompt_tokens;
if (n_past == num_prompt_tokens)
{
@@ -471,6 +472,9 @@ struct llama_server_context
n_past--;
}
+ // since #3228 we now have to manually manage the KV cache
+ llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
LOG_VERBOSE("prompt ingested", {
{"n_past", n_past},
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},