diff options
Diffstat (limited to 'examples')
-rw-r--r-- | examples/server/README.md | 2 | ||||
-rw-r--r-- | examples/server/server.cpp | 4 |
2 files changed, 3 insertions, 3 deletions
diff --git a/examples/server/README.md b/examples/server/README.md index e17595fe..cb1eb7c9 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -450,7 +450,7 @@ node index.js `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1` - `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false` + `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true` `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 360f571e..466bb339 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -112,7 +112,7 @@ struct server_task_multi { struct slot_params { bool stream = true; - bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt + bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half @@ -905,7 +905,7 @@ struct server_context { } slot.params.stream = json_value(data, "stream", false); - slot.params.cache_prompt = json_value(data, "cache_prompt", false); + slot.params.cache_prompt = json_value(data, "cache_prompt", true); slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict)); slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k); slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p); |