summaryrefslogtreecommitdiff
path: root/examples/server/server.cpp
diff options
context:
space:
mode:
author Georgi Gerganov <ggerganov@gmail.com> 2024-01-07 08:45:26 +0200
committer GitHub <noreply@github.com> 2024-01-07 08:45:26 +0200
commit 67984921a70a7e680a24494aeb7575a66e90685d (patch)
tree bc793325b1a12bec84c14c06f40f3ed8b14faa5c /examples/server/server.cpp
parent c75ca5d96f902564cbbbdd7f5cade80d53c288bb (diff)
server : fix n_predict check (#4798)
Diffstat (limited to 'examples/server/server.cpp')
-rw-r--r-- examples/server/server.cpp | 15
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index d1469fb0..6c7fcd17 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -447,8 +447,14 @@ struct llama_client_slot
}
bool has_budget(gpt_params &global_params) {
+ if (params.n_predict == -1 && global_params.n_predict == -1)
+ {
+ return true; // limitless
+ }
+
n_remaining = -1;
- if(params.n_predict != -1)
+
+ if (params.n_predict != -1)
{
n_remaining = params.n_predict - n_decoded;
}
@@ -456,7 +462,8 @@ struct llama_client_slot
{
n_remaining = global_params.n_predict - n_decoded;
}
- return n_remaining > 0 || n_remaining == -1; // no budget || limitless
+
+ return n_remaining > 0; // no budget
}
bool available() const {
@@ -1102,7 +1109,7 @@ struct llama_server_context
}
// check the limits
- if (slot.n_decoded > 2 && slot.has_next_token && !slot.has_budget(params))
+ if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params))
{
slot.stopped_limit = true;
slot.has_next_token = false;
@@ -1703,7 +1710,6 @@ struct llama_server_context
llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, { slot.id }, true);
- slot.n_decoded += 1;
slot.n_past += 1;
}
@@ -1921,6 +1927,7 @@ struct llama_server_context
llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
+ slot.n_decoded += 1;
if (slot.n_decoded == 1)
{
slot.t_start_genereration = ggml_time_us();