author     Pierrick Hymbert <pierrick.hymbert@gmail.com>  2024-02-24 12:28:55 +0100
committer  GitHub <noreply@github.com>                    2024-02-24 12:28:55 +0100
commit     525213d2f5da1eaf4b922b6b792cb52b2c613368 (patch)
tree       8400e8a97d231b13a2df0c9d8b7c8fa945d24d5e /examples/server/server.cpp
parent     fd43d66f46ee3b5345fb8a74a252d86ccd34a409 (diff)
server: init functional tests (#5566)
* server: tests: init scenarios
  - health and slots endpoints
  - completion endpoint
  - OAI compatible chat completion requests w/ and without streaming
  - completion multi users scenario
  - multi users scenario on OAI compatible endpoint with streaming
  - multi users with total number of tokens to predict exceeds the KV Cache size
  - server wrong usage scenario, like in Infinite loop of "context shift" #3969
  - slots shifting
  - continuous batching
  - embeddings endpoint
  - multi users embedding endpoint: Segmentation fault #5655
  - OpenAI-compatible embeddings API
  - tokenize endpoint
  - CORS and api key scenario

* server: CI GitHub workflow

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
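The test scenarios themselves live outside this diff (the diffstat below is limited to server.cpp). As a rough illustration of the first scenario, here is a minimal, hypothetical C++ client sketch that polls the server's /health endpoint until it reports ready, the way a functional test would before issuing completion requests. It uses cpp-httplib, the same header-only HTTP library server.cpp is built on; the host, port, and retry budget are assumptions, not values taken from the commit.

```cpp
// Hypothetical sketch, not part of this commit: wait for /health to report
// ready before running any test scenario against the server.
// Host, port, and retry budget are assumptions; httplib.h is the
// cpp-httplib header bundled with examples/server.
#include <chrono>
#include <cstdio>
#include <thread>

#include "httplib.h"

static bool wait_for_health(const char * host, int port, int max_attempts) {
    httplib::Client cli(host, port);
    for (int attempt = 0; attempt < max_attempts; attempt++) {
        auto res = cli.Get("/health");
        // 200 means the server reports itself healthy (model loaded)
        if (res && res->status == 200) {
            return true;
        }
        std::this_thread::sleep_for(std::chrono::milliseconds(500));
    }
    return false;
}

int main() {
    if (!wait_for_health("127.0.0.1", 8080, 20)) {
        std::fprintf(stderr, "server never became healthy\n");
        return 1;
    }
    std::printf("server ready, scenarios can run\n");
    return 0;
}
```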
Diffstat (limited to 'examples/server/server.cpp')
-rw-r--r--  examples/server/server.cpp  36
1 file changed, 18 insertions, 18 deletions
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 524d0ada..9fb436c2 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1410,11 +1410,6 @@ struct llama_server_context
int n_processing_slots = 0;
for (llama_client_slot &slot: slots) {
- if (slot.available()) {
- n_idle_slots++;
- } else {
- n_processing_slots++;
- }
json slot_data = get_formated_generation(slot);
slot_data["id"] = slot.id;
slot_data["task_id"] = slot.task_id;
@@ -1429,6 +1424,11 @@ struct llama_server_context
{"stopped_limit", slot.stopped_limit},
{"stopping_word", slot.stopping_word},
};
+ if (slot_data["state"] == IDLE) {
+ n_idle_slots++;
+ } else {
+ n_processing_slots++;
+ }
slots_data.push_back(slot_data);
}
LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots);
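The two hunks above move the idle/processing counters from the slot.available() check to the "state" field of the per-slot JSON snapshot, so the counts reported in the log come from the same data that the slots endpoint returns. A standalone sketch of that counting pattern follows; the SlotState enum and the hard-coded snapshots are illustrative stand-ins for the real llama_client_slot serialization, not code from this commit.

```cpp
// Sketch only: classify slots from the same serialized snapshot that is
// reported to clients, so the counters cannot drift from the slot data.
// SlotState and the fabricated slots_data below are stand-ins.
#include <cstdio>
#include <vector>

#include "json.hpp" // nlohmann::json, already used by examples/server

using json = nlohmann::json;

enum SlotState { IDLE = 0, PROCESSING = 1 };

int main() {
    // pretend per-slot snapshots, as the monitoring task would build them
    std::vector<json> slots_data = {
        json{{"id", 0}, {"state", IDLE}},
        json{{"id", 1}, {"state", PROCESSING}},
        json{{"id", 2}, {"state", IDLE}},
    };

    int n_idle_slots       = 0;
    int n_processing_slots = 0;
    for (const auto & slot_data : slots_data) {
        if (slot_data["state"] == IDLE) {
            n_idle_slots++;
        } else {
            n_processing_slots++;
        }
    }

    std::printf("idle=%d processing=%d\n", n_idle_slots, n_processing_slots);
    return 0;
}
```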
@@ -2748,19 +2748,6 @@ int main(int argc, char **argv)
log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
}
- LOG_INFO("HTTP server listening", log_data);
- // run the HTTP server in a thread - see comment below
- std::thread t([&]()
- {
- if (!svr.listen_after_bind())
- {
- state.store(SERVER_STATE_ERROR);
- return 1;
- }
-
- return 0;
- });
-
// load the model
if (!llama.load_model(params))
{
@@ -3228,6 +3215,19 @@ int main(int argc, char **argv)
}*/
//);
+ LOG_INFO("HTTP server listening", log_data);
+ // run the HTTP server in a thread - see comment below
+ std::thread t([&]()
+ {
+ if (!svr.listen_after_bind())
+ {
+ state.store(SERVER_STATE_ERROR);
+ return 1;
+ }
+
+ return 0;
+ });
+
llama.queue_tasks.on_new_task(std::bind(
&llama_server_context::process_single_task, &llama, std::placeholders::_1));
llama.queue_tasks.on_finish_multitask(std::bind(