author    | Pierrick Hymbert <pierrick.hymbert@gmail.com> | 2024-02-24 12:28:55 +0100
committer | GitHub <noreply@github.com>                   | 2024-02-24 12:28:55 +0100
commit    | 525213d2f5da1eaf4b922b6b792cb52b2c613368
tree      | 8400e8a97d231b13a2df0c9d8b7c8fa945d24d5e /examples/server/server.cpp
parent    | fd43d66f46ee3b5345fb8a74a252d86ccd34a409
server: init functional tests (#5566)
* server: tests: init scenarios
- health and slots endpoints (a minimal client sketch follows the commit message)
- completion endpoint
- OAI-compatible chat completion requests with and without streaming
- multi-user completion scenario
- multi-user scenario on the OAI-compatible endpoint with streaming
- multiple users whose total number of tokens to predict exceeds the KV cache size
- server wrong-usage scenario, as in "Infinite loop of context shift" (#3969)
- slot shifting
- continuous batching
- embeddings endpoint
- multi-user embedding endpoint (segmentation fault, #5655)
- OpenAI-compatible embeddings API
- tokenize endpoint
- CORS and API key scenario
* server: CI GitHub workflow
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
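
The health and slots scenarios above boil down to polling the server's HTTP endpoints and asserting on the JSON they return. The sketch below is not the test harness this commit adds; it is a minimal C++ illustration of such a readiness check, assuming cpp-httplib (the library the server itself uses, as the `svr.listen_after_bind()` call in the diff suggests) and a server already listening on localhost:8080. The port and the exact response semantics are assumptions.

```cpp
// Minimal readiness probe, NOT the test suite from this commit.
// Assumes cpp-httplib and a llama.cpp server on localhost:8080.
#include <cstdio>
#include "httplib.h"

int main() {
    // Assumed address: the actual tests' host/port may differ.
    httplib::Client cli("localhost", 8080);

    // GET /health: the kind of probe a test can poll before running scenarios.
    auto res = cli.Get("/health");
    if (!res) {
        std::fprintf(stderr, "request failed: is the server listening yet?\n");
        return 1;
    }

    // Treat HTTP 200 as "server ready"; anything else as still loading or failed.
    std::printf("status: %d\nbody: %s\n", res->status, res->body.c_str());
    return res->status == 200 ? 0 : 1;
}
```

A check like this is the natural gate for the multi-user scenarios, which need the server ready before issuing concurrent requests.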
Diffstat (limited to 'examples/server/server.cpp')
-rw-r--r-- | examples/server/server.cpp | 36
1 file changed, 18 insertions(+), 18 deletions(-)
```diff
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 524d0ada..9fb436c2 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1410,11 +1410,6 @@ struct llama_server_context
                 int n_processing_slots = 0;
 
                 for (llama_client_slot &slot: slots) {
-                    if (slot.available()) {
-                        n_idle_slots++;
-                    } else {
-                        n_processing_slots++;
-                    }
                     json slot_data = get_formated_generation(slot);
                     slot_data["id"] = slot.id;
                     slot_data["task_id"] = slot.task_id;
@@ -1429,6 +1424,11 @@ struct llama_server_context
                         {"stopped_limit", slot.stopped_limit},
                         {"stopping_word", slot.stopping_word},
                     };
+                    if (slot_data["state"] == IDLE) {
+                        n_idle_slots++;
+                    } else {
+                        n_processing_slots++;
+                    }
                     slots_data.push_back(slot_data);
                 }
                 LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots);
@@ -2748,19 +2748,6 @@ int main(int argc, char **argv)
         log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
     }
 
-    LOG_INFO("HTTP server listening", log_data);
-    // run the HTTP server in a thread - see comment below
-    std::thread t([&]()
-            {
-                if (!svr.listen_after_bind())
-                {
-                    state.store(SERVER_STATE_ERROR);
-                    return 1;
-                }
-
-                return 0;
-            });
-
     // load the model
     if (!llama.load_model(params))
     {
@@ -3228,6 +3215,19 @@ int main(int argc, char **argv)
                 }*/
             //);
 
+    LOG_INFO("HTTP server listening", log_data);
+    // run the HTTP server in a thread - see comment below
+    std::thread t([&]()
+            {
+                if (!svr.listen_after_bind())
+                {
+                    state.store(SERVER_STATE_ERROR);
+                    return 1;
+                }
+
+                return 0;
+            });
+
     llama.queue_tasks.on_new_task(std::bind(
         &llama_server_context::process_single_task, &llama, std::placeholders::_1));
     llama.queue_tasks.on_finish_multitask(std::bind(
```
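
The diff makes two self-contained changes. First, the idle/processing counters are now derived from the serialized `slot_data["state"]` field instead of `slot.available()`, so the counts reported by the slots endpoint always agree with the per-slot `state` the same response exposes. Second, the HTTP listener thread is moved from before model loading to just before the task queue starts: the server begins accepting connections only once the model is loaded and the route handlers are registered, which gives the functional tests a deterministic startup sequence to poll against.

Below is a minimal sketch of that startup ordering, assuming cpp-httplib; the `load_model` stub, the port, and the JSON bodies are placeholders for illustration, not the server's real code.

```cpp
// Sketch of the startup ordering the diff moves to: bind the port first,
// finish the heavy initialization, then start listening on a worker thread.
#include <atomic>
#include <thread>
#include "httplib.h"

enum server_state { SERVER_STATE_LOADING_MODEL, SERVER_STATE_READY, SERVER_STATE_ERROR };

// Placeholder for the heavyweight initialization the real server performs.
static bool load_model() { return true; }

int main() {
    std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
    httplib::Server svr;

    svr.Get("/health", [&](const httplib::Request &, httplib::Response &res) {
        // Report readiness from the shared state flag.
        res.set_content(state == SERVER_STATE_READY ? "{\"status\":\"ok\"}"
                                                    : "{\"status\":\"loading\"}",
                        "application/json");
    });

    // Bind early so port conflicts surface before the slow initialization.
    if (!svr.bind_to_port("127.0.0.1", 8080)) {
        return 1;
    }

    if (!load_model()) {
        return 1;
    }
    state = SERVER_STATE_READY;

    // Only now start serving requests, mirroring the ordering the diff adopts.
    std::thread t([&]() {
        if (!svr.listen_after_bind()) {
            state = SERVER_STATE_ERROR;
        }
    });
    t.join();
    return state == SERVER_STATE_ERROR ? 1 : 0;
}
```

Binding before the model load keeps "port already in use" failures fast, while deferring `listen_after_bind()` means a test that connects successfully knows initialization has finished.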