From 525213d2f5da1eaf4b922b6b792cb52b2c613368 Mon Sep 17 00:00:00 2001
From: Pierrick Hymbert <pierrick.hymbert@gmail.com>
Date: Sat, 24 Feb 2024 12:28:55 +0100
Subject: server: init functional tests (#5566)

* server: tests: init scenarios
 - health and slots endpoints
 - completion endpoint
 - OAI compatible chat completion requests w/ and without streaming
 - completion multi users scenario
 - multi users scenario on OAI compatible endpoint with streaming
 - multi users with total number of tokens to predict exceeds the KV Cache size
 - server wrong usage scenario, like in Infinite loop of "context shift" #3969
 - slots shifting
 - continuous batching
 - embeddings endpoint
 - multi users embedding endpoint: Segmentation fault #5655
 - OpenAI-compatible embeddings API
 - tokenize endpoint
 - CORS and api key scenario

* server: CI GitHub workflow


---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 examples/server/server.cpp | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'examples/server/server.cpp')

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 524d0ada..9fb436c2 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1410,11 +1410,6 @@ struct llama_server_context
                 int n_processing_slots = 0;
 
                 for (llama_client_slot &slot: slots) {
-                    if (slot.available()) {
-                        n_idle_slots++;
-                    } else {
-                        n_processing_slots++;
-                    }
                     json slot_data = get_formated_generation(slot);
                     slot_data["id"] = slot.id;
                     slot_data["task_id"] = slot.task_id;
@@ -1429,6 +1424,11 @@ struct llama_server_context
                             {"stopped_limit", slot.stopped_limit},
                             {"stopping_word", slot.stopping_word},
                     };
+                    if (slot_data["state"] == IDLE) {
+                        n_idle_slots++;
+                    } else {
+                        n_processing_slots++;
+                    }
                     slots_data.push_back(slot_data);
                 }
                 LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots);
@@ -2748,19 +2748,6 @@ int main(int argc, char **argv)
         log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
     }
 
-    LOG_INFO("HTTP server listening", log_data);
-    // run the HTTP server in a thread - see comment below
-    std::thread t([&]()
-            {
-                if (!svr.listen_after_bind())
-                {
-                    state.store(SERVER_STATE_ERROR);
-                    return 1;
-                }
-
-                return 0;
-            });
-
     // load the model
     if (!llama.load_model(params))
     {
@@ -3228,6 +3215,19 @@ int main(int argc, char **argv)
     }*/
     //);
 
+    LOG_INFO("HTTP server listening", log_data);
+    // run the HTTP server in a thread - see comment below
+    std::thread t([&]()
+            {
+                if (!svr.listen_after_bind())
+                {
+                    state.store(SERVER_STATE_ERROR);
+                    return 1;
+                }
+
+                return 0;
+            });
+
     llama.queue_tasks.on_new_task(std::bind(
         &llama_server_context::process_single_task, &llama, std::placeholders::_1));
     llama.queue_tasks.on_finish_multitask(std::bind(
-- 
cgit v1.2.3