summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Pierrick Hymbert <pierrick.hymbert@gmail.com> 2024-02-18 17:31:28 +0100
committer GitHub <noreply@github.com> 2024-02-18 18:31:28 +0200
commit e75c6279d1c8e7abb82a331f5de7124eed402de2 (patch)
tree 23890d09bc6e25bad33b008ab571a333e0df1537
parent 36376abe05a12a8cb3af548a4af9b8d0e2e69597 (diff)
server : enhanced health endpoint (#5548)
* server: enrich health endpoint with available slots, return 503 if no slots are available * server: document new status "no slot available" in the README.md
-rw-r--r-- examples/server/README.md 1
-rw-r--r-- examples/server/server.cpp 31
2 files changed, 30 insertions, 2 deletions
diff --git a/examples/server/README.md b/examples/server/README.md
index fe5cd8d5..5e3ae833 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -136,6 +136,7 @@ node index.js
- `{"status": "loading model"}` if the model is still being loaded.
- `{"status": "error"}` if the model failed to load.
- `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
+ - `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available.
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 7aa706e9..8145af86 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2578,8 +2578,35 @@ int main(int argc, char **argv)
server_state current_state = state.load();
switch(current_state) {
case SERVER_STATE_READY:
- res.set_content(R"({"status": "ok"})", "application/json");
- res.status = 200; // HTTP OK
+ if (llama.all_slots_are_idle) {
+ res.set_content(R"({"status": "ok"})", "application/json");
+ res.status = 200; // HTTP OK
+ } else {
+ int available_slots = 0;
+ int processing_slots = 0;
+ for (llama_client_slot & slot : llama.slots) {
+ if (slot.available()) {
+ available_slots++;
+ } else {
+ processing_slots++;
+ }
+ }
+ if (available_slots > 0) {
+ json health = {
+ {"status", "ok"},
+ {"slots_idle", available_slots},
+ {"slots_processing", processing_slots}};
+ res.set_content(health.dump(), "application/json");
+ res.status = 200; // HTTP OK
+ } else {
+ json health = {
+ {"status", "no slot available"},
+ {"slots_idle", available_slots},
+ {"slots_processing", processing_slots}};
+ res.set_content(health.dump(), "application/json");
+ res.status = 503; // HTTP Service Unavailable
+ }
+ }
break;
case SERVER_STATE_LOADING_MODEL:
res.set_content(R"({"status": "loading model"})", "application/json");