author     Pierrick Hymbert <pierrick.hymbert@gmail.com>  2024-02-18 18:39:57 +0100
committer  GitHub <noreply@github.com>                    2024-02-18 19:39:57 +0200
commit     c145f8a132b2fe1d1e65987faddbd9a40bef7a12 (patch)
tree       721465fe3ef2734fcc5b217855fe2f4290510f23 /examples/server/server.cpp
parent     689a091bbe0537ee9abff3e15a1d74f5f3561165 (diff)
server : slots monitoring endpoint (#5550)
Diffstat (limited to 'examples/server/server.cpp')
-rw-r--r--  examples/server/server.cpp  32
1 file changed, 32 insertions(+), 0 deletions(-)
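
This commit adds a read-only GET /slots endpoint that reports the state of every server slot, along with a --slots-endpoint-disable flag to turn it off (the endpoint is enabled by default). As a rough illustration only, not part of this commit, the sketch below queries the new endpoint with cpp-httplib, the same HTTP library the server embeds; it assumes a server running locally on the default port 8080 with the endpoint left enabled.

// slots_probe.cpp -- illustrative client sketch, not part of this commit.
// Assumes default server_params (host 127.0.0.1, port 8080) and that the
// server was started without --slots-endpoint-disable.
#include "httplib.h"

#include <iostream>

int main() {
    httplib::Client cli("127.0.0.1", 8080);

    // GET /slots returns a JSON array with one object per slot:
    // id, task_id, state, prompt, and a nested "next_token" object.
    auto res = cli.Get("/slots");
    if (res && res->status == 200) {
        std::cout << res->body << std::endl;
        return 0;
    }

    std::cerr << "GET /slots failed" << std::endl;
    return 1;
}

Note that each slot object includes the slot's current prompt, so deployments serving multiple users may prefer to start the server with --slots-endpoint-disable.
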
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8145af86..4f2e9c89 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -41,6 +41,7 @@ struct server_params
     int32_t port = 8080;
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
+    bool slots_endpoint = true;
 };
 
 bool server_verbose = false;
@@ -1926,6 +1927,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
printf(" --log-disable disables logging to a file.\n");
+ printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
printf("\n");
printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
printf(" --override-kv KEY=TYPE:VALUE\n");
@@ -2374,6 +2376,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             log_set_target(stdout);
             LOG_INFO("logging to file is disabled.", {});
         }
+        else if (arg == "--slots-endpoint-disable")
+        {
+            sparams.slots_endpoint = false;
+        }
         else if (arg == "--chat-template")
         {
             if (++i >= argc)
@@ -2619,6 +2625,32 @@ int main(int argc, char **argv)
             }
         });
 
+    if (sparams.slots_endpoint) {
+        svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
+            json slots;
+            for (llama_client_slot & slot : llama.slots) {
+                json slot_data = llama.get_formated_generation(slot);
+                slot_data["id"] = slot.id;
+                slot_data["task_id"] = slot.task_id;
+                slot_data["state"] = slot.state;
+                slot_data["prompt"] = slot.prompt;
+                slot_data["next_token"] = {
+                    {"has_next_token", slot.has_next_token},
+                    {"n_remain", slot.n_remaining},
+                    {"num_tokens_predicted", slot.n_decoded},
+                    {"stopped_eos", slot.stopped_eos},
+                    {"stopped_word", slot.stopped_word},
+                    {"stopped_limit", slot.stopped_limit},
+                    {"stopping_word", slot.stopping_word},
+                };
+
+                slots.push_back(slot_data);
+            }
+            res.set_content(slots.dump(), "application/json");
+            res.status = 200; // HTTP OK
+        });
+    }
+
     svr.set_logger(log_server_request);
     svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)