| author | Pierrick Hymbert <pierrick.hymbert@gmail.com> | 2024-02-18 18:39:57 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-02-18 19:39:57 +0200 |
| commit | c145f8a132b2fe1d1e65987faddbd9a40bef7a12 (patch) | |
| tree | 721465fe3ef2734fcc5b217855fe2f4290510f23 /examples/server/server.cpp | |
| parent | 689a091bbe0537ee9abff3e15a1d74f5f3561165 (diff) | |
server : slots monitoring endpoint (#5550)
Diffstat (limited to 'examples/server/server.cpp')
-rw-r--r-- | examples/server/server.cpp | 32 |
1 file changed, 32 insertions, 0 deletions
```diff
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8145af86..4f2e9c89 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -41,6 +41,7 @@ struct server_params
     int32_t port = 8080;
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
+    bool slots_endpoint = true;
 };
 
 bool server_verbose = false;
@@ -1926,6 +1927,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("                            set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
     printf("  --mmproj MMPROJ_FILE      path to a multimodal projector file for LLaVA.\n");
     printf("  --log-disable             disables logging to a file.\n");
+    printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n");
     printf("\n");
     printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
     printf("  --override-kv KEY=TYPE:VALUE\n");
@@ -2374,6 +2376,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             log_set_target(stdout);
             LOG_INFO("logging to file is disabled.", {});
         }
+        else if (arg == "--slots-endpoint-disable")
+        {
+            sparams.slots_endpoint = false;
+        }
         else if (arg == "--chat-template")
         {
             if (++i >= argc)
@@ -2619,6 +2625,32 @@ int main(int argc, char **argv)
         }
     });
 
+    if (sparams.slots_endpoint) {
+        svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
+            json slots;
+            for (llama_client_slot & slot : llama.slots) {
+                json slot_data = llama.get_formated_generation(slot);
+                slot_data["id"] = slot.id;
+                slot_data["task_id"] = slot.task_id;
+                slot_data["state"] = slot.state;
+                slot_data["prompt"] = slot.prompt;
+                slot_data["next_token"] = {
+                    {"has_next_token",       slot.has_next_token},
+                    {"n_remain",             slot.n_remaining},
+                    {"num_tokens_predicted", slot.n_decoded},
+                    {"stopped_eos",          slot.stopped_eos},
+                    {"stopped_word",         slot.stopped_word},
+                    {"stopped_limit",        slot.stopped_limit},
+                    {"stopping_word",        slot.stopping_word},
+                };
+
+                slots.push_back(slot_data);
+            }
+            res.set_content(slots.dump(), "application/json");
+            res.status = 200; // HTTP OK
+        });
+    }
+
     svr.set_logger(log_server_request);
 
     svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
```
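The new `GET /slots` route responds with a JSON array containing one object per slot: the generation settings from `get_formated_generation` plus the `id`, `task_id`, `state`, `prompt`, and `next_token` fields set in the handler above. Below is a minimal client sketch of how the endpoint might be queried; it is an illustration, not part of the commit. It assumes a server listening on `localhost:8080` with the endpoint enabled (the default) and reuses the same cpp-httplib and nlohmann::json libraries the server itself depends on; the key accesses mirror the names assigned in the handler.

```cpp
// Sketch of a /slots monitoring client (assumes localhost:8080; header
// paths may differ depending on where httplib.h and json.hpp live in
// your build).
#include <cstdio>
#include "httplib.h"
#include "json.hpp"

using json = nlohmann::json;

int main() {
    httplib::Client cli("localhost", 8080);

    // Query the slots monitoring endpoint added by this commit.
    auto res = cli.Get("/slots");
    if (!res || res->status != 200) {
        fprintf(stderr, "request to /slots failed\n");
        return 1;
    }

    // The body is a JSON array with one object per slot.
    json slots = json::parse(res->body);
    for (const auto & slot : slots) {
        printf("slot %d: state=%d, predicted=%d tokens\n",
               slot["id"].get<int>(),
               slot["state"].get<int>(),
               slot["next_token"]["num_tokens_predicted"].get<int>());
    }
    return 0;
}
```

Note that the response includes each slot's full `prompt`, so deployments that treat prompts as sensitive can start the server with the new `--slots-endpoint-disable` flag to turn the endpoint off.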