author     Pierrick Hymbert <pierrick.hymbert@gmail.com>  2024-02-18 18:39:57 +0100
committer  GitHub <noreply@github.com>                    2024-02-18 19:39:57 +0200
commit     c145f8a132b2fe1d1e65987faddbd9a40bef7a12 (patch)
tree       721465fe3ef2734fcc5b217855fe2f4290510f23 /examples/server/server.cpp
parent     689a091bbe0537ee9abff3e15a1d74f5f3561165 (diff)
server : slots monitoring endpoint (#5550)
Diffstat (limited to 'examples/server/server.cpp')
-rw-r--r--  examples/server/server.cpp  32
1 file changed, 32 insertions(+), 0 deletions(-)
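
This commit adds a read-only GET /slots endpoint that reports the state of every server slot, along with a --slots-endpoint-disable flag to turn it off (the endpoint is enabled by default). As a rough illustration only, not part of this commit, the sketch below queries the new endpoint with cpp-httplib, the same HTTP library the server embeds; it assumes a server running locally on the default port 8080 with the endpoint left enabled.

// slots_probe.cpp -- illustrative client sketch, not part of this commit.
// Assumes default server_params (host 127.0.0.1, port 8080) and that the
// server was started without --slots-endpoint-disable.
#include "httplib.h"

#include <iostream>

int main() {
    httplib::Client cli("127.0.0.1", 8080);

    // GET /slots returns a JSON array with one object per slot:
    // id, task_id, state, prompt, and a nested "next_token" object.
    auto res = cli.Get("/slots");
    if (res && res->status == 200) {
        std::cout << res->body << std::endl;
        return 0;
    }

    std::cerr << "GET /slots failed" << std::endl;
    return 1;
}

Note that each slot object includes the slot's current prompt, so deployments serving multiple users may prefer to start the server with --slots-endpoint-disable.
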
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8145af86..4f2e9c89 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -41,6 +41,7 @@ struct server_params
     int32_t port = 8080;
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
+    bool slots_endpoint = true;
 };
 
 bool server_verbose = false;
@@ -1926,6 +1927,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
printf(" --log-disable disables logging to a file.\n");
+ printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
printf("\n");
printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
printf(" --override-kv KEY=TYPE:VALUE\n");
@@ -2374,6 +2376,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             log_set_target(stdout);
             LOG_INFO("logging to file is disabled.", {});
         }
+        else if (arg == "--slots-endpoint-disable")
+        {
+            sparams.slots_endpoint = false;
+        }
         else if (arg == "--chat-template")
         {
             if (++i >= argc)
@@ -2619,6 +2625,32 @@ int main(int argc, char **argv)
             }
         });
 
+    if (sparams.slots_endpoint) {
+        svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
+            json slots;
+            for (llama_client_slot & slot : llama.slots) {
+                json slot_data = llama.get_formated_generation(slot);
+                slot_data["id"] = slot.id;
+                slot_data["task_id"] = slot.task_id;
+                slot_data["state"] = slot.state;
+                slot_data["prompt"] = slot.prompt;
+                slot_data["next_token"] = {
+                    {"has_next_token", slot.has_next_token},
+                    {"n_remain", slot.n_remaining},
+                    {"num_tokens_predicted", slot.n_decoded},
+                    {"stopped_eos", slot.stopped_eos},
+                    {"stopped_word", slot.stopped_word},
+                    {"stopped_limit", slot.stopped_limit},
+                    {"stopping_word", slot.stopping_word},
+                };
+
+                slots.push_back(slot_data);
+            }
+            res.set_content(slots.dump(), "application/json");
+            res.status = 200; // HTTP OK
+        });
+    }
+
     svr.set_logger(log_server_request);
     svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)