path: root/examples/server/server.cpp
author     firecoperana <xuqiaowei1124@gmail.com>  2025-06-08 11:38:47 +0000
committer  GitHub <noreply@github.com>  2025-06-08 14:38:47 +0300
commit     df170c83a554df526e25a825389e692669644c85 (patch)
tree       962efa23b4a7f341f5578ddfc8e171ecdbf8f869  /examples/server/server.cpp
parent     9e567e385adacbc4710e94ee7223c5f6b0404699 (diff)
Webui improvement (#481)
* update webui
* add token/s in webui
* add webui files
* fix webui first message disappear in some browser
* add missing html files
---------
Co-authored-by: firecoperana <firecoperana>
Diffstat (limited to 'examples/server/server.cpp')
-rw-r--r--  examples/server/server.cpp  329
1 file changed, 289 insertions(+), 40 deletions(-)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8724e8d8..acd0581e 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1,3 +1,4 @@
+#pragma warning(disable : 4996)
#include "utils.hpp"
#include "common.h"
@@ -15,23 +16,8 @@
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
-
-// auto generated files (update with ./deps.sh)
-#include "colorthemes.css.hpp"
-#include "style.css.hpp"
-#include "theme-beeninorder.css.hpp"
-#include "theme-ketivah.css.hpp"
-#include "theme-mangotango.css.hpp"
-#include "theme-playground.css.hpp"
-#include "theme-polarnight.css.hpp"
-#include "theme-snowstorm.css.hpp"
-#include "index.html.hpp"
-#include "index-new.html.hpp"
-#include "index.js.hpp"
-#include "completion.js.hpp"
-#include "system-prompts.js.hpp"
-#include "prompt-formats.js.hpp"
-#include "json-schema-to-grammar.mjs.hpp"
+#include "index.html.gz.hpp"
+#include "loading.html.hpp"
#include <atomic>
#include <chrono>
@@ -42,12 +28,14 @@
#include <thread>
#include <signal.h>
#include <memory>
+#include <src/llama-impl.h>
using json = nlohmann::ordered_json;
bool server_verbose = false;
bool server_log_json = true;
+
enum stop_type {
STOP_TYPE_FULL,
STOP_TYPE_PARTIAL,
@@ -81,6 +69,44 @@ enum server_task_type {
SERVER_TASK_TYPE_SET_LORA,
};
+
+struct result_timings {
+ int32_t prompt_n = -1;
+ double prompt_ms;
+ double prompt_per_token_ms;
+ double prompt_per_second;
+
+ int32_t predicted_n = -1;
+ double predicted_ms;
+ double predicted_per_token_ms;
+ double predicted_per_second;
+
+ // Optional speculative metrics - only included when > 0
+ int32_t draft_n = 0;
+ int32_t draft_n_accepted = 0;
+
+ json to_json() const {
+ json base = {
+ {"prompt_n", prompt_n},
+ {"prompt_ms", prompt_ms},
+ {"prompt_per_token_ms", prompt_per_token_ms},
+ {"prompt_per_second", prompt_per_second},
+
+ {"predicted_n", predicted_n},
+ {"predicted_ms", predicted_ms},
+ {"predicted_per_token_ms", predicted_per_token_ms},
+ {"predicted_per_second", predicted_per_second},
+ };
+
+ if (draft_n > 0) {
+ base["draft_n"] = draft_n;
+ base["draft_n_accepted"] = draft_n_accepted;
+ }
+
+ return base;
+ }
+};
+
struct server_task {
int id = -1; // to be filled by server_queue
int id_multi = -1;
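Note (not part of the patch): a minimal sketch of what the new result_timings::to_json() produces; the numbers below are invented, and the draft_* keys stay absent because draft_n is 0.

    // Illustrative only: fill a result_timings by hand and serialize it.
    result_timings t;
    t.prompt_n               = 34;
    t.prompt_ms              = 120.0;
    t.prompt_per_token_ms    = t.prompt_ms / t.prompt_n;             // ~3.5 ms per prompt token
    t.prompt_per_second      = 1e3 / t.prompt_ms * t.prompt_n;       // ~283 tokens/s
    t.predicted_n            = 57;
    t.predicted_ms           = 810.0;
    t.predicted_per_token_ms = t.predicted_ms / t.predicted_n;       // ~14.2 ms per generated token
    t.predicted_per_second   = 1e3 / t.predicted_ms * t.predicted_n; // ~70.4 tokens/s
    json j = t.to_json(); // eight keys, prompt_n ... predicted_per_second; no draft_n / draft_n_accepted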
@@ -101,8 +127,13 @@ struct server_task_result {
bool stop;
bool error;
+ result_timings timings;
+
};
+std::unordered_map<int, server_task_result > server_task_result_dict = {};
+
+
struct server_task_multi {
int id = -1;
@@ -120,6 +151,7 @@ struct slot_params {
std::vector<std::string> antiprompt;
+ bool timings_per_token = false;
json input_prefix;
json input_suffix;
};
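Note (not part of the patch): timings_per_token is read from the request body (see the json_value(data, "timings_per_token", false) line further down), so clients opt in per request. A sketch of such a request body, built with the same nlohmann::json type the server uses; the prompt and sampling values are invented.

    // Hypothetical /completion request body; only "timings_per_token" is new in this patch.
    json body = {
        {"prompt",            "Hello"},
        {"n_predict",         16},
        {"stream",            true},
        {"timings_per_token", true}   // ask the server to attach timing info to each streamed result
    };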
@@ -262,6 +294,27 @@ struct server_slot {
};
}
+ result_timings get_timings() const {
+ result_timings timings;
+ timings.prompt_n = n_prompt_tokens_processed;
+ timings.prompt_ms = t_prompt_processing;
+ timings.prompt_per_token_ms = t_prompt_processing / n_prompt_tokens_processed;
+ timings.prompt_per_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
+
+
+ timings.predicted_n = n_decoded;
+ timings.predicted_ms = (ggml_time_us() - t_start_generation) / 1e3;
+ timings.predicted_per_token_ms = t_token_generation / n_decoded;
+ timings.predicted_per_second = 1e3 / t_token_generation * n_decoded;
+
+ //// Add speculative metrics
+ //if (n_draft_total > 0) {
+ // timings.draft_n = n_draft_total;
+ // timings.draft_n_accepted = n_draft_accepted;
+ //}
+
+ return timings;
+ }
size_t find_stopping_strings(const std::string & text, const size_t last_token_size, const stop_type type) {
size_t stop_pos = std::string::npos;
@@ -903,7 +956,7 @@ struct server_context {
slot.oaicompat = false;
slot.oaicompat_model = "";
}
-
+ slot.params.timings_per_token = json_value(data, "timings_per_token", false);
slot.params.stream = json_value(data, "stream", false);
slot.params.cache_prompt = json_value(data, "cache_prompt", true);
slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
@@ -1423,8 +1476,13 @@ struct server_context {
res.data["oaicompat_token_ctr"] = slot.n_decoded;
res.data["model"] = slot.oaicompat_model;
}
-
- queue_results.send(res);
+ // populate timings if this is final response or timings_per_token is enabled
+ if (slot.params.timings_per_token) {
+ //res.data["timings"] = slot.get_formated_timings();
+ res.timings = slot.get_timings();
+ }
+ server_task_result_dict[slot.id_task] = res;
+ queue_results.send(std::move(res));
}
void send_final_response(const server_slot & slot) {
@@ -2465,6 +2523,188 @@ struct server_context {
}
};
+static json format_final_response_oaicompat(const json& request, json result, const std::string& completion_id, bool streaming = false) {
+ bool stopped_word = result.count("stopped_word") != 0;
+ bool stopped_eos = json_value(result, "stopped_eos", false);
+ int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
+ int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
+ std::string content = json_value(result, "content", std::string(""));
+
+ std::string finish_reason = "length";
+ if (stopped_word || stopped_eos) {
+ finish_reason = "stop";
+ }
+
+ json choices =
+ streaming ? json::array({ json{{"finish_reason", finish_reason},
+ {"index", 0},
+ {"delta", json::object()}} })
+ : json::array({ json{{"finish_reason", finish_reason},
+ {"index", 0},
+ {"message", json{{"content", content},
+ {"role", "assistant"}}}} });
+
+ std::time_t t = std::time(0);
+
+ json res = json{
+ {"choices", choices},
+ {"created", t},
+ {"model",
+ json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+ {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
+ {"usage", json {
+ {"completion_tokens", num_tokens_predicted},
+ {"prompt_tokens", num_prompt_tokens},
+ {"total_tokens", num_tokens_predicted + num_prompt_tokens}
+ }},
+ {"id", completion_id}
+ };
+
+ if (server_verbose) {
+ res["__verbose"] = result;
+ }
+
+ if (result.contains("completion_probabilities")) {
+ res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
+ }
+
+ return res;
+}
+
+// the return value is a vector because there is one case where we might need to generate two responses
+static std::vector<json> format_partial_response_oaicompat(server_task_result task_result, const std::string& completion_id) {
+ json result = task_result.data;
+ if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
+ return std::vector<json>({ result });
+ }
+
+ bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
+ std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
+
+ bool stopped_word = json_value(result, "stopped_word", false);
+ bool stopped_eos = json_value(result, "stopped_eos", false);
+ bool stopped_limit = json_value(result, "stopped_limit", false);
+ std::string content = json_value(result, "content", std::string(""));
+
+ std::string finish_reason;
+ if (stopped_word || stopped_eos) {
+ finish_reason = "stop";
+ }
+ if (stopped_limit) {
+ finish_reason = "length";
+ }
+
+ std::time_t t = std::time(0);
+
+ json choices;
+
+ if (!finish_reason.empty()) {
+ choices = json::array({ json{{"finish_reason", finish_reason},
+ {"index", 0},
+ {"delta", json::object()}} });
+ }
+ else {
+ if (first) {
+ if (content.empty()) {
+ choices = json::array({ json{{"finish_reason", nullptr},
+ {"index", 0},
+ {"delta", json{{"role", "assistant"}}}} });
+ }
+ else {
+ // We have to send this as two updates to conform to openai behavior
+ json initial_ret = json{ {"choices", json::array({json{
+ {"finish_reason", nullptr},
+ {"index", 0},
+ {"delta", json{
+ {"role", "assistant"}
+ }}}})},
+ {"created", t},
+ {"id", completion_id},
+ {"model", modelname},
+ {"object", "chat.completion.chunk"} };
+
+ json second_ret = json{
+ {"choices", json::array({json{{"finish_reason", nullptr},
+ {"index", 0},
+ {"delta", json{
+ {"content", content}}}
+ }})},
+ {"created", t},
+ {"id", completion_id},
+ {"model", modelname},
+ {"object", "chat.completion.chunk"} };
+
+ return std::vector<json>({ initial_ret, second_ret });
+ }
+ }
+ else {
+ // Some idiosyncrasy in task processing logic makes several trailing calls
+ // with empty content, which we ignore here at the callee site.
+ if (content.empty()) {
+ return std::vector<json>({ json::object() });
+ }
+
+ choices = json::array({ json{
+ {"finish_reason", nullptr},
+ {"index", 0},
+ {"delta",
+ json{
+ {"content", content},
+ }},
+ } });
+ }
+ }
+
+ json ret = json{
+ {"choices", choices},
+ {"created", t},
+ {"id", completion_id},
+ {"model", modelname},
+ {"object", "chat.completion.chunk"}
+ };
+ if (server_task_result_dict.count(task_result.id) > 0)
+ {
+ ret.push_back({ "timings", server_task_result_dict[task_result.id].timings.to_json() });
+ }
+
+ //
+ if (!finish_reason.empty()) {
+ int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
+ int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
+ ret.push_back({ "usage", json {
+ {"completion_tokens", num_tokens_predicted},
+ {"prompt_tokens", num_prompt_tokens},
+ {"total_tokens", num_tokens_predicted + num_prompt_tokens}
+ } });
+ }
+
+ return std::vector<json>({ ret });
+}
+
+
+static json format_embeddings_response_oaicompat(const json& request, const json& embeddings) {
+ json data = json::array();
+ int i = 0;
+ for (auto& elem : embeddings) {
+ data.push_back(json{
+ {"embedding", json_value(elem, "embedding", json::array())},
+ {"index", i++},
+ {"object", "embedding"}
+ });
+ }
+
+ json res = json{
+ {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+ {"object", "list"},
+ {"usage", json {
+ {"prompt_tokens", 0},
+ {"total_tokens", 0}
+ }},
+ {"data", data}
+ };
+
+ return res;
+}
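Note (not part of the patch): with timings_per_token enabled, each streamed chunk built by format_partial_response_oaicompat above gains a top-level "timings" object looked up from server_task_result_dict, and the final chunk also carries "usage". A rough sketch of one such chunk, with invented values:

    // Illustrative only: approximate shape of a single streamed chunk.
    json chunk = {
        {"object",  "chat.completion.chunk"},
        {"id",      "chatcmpl-xyz"},      // placeholder completion id
        {"created", 1717848000},
        {"model",   "some-model"},        // placeholder model name
        {"choices", json::array({ json{{"finish_reason", nullptr}, {"index", 0},
                                        {"delta", json{{"content", " world"}}}} })},
        {"timings", json{{"prompt_n", 34}, {"predicted_n", 12}, {"predicted_per_second", 70.4}}}
    };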
static void log_server_request(const httplib::Request & req, const httplib::Response & res) {
// skip GH copilot requests when using default port
if (req.path == "/v1/health" || req.path == "/v1/completions") {
@@ -3121,6 +3361,7 @@ int main(int argc, char ** argv) {
res.set_content(models.dump(), "application/json; charset=utf-8");
};
+
const auto handle_chat_completions = [&ctx_server, &params, &res_error](const httplib::Request & req, httplib::Response & res) {
if (ctx_server.params.embedding) {
res_error(res, format_error_response("This server does not support chat completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
@@ -3152,7 +3393,7 @@ int main(int argc, char ** argv) {
while (true) {
server_task_result result = ctx_server.queue_results.recv(id_task);
if (!result.error) {
- std::vector<json> result_array = format_partial_response_oaicompat(result.data, completion_id);
+ std::vector<json> result_array = format_partial_response_oaicompat(result, completion_id);
for (auto it = result_array.begin(); it != result_array.end(); ++it) {
if (!it->empty()) {
@@ -3449,25 +3690,33 @@ int main(int argc, char ** argv) {
svr->set_base_dir(params.public_path);
}
- // using embedded static files
- svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
- svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
- svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
- svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
-
- // add new-ui files
- svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
- svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
- svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8"));
- svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
- svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
- svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
- svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
- svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
- svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
- svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
- svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
-
+ {
+ // register static assets routes
+ if (!params.public_path.empty()) {
+ // Set the base directory for serving static files
+ bool is_found = svr->set_mount_point("/", params.public_path);
+ if (!is_found) {
+ GGML_ABORT("%s: static assets path not found: %s\n", __func__, params.public_path.c_str());
+ return 1;
+ }
+ }
+ else {
+ // using embedded static index.html
+ svr->Get("/", [](const httplib::Request& req, httplib::Response& res) {
+ if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) {
+ res.set_content("Error: gzip is not supported by this browser", "text/plain");
+ }
+ else {
+ res.set_header("Content-Encoding", "gzip");
+ // COEP and COOP headers, required by pyodide (python interpreter)
+ res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
+ res.set_header("Cross-Origin-Opener-Policy", "same-origin");
+ res.set_content(reinterpret_cast<const char*>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8");
+ }
+ return false;
+ });
+ }
+ }
// register API routes
svr->Get ("/health", handle_health);
svr->Get ("/metrics", handle_metrics);
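Note (not part of the patch): the web UI is now embedded as a single gzip-compressed index.html, so "/" only serves it to clients that advertise gzip support. A minimal sketch of checking that from C++ with cpp-httplib, the same library the server uses; host and port are assumed defaults.

    // Illustrative only: fetch the embedded UI while explicitly advertising gzip support.
    #include "httplib.h"

    int check_webui() {
        httplib::Client cli("http://localhost:8080");            // assumed default address
        auto res = cli.Get("/", {{"Accept-Encoding", "gzip"}});  // without this the server replies with a plain-text error
        return (res && res->status == 200) ? 0 : 1;
    }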