author | Pierrick Hymbert <pierrick.hymbert@gmail.com> | 2024-02-25 13:50:32 +0100
committer | GitHub <noreply@github.com> | 2024-02-25 13:50:32 +0100
commit | 930b1780269a69948d106e2d1b838ab7661f679a (patch)
tree | cabd341427318aeb2c9c5e91d24c7eb9505c883d /examples/server/server.cpp
parent | d52d7819b8ced70c642a88a59da8c78208dc58ec (diff)
server: logs - unified format and --log-format option (#5700)
* server: logs - always use JSON logger, add thread_id to the message, log task_id and slot_id
* server : skip GH copilot requests from logging
* server : change message format of server_log()
* server : no need to repeat log in comment
* server : log style consistency
* server : fix compile warning
* server : fix tests regex patterns on M2 Ultra
* server: logs: PR feedback on log level
* server: logs: allow choosing the log format: json or plain text
* server: tests: output server logs in text
* server: logs switch init logs to server logs macro
* server: logs: ensure json values do not raise errors
* server: logs: shorten level VERBOSE to VERB to cap level names at 4 chars
* server: logs: lower case, consistent with other log messages
* server: logs avoid static in general
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* server: logs PR feedback: change text log format to: LEVEL [function_name] message | additional=data
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
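The mechanism behind the unified format is a single structured sink: every LOG_INFO / LOG_ERROR / LOG_VERBOSE call hands a message plus key/value pairs to one logging function, and the --log-format flag only changes how that function serializes them, either as one JSON object per line or as the text layout `LEVEL [function_name] message | additional=data` described above. Below is a minimal sketch of that idea, assuming the nlohmann::json header already used by examples/server; the patch's real server_log() signature and JSON field names may differ from this illustration.

```cpp
#include <cstdio>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

// toggled by --log-format; json stays the default per this patch
static bool server_log_json = true;

// illustrative stand-in for the logger behind LOG_INFO/LOG_ERROR/LOG_VERBOSE
static void server_log(const char * level, const char * function, int line,
                       const char * message, const json & extra) {
    if (server_log_json) {
        // one self-contained JSON object per line
        json log = {
            {"level",    level},
            {"function", function},
            {"line",     line},
            {"msg",      message},
        };
        log.merge_patch(extra); // attach the call site's key/value pairs
        printf("%s\n", log.dump().c_str());
    } else {
        // plain text: LEVEL [function_name] message | additional=data
        std::string text = std::string(level) + " [" + function + "] " + message + " |";
        for (const auto & el : extra.items()) {
            text += " " + el.key() + "=" + el.value().dump();
        }
        printf("%s\n", text.c_str());
    }
    fflush(stdout);
}

int main() {
    server_log("INFO", "update_slots", 1234, "slot is processing task",
               {{"slot_id", 0}, {"task_id", 42}});
    server_log_json = false; // as if --log-format text were passed
    server_log("INFO", "update_slots", 1234, "slot is processing task",
               {{"slot_id", 0}, {"task_id", 42}});
}
```

Keeping the pairs structured all the way to the sink is what lets one call site serve both formats; the text branch simply flattens the same pairs into key=value tokens after the `|` separator.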
Diffstat (limited to 'examples/server/server.cpp')
-rw-r--r-- | examples/server/server.cpp | 218 |
1 file changed, 169 insertions(+), 49 deletions(-)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 81149591..d970202d 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -47,6 +47,7 @@ struct server_params
 };
 
 bool server_verbose = false;
+bool server_log_json = true;
 
 static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
 {
@@ -302,12 +303,43 @@ struct llama_client_slot
     }
 
     void print_timings() const {
-        LOG_TEE("\n");
-        LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
-        LOG_TEE("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, t_token_generation, n_decoded,t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded);
-        LOG_TEE("%s:       total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation);
+        char buffer[512];
+        double t_token = t_prompt_processing / num_prompt_tokens_processed;
+        double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed;
+        sprintf(buffer, "prompt eval time     = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
+                t_prompt_processing, num_prompt_tokens_processed,
+                t_token, n_tokens_second);
+        LOG_INFO(buffer, {
+            {"slot_id", id},
+            {"task_id", task_id},
+            {"t_prompt_processing", t_prompt_processing},
+            {"num_prompt_tokens_processed", num_prompt_tokens_processed},
+            {"t_token", t_token},
+            {"n_tokens_second", n_tokens_second},
+        });
+
+        t_token = t_token_generation / n_decoded;
+        n_tokens_second = 1e3 / t_token_generation * n_decoded;
+        sprintf(buffer, "generation eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)",
+                t_token_generation, n_decoded,
+                t_token, n_tokens_second);
+        LOG_INFO(buffer, {
+            {"slot_id", id},
+            {"task_id", task_id},
+            {"t_token_generation", t_token_generation},
+            {"n_decoded", n_decoded},
+            {"t_token", t_token},
+            {"n_tokens_second", n_tokens_second},
+        });
+
+        sprintf(buffer, "          total time = %10.2f ms", t_prompt_processing + t_token_generation);
+        LOG_INFO(buffer, {
+            {"slot_id", id},
+            {"task_id", task_id},
+            {"t_prompt_processing", t_prompt_processing},
+            {"t_token_generation", t_token_generation},
+            {"t_total", t_prompt_processing + t_token_generation},
+        });
     }
 };
 
@@ -399,7 +431,7 @@ struct llama_server_context
         params = params_;
         if (!params.mmproj.empty()) {
             multimodal = true;
-            LOG_TEE("Multi Modal Mode Enabled");
+            LOG_INFO("Multi Modal Mode Enabled", {});
             clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
             if(clp_ctx == nullptr) {
                 LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
@@ -452,7 +484,7 @@ struct llama_server_context
 
         const int32_t n_ctx_slot = n_ctx / params.n_parallel;
 
-        LOG_TEE("Available slots:\n");
+        LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
         for (int i = 0; i < params.n_parallel; i++)
         {
             llama_client_slot slot;
@@ -461,7 +493,10 @@ struct llama_server_context
             slot.n_ctx = n_ctx_slot;
             slot.n_predict = params.n_predict;
 
-            LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
+            LOG_INFO("new slot", {
+                {"slot_id", slot.id},
+                {"n_ctx_slot", slot.n_ctx}
+            });
 
             const int ga_n = params.grp_attn_n;
             const int ga_w = params.grp_attn_w;
@@ -471,7 +506,12 @@ struct llama_server_context
                 GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
                 //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
                 //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
-                LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
+
+                LOG_INFO("slot self-extend", {
+                    {"slot_id", slot.id},
+                    {"ga_n", ga_n},
+                    {"ga_w", ga_w}
+                });
             }
 
             slot.ga_i = 0;
@@ -765,10 +805,16 @@ struct llama_server_context
                     img_sl.img_data = clip_image_u8_init();
                     if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
                     {
-                        LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
+                        LOG_ERROR("failed to load image", {
+                            {"slot_id", slot->id},
+                            {"img_sl_id", img_sl.id}
+                        });
                         return false;
                     }
-                    LOG_TEE("slot %i - loaded image\n", slot->id);
+                    LOG_VERBOSE("image loaded", {
+                        {"slot_id", slot->id},
+                        {"img_sl_id", img_sl.id}
+                    });
                     img_sl.request_encode_image = true;
                     slot->images.push_back(img_sl);
                 }
@@ -828,7 +874,10 @@ struct llama_server_context
 
         all_slots_are_idle = false;
 
-        LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id);
+        LOG_INFO("slot is processing task", {
+            {"slot_id", slot->id},
+            {"task_id", slot->task_id},
+        });
 
         return true;
     }
@@ -1391,7 +1440,7 @@ struct llama_server_context
                 if (slot == nullptr)
                 {
                     // if no slot is available, we defer this task for processing later
-                    LOG_VERBOSE("no slot is available", {});
+                    LOG_VERBOSE("no slot is available", {{"task_id", task.id}});
                     queue_tasks.defer(task);
                     break;
                 }
@@ -1467,7 +1516,17 @@ struct llama_server_context
                     }
                     slots_data.push_back(slot_data);
                 }
-                LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots);
+                LOG_INFO("slot data", {
+                    {"task_id", task.id},
+                    {"n_idle_slots", n_idle_slots},
+                    {"n_processing_slots", n_processing_slots}
+                });
+                LOG_VERBOSE("slot data", {
+                    {"task_id", task.id},
+                    {"n_idle_slots", n_idle_slots},
+                    {"n_processing_slots", n_processing_slots},
+                    {"slots", slots_data}
+                });
                 task_result res;
                 res.id = task.id;
                 res.multitask_id = task.multitask_id;
@@ -1519,7 +1578,7 @@ struct llama_server_context
     bool update_slots() {
        if (system_need_update)
        {
-            LOG_TEE("updating system prompt\n");
+            LOG_INFO("updating system prompt", {});
            update_system_prompt();
        }
 
@@ -1529,12 +1588,13 @@ struct llama_server_context
        {
            if (system_prompt.empty() && clean_kv_cache)
            {
-                LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
+                LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {});
                kv_cache_clear();
            }
            return true;
        }
 
+        LOG_VERBOSE("posting NEXT_RESPONSE", {});
        task_server task;
        task.type = TASK_TYPE_NEXT_RESPONSE;
        task.target_id = -1;
@@ -1548,10 +1608,20 @@ struct llama_server_context
                {
                    // Shift context
                    const int n_keep    = slot.params.n_keep + add_bos_token;
-                    const int n_left    = system_tokens.size() + slot.n_past - n_keep;
+                    const int n_left    = (int) system_tokens.size() + slot.n_past - n_keep;
                    const int n_discard = n_left / 2;
 
-                    LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard);
+                    LOG_INFO("slot context shift", {
+                        {"slot_id", slot.id},
+                        {"task_id", slot.task_id},
+                        {"n_keep", n_keep},
+                        {"n_left", n_left},
+                        {"n_discard", n_discard},
+                        {"n_ctx", n_ctx},
+                        {"n_past", slot.n_past},
+                        {"n_system_tokens", system_tokens.size()},
+                        {"n_cache_tokens", slot.cache_tokens.size()}
+                    });
                    llama_kv_cache_seq_rm   (ctx, slot.id, n_keep            , n_keep + n_discard);
                    llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
@@ -1565,17 +1635,12 @@ struct llama_server_context
                    slot.n_past -= n_discard;
 
                    slot.truncated = true;
-
-                    LOG_VERBOSE("context shift", {
-                        { "n_ctx", n_ctx },
-                        { "n_keep", n_keep },
-                        { "n_left", n_left },
-                    });
                }
            }
        }
 
        // decode any currently ongoing sequences
+        LOG_VERBOSE("decoding ongoing sequences", {});
        for (auto & slot : slots)
        {
            // release the slot
@@ -1585,7 +1650,15 @@ struct llama_server_context
                slot.command = NONE;
                slot.t_last_used = ggml_time_us();
 
-                LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+                LOG_INFO("slot released", {
+                    {"slot_id", slot.id},
+                    {"task_id", slot.task_id},
+                    {"n_ctx", n_ctx},
+                    {"n_past", slot.n_past},
+                    {"n_system_tokens", system_tokens.size()},
+                    {"n_cache_tokens", slot.cache_tokens.size()},
+                    {"truncated", slot.truncated}
+                });
                queue_tasks.notify_slot_changed();
 
                continue;
@@ -1733,7 +1806,12 @@ struct llama_server_context
                        slot.ga_i = ga_i;
                    }
 
-                    LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
+                    LOG_INFO("slot progression", {
+                        { "slot_id", slot.id },
+                        { "task_id", slot.task_id },
+                        { "n_past", slot.n_past },
+                        { "num_prompt_tokens_processed", slot.num_prompt_tokens_processed }
+                    });
                }
 
                slot.cache_tokens = prompt_tokens;
@@ -1741,7 +1819,10 @@ struct llama_server_context
                if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
                {
                    // we have to evaluate at least 1 token to generate logits.
-                    LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
+                    LOG_INFO("we have to evaluate at least 1 token to generate logits", {
+                        { "slot_id", slot.id },
+                        { "task_id", slot.task_id }
+                    });
                    slot.n_past--;
                    if (slot.ga_i > 0)
                    {
@@ -1749,9 +1830,13 @@ struct llama_server_context
                    }
                }
 
-                LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
-
-                llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
+                int p0 = (int) system_tokens.size() + slot.n_past;
+                LOG_INFO("kv cache rm [p0, end)", {
+                    { "slot_id", slot.id },
+                    { "task_id", slot.task_id },
+                    { "p0", p0 }
+                });
+                llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
 
                LOG_VERBOSE("prompt ingested", {
                    {"n_past", slot.n_past},
@@ -1786,7 +1871,13 @@ struct llama_server_context
 
                if (has_images && !ingest_images(slot, n_batch))
                {
-                    LOG_TEE("failed processing images\n");
+                    LOG_ERROR("failed processing images", {
+                        "slot_id", slot.id,
+                        "task_id", slot.task_id,
+                    });
+                    // FIXME @phymbert: to be properly tested
+                    // early returning without changing the slot state will block the slot for ever
+                    // no one at the moment is checking the return value
                    return false;
                }
@@ -1928,6 +2019,8 @@ struct llama_server_context
                    slot.i_batch = -1;
                }
            }
+
+        LOG_VERBOSE("slots updated", {});
        return true;
    }
@@ -2005,6 +2098,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("  -ctv TYPE, --cache-type-v TYPE\n");
    printf("                            KV cache data type for V (default: f16)\n");
    printf("  --mmproj MMPROJ_FILE      path to a multimodal projector file for LLaVA.\n");
+    printf("  --log-format              log output format: json or text (default: json)\n");
    printf("  --log-disable             disables logging to a file.\n");
    printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n");
    printf("  --metrics                 enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
@@ -2458,6 +2552,27 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            params.mmproj = argv[i];
        }
+        else if (arg == "--log-format")
+        {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+            if (std::strcmp(argv[i], "json") == 0)
+            {
+                server_log_json = true;
+            }
+            else if (std::strcmp(argv[i], "text") == 0)
+            {
+                server_log_json = false;
+            }
+            else
+            {
+                invalid_param = true;
+                break;
+            }
+        }
        else if (arg == "--log-disable")
        {
            log_set_target(stdout);
@@ -2571,32 +2686,40 @@ static json format_partial_response(
 
 static json format_tokenizer_response(const std::vector<llama_token> &tokens)
 {
-    return json{
-        {"tokens", tokens}};
+    return json {
+        {"tokens", tokens}
+    };
 }
 
 static json format_detokenized_response(std::string content)
 {
-    return json{
-        {"content", content}};
+    return json {
+        {"content", content}
+    };
 }
 
+
 static void log_server_request(const httplib::Request &req, const httplib::Response &res)
 {
+    // skip GH copilot requests when using default port
+    if (req.path == "/v1/health" || req.path == "/v1/completions")
+    {
+        return;
+    }
+
    LOG_INFO("request", {
-        {"remote_addr", req.remote_addr},
-        {"remote_port", req.remote_port},
-        {"status", res.status},
-        {"method", req.method},
-        {"path", req.path},
-        {"params", req.params},
-    });
+            {"remote_addr", req.remote_addr},
+            {"remote_port", req.remote_port},
+            {"status", res.status},
+            {"method", req.method},
+            {"path", req.path},
+            {"params", req.params},
+        });
 
    LOG_VERBOSE("request", {
-        {"request", req.body},
-        {"response", res.body},
-    });
+            {"request", req.body},
+            {"response", res.body},
+        });
 }
 
 struct token_translator
@@ -2873,9 +2996,6 @@ int main(int argc, char **argv)
    // Set the base directory for serving static files
    svr.set_base_dir(sparams.public_path);
 
-    // to make it ctrl+clickable:
-    LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
-
    std::unordered_map<std::string, std::string> log_data;
    log_data["hostname"] = sparams.hostname;
    log_data["port"] = std::to_string(sparams.port);