author | Pierrick Hymbert <pierrick.hymbert@gmail.com> | 2024-03-02 22:00:14 +0100
---|---|---
committer | GitHub <noreply@github.com> | 2024-03-02 22:00:14 +0100
commit | 9731134296af3a6839cd682e51d9c2109a871de5 (patch) |
tree | 882db21742d552ee948d1b5db013f02bf35ff8fa /examples/server/server.cpp |
parent | 4a6e2d6142ab815c964924896891e9ab3e050632 (diff) |
server: tests: passkey challenge / self-extend with context shift demo (#5832)
* server: tests: add models endpoint scenario
* server: /v1/models add some metadata
* server: tests: add debug field in context before scenario
* server: tests: download model from HF, add batch size
* server: tests: add passkey test (see the prompt sketch after this list)
* server: tests: add group attention params
* server: do not truncate prompt tokens if self-extend through group attention is enabled
* server: logs: do not truncate log values
* server: tests - passkey - first good working value of nga
* server: tests: fix server timeout
* server: tests: fix passkey, add doc, fix regex content matching, fix timeout
* server: tests: fix regex content matching
* server: tests: schedule slow tests on master
* server: metrics: fix when no prompt processed
* server: tests: self-extend add llama-2-7B and Mixtral-8x7B-v0.1
* server: tests: increase timeout for completion
* server: tests: keep only the PHI-2 test
* server: tests: passkey add a negative test
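For context on the passkey scenario added above: a passkey challenge buries a small secret (typically a number) inside long filler text and then asks the model to recall it, which only succeeds if the effective context window actually covers the whole prompt — here, via group-attention self-extend. Below is a minimal illustrative prompt builder; the wording, the filler sentence, and the helper name build_passkey_prompt are assumptions for illustration, not the test's actual fixture.

#include <sstream>
#include <string>

// Hypothetical sketch of a passkey prompt: hide the key mid-way through
// n_junk blocks of filler, then ask for it back.
static std::string build_passkey_prompt(int passkey, int n_junk) {
    std::ostringstream prompt;
    prompt << "There is important info hidden inside a lot of irrelevant text. "
              "Find it and memorize it.\n";
    for (int i = 0; i < n_junk; i++) {
        if (i == n_junk / 2) {
            // the needle: stated twice so recall does not hinge on a single token
            prompt << "The pass key is " << passkey
                   << ". Remember it. The pass key is " << passkey << ".\n";
        }
        // the haystack: repeated irrelevant filler
        prompt << "The grass is green. The sky is blue. The sun is yellow.\n";
    }
    prompt << "What is the pass key?";
    return prompt.str();
}

With a large n_junk such a prompt exceeds the slot's context, which is exactly the case the "do not truncate when self-extend is enabled" guard in the diff below has to handle.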
Diffstat (limited to 'examples/server/server.cpp')
-rw-r--r-- | examples/server/server.cpp | 46 |
1 file changed, 31 insertions, 15 deletions
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 2b2f4a0f..52daf9e7 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -441,8 +441,8 @@ struct llama_server_context
     const int ga_w = params.grp_attn_w;

     if (ga_n != 1) {
-        GGML_ASSERT(ga_n > 0                    && "ga_n must be positive");                       // NOLINT
-        GGML_ASSERT(ga_w % ga_n == 0            && "ga_w must be a multiple of ga_n");             // NOLINT
+        GGML_ASSERT(ga_n > 0                                         && "ga_n must be positive");                         // NOLINT
+        GGML_ASSERT(ga_w % ga_n == 0                                 && "ga_w must be a multiple of ga_n");               // NOLINT
         //GGML_ASSERT(n_ctx_train % ga_w == 0   && "n_ctx_train must be a multiple of ga_w");      // NOLINT
         //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
@@ -1709,8 +1709,8 @@ struct llama_server_context
                 }

                 slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);

-                // if input prompt is too big, truncate it
-                if (slot.n_prompt_tokens >= slot.n_ctx)
+                // if input prompt is too big, truncate it, if group attention self-extend is disabled
+                if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx)
                 {
                     const int n_left = slot.n_ctx - slot.params.n_keep;
                     const int n_block_size = n_left / 2;
@@ -1785,9 +1785,11 @@ struct llama_server_context
                 }

                 LOG_INFO("slot progression", {
-                    { "slot_id",    slot.id },
-                    { "task_id",    slot.task_id },
-                    { "n_past",     slot.n_past },
+                    { "slot_id",                   slot.id },
+                    { "task_id",                   slot.task_id },
+                    { "n_past",                    slot.n_past },
+                    { "n_past_se",                 slot.n_past_se },
+                    { "ga_i",                      slot.ga_i },
                     { "n_prompt_tokens_processed", slot.n_prompt_tokens_processed }
                 });
             }
@@ -2001,6 +2003,17 @@ struct llama_server_context
         LOG_VERBOSE("slots updated", {});
         return true;
     }
+
+    json model_meta() {
+        return json{
+            {"vocab_type",  llama_vocab_type(model)},
+            {"n_vocab",     llama_n_vocab(model)},
+            {"n_ctx_train", llama_n_ctx_train(model)},
+            {"n_embd",      llama_n_embd(model)},
+            {"n_params",    llama_model_n_params(model)},
+            {"size",        llama_model_size(model)},
+        };
+    }
 };

 static void server_print_usage(const char *argv0, const gpt_params &params,
@@ -2911,9 +2924,10 @@ int main(int argc, char **argv)
             for (const auto& metric_def : metrics_def) {
                 std::string name = metric_def["name"];
                 std::string help = metric_def["help"];
-                prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
-                           << "# TYPE llamacpp:" << name << " " << type << "\n"
-                           << "llamacpp:" << name << " " << metric_def["value"] << "\n";
+                auto value = json_value(metric_def, "value", 0);
+                prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
+                           << "# TYPE llamacpp:" << name << " " << type << "\n"
+                           << "llamacpp:" << name << " " << value << "\n";
             }
         }
@@ -2994,6 +3008,7 @@ int main(int argc, char **argv)
         state.store(SERVER_STATE_READY);
         LOG_INFO("model loaded", {});
     }
+    const auto model_meta = llama.model_meta();

     if (sparams.chat_template.empty()) { // custom chat template is not supplied
         // check if the template comes with the model is supported by us
@@ -3143,7 +3158,7 @@ int main(int argc, char **argv)
             }
         });

-    svr.Get("/v1/models", [&params](const httplib::Request& req, httplib::Response& res)
+    svr.Get("/v1/models", [&params, &model_meta](const httplib::Request& req, httplib::Response& res)
             {
                 res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
                 std::time_t t = std::time(0);
@@ -3152,10 +3167,11 @@
                     {"object", "list"},
                     {"data", {
                         {
-                            {"id",       params.model_alias},
-                            {"object",   "model"},
-                            {"created",  t},
-                            {"owned_by", "llamacpp"}
+                            {"id",       params.model_alias},
+                            {"object",   "model"},
+                            {"created",  t},
+                            {"owned_by", "llamacpp"},
+                            {"meta",     model_meta}
                         },
                     }}
                 };