diff options
Diffstat (limited to 'examples/server/server.cpp')
-rw-r--r-- | examples/server/server.cpp | 375 |
1 files changed, 364 insertions, 11 deletions
diff --git a/examples/server/server.cpp b/examples/server/server.cpp index be23ad16..50f124b1 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -29,6 +29,8 @@ #define SERVER_VERBOSE 1 #endif +#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" + using json = nlohmann::json; struct server_params @@ -59,6 +61,10 @@ static bool server_verbose = false; #define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) +json oaicompat_completion_params_parse(const json &body); +std::string format_chatml(std::vector<json> messages); + + // // base64 utils (TODO: move to common in the future) // @@ -378,6 +384,9 @@ struct llama_client_slot bool stopped_word = false; bool stopped_limit = false; + bool oaicompat = false; + std::string oaicompat_model; + std::string stopping_word; // sampling @@ -477,7 +486,7 @@ struct llama_client_slot }; } - void print_timings() { + void print_timings() const { LOG_TEE("\n"); LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed); @@ -609,6 +618,11 @@ struct llama_server_context std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const { + // TODO: currently, we tokenize using special tokens by default + // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216) + // but it's better compared to completely ignoring ChatML and other chat templates + const bool TMP_FORCE_SPECIAL = true; + // If `add_bos` is true, we only add BOS, when json_prompt is a string, // or the first element of the json_prompt array is a string. std::vector<llama_token> prompt_tokens; @@ -624,12 +638,12 @@ struct llama_server_context std::vector<llama_token> p; if (first) { - p = ::llama_tokenize(ctx, s, add_bos); + p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL); first = false; } else { - p = ::llama_tokenize(ctx, s, false); + p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL); } prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); } @@ -646,7 +660,7 @@ struct llama_server_context else { auto s = json_prompt.template get<std::string>(); - prompt_tokens = ::llama_tokenize(ctx, s, add_bos); + prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL); } return prompt_tokens; @@ -677,6 +691,14 @@ struct llama_server_context slot_params default_params; llama_sampling_params default_sparams; + if (data.count("__oaicompat") != 0) { + slot->oaicompat = true; + slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + } else { + slot->oaicompat = false; + slot->oaicompat_model = ""; + } + slot->params.stream = json_value(data, "stream", false); slot->params.cache_prompt = json_value(data, "cache_prompt", false); slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict); @@ -1170,6 +1192,12 @@ struct llama_server_context res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output); } + if (slot.oaicompat) + { + res.result_json["oaicompat_token_ctr"] = slot.n_decoded; + res.result_json["model"] = slot.oaicompat_model; + } + queue_results.push_back(res); } @@ -1217,6 +1245,12 @@ struct llama_server_context res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs); } + if (slot.oaicompat) + { + res.result_json["oaicompat_token_ctr"] = slot.n_decoded; + res.result_json["model"] = slot.oaicompat_model; + } + queue_results.push_back(res); } @@ -1257,7 +1291,7 @@ struct llama_server_context task_server task; task.id = id_gen++; task.target_id = 0; - task.data = data; + task.data = std::move(data); task.infill_mode = infill; task.embedding_mode = embedding; task.type = COMPLETION_TASK; @@ -2180,6 +2214,233 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } } + +static std::string random_string() +{ + static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); + + std::random_device rd; + std::mt19937 generator(rd()); + + std::string result(32, ' '); + + for (int i = 0; i < 32; ++i) { + result[i] = str[generator() % str.size()]; + } + + return result; +} + +static std::string gen_chatcmplid() +{ + std::stringstream chatcmplid; + chatcmplid << "chatcmpl-" << random_string(); + return chatcmplid.str(); +} + +std::string format_chatml(std::vector<json> messages) +{ + std::ostringstream chatml_msgs; + + for (auto it = messages.begin(); it != messages.end(); ++it) { + chatml_msgs << "<|im_start|>" + << json_value(*it, "role", std::string("user")) << '\n'; + chatml_msgs << json_value(*it, "content", std::string("")) + << "<|im_end|>\n"; + } + + chatml_msgs << "<|im_start|>assistant" << '\n'; + + return chatml_msgs.str(); +} + +/* llama.cpp completion api semantics */ +json oaicompat_completion_params_parse( + const json &body /* openai api json semantics */) +{ + json llama_params; + + llama_params["__oaicompat"] = true; + + // Map OpenAI parameters to llama.cpp parameters + llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' + llama_params["temperature"] = json_value(body, "temperature", 0.8); + llama_params["top_k"] = json_value(body, "top_k", 40); + llama_params["top_p"] = json_value(body, "top_p", 0.95); + llama_params["n_predict"] = json_value(body, "max_tokens", -1); + llama_params["logit_bias"] = json_value(body, "logit_bias",json::object()); + llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0); + llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0); + llama_params["seed"] = json_value(body, "seed", 0); + llama_params["stream"] = json_value(body, "stream", false); + llama_params["mirostat"] = json_value(body, "mirostat", false); + llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", 0.0); + llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", 0.0); + llama_params["penalize_nl"] = json_value(body, "penalize_nl", false); + llama_params["typical_p"] = json_value(body, "typical_p", 0.0); + llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", 0); + llama_params["ignore_eos"] = json_value(body, "ignore_eos", false); + llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0); + + if (llama_params.count("grammar") != 0) { + llama_params["grammar"] = json_value(body, "grammar", json::object()); + } + + // Handle 'stop' field + if (body["stop"].is_null()) { + llama_params["stop"] = json::array({}); + } else if (body["stop"].is_string()) { + llama_params["stop"] = json::array({body["stop"].get<std::string>()}); + } else { + llama_params["stop"] = json_value(body, "stop", json::array()); + } + + // Ensure there is ChatML-specific end sequence among stop words + llama_params["stop"].push_back("<|im_end|>"); + + return llama_params; +} + +static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false) +{ + json result = response.result_json; + + bool stopped_word = result.count("stopped_word") != 0; + bool stopped_eos = json_value(result, "stopped_eos", false); + int num_tokens_predicted = json_value(result, "tokens_predicted", 0); + int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); + std::string content = json_value(result, "content", std::string("")); + + std::string finish_reason = "length"; + if (stopped_word || stopped_eos) { + finish_reason = "stop"; + } + + json choices = + streaming ? json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}) + : json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"message", json{{"content", content}, + {"role", "assistant"}}}}}); + + std::time_t t = std::time(0); + + json res = + json{{"choices", choices}, + {"created", t}, + {"model", + json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, + {"object", streaming ? "chat.completion.chunk" : "chat.completion"}, + {"usage", + json{{"completion_tokens", num_tokens_predicted}, + {"prompt_tokens", num_prompt_tokens}, + {"total_tokens", num_tokens_predicted + num_prompt_tokens}}}, + {"id", gen_chatcmplid()}}; + + if (server_verbose) { + res["__verbose"] = result; + } + + if (result.contains("completion_probabilities")) { + res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); + } + + return res; +} + +// return value is vector as there is one case where we might need to generate two responses +static std::vector<json> format_partial_response_oaicompat(const task_result &response) { + json result = response.result_json; + + if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { + return std::vector<json>({response.result_json}); + } + + bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; + std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + + bool stopped_word = json_value(result, "stopped_word", false); + bool stopped_eos = json_value(result, "stopped_eos", false); + bool stopped_limit = json_value(result, "stopped_limit", false); + std::string content = json_value(result, "content", std::string("")); + + std::string finish_reason; + if (stopped_word || stopped_eos) { + finish_reason = "stop"; + } + if (stopped_limit) { + finish_reason = "length"; + } + + std::time_t t = std::time(0); + + json choices; + + if (!finish_reason.empty()) { + choices = json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}); + } else { + if (first) { + if (content.empty()) { + choices = json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{{"role", "assistant"}}}}}); + } else { + // We have to send this as two updates to conform to openai behavior + json initial_ret = json{{"choices", json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"role", "assistant"} + }}}})}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + json second_ret = json{ + {"choices", json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"content", content}}} + }})}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + return std::vector<json>({initial_ret, second_ret}); + } + } else { + // Some idiosyncrasy in task processing logic makes several trailing calls + // with empty content, we ignore these at the calee site. + if (content.empty()) { + return std::vector<json>({json::object()}); + } + + choices = json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", + json{ + {"content", content}, + }}, + }}); + } + } + + json ret = json{{"choices", choices}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + return std::vector<json>({ret}); +} + static json format_partial_response( llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs ) { @@ -2356,9 +2617,9 @@ int main(int argc, char **argv) task_result result = llama.next_result(task_id); if (!result.error) { const std::string str = - "data: " + - result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; + "data: " + + result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; LOG_VERBOSE("data stream", { { "to_send", str } }); @@ -2371,9 +2632,9 @@ int main(int argc, char **argv) } } else { const std::string str = - "error: " + - result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; + "error: " + + result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; LOG_VERBOSE("data stream", { { "to_send", str } }); @@ -2398,6 +2659,98 @@ int main(int argc, char **argv) } }); + + + svr.Get("/v1/models", [¶ms](const httplib::Request&, httplib::Response& res) + { + std::time_t t = std::time(0); + + json models = { + {"object", "list"}, + {"data", { + { + {"id", params.model_alias}, + {"object", "model"}, + {"created", t}, + {"owned_by", "llamacpp"} + }, + }} + }; + + res.set_content(models.dump(), "application/json"); + }); + + // TODO: add mount point without "/v1" prefix -- how? + svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, httplib::Response &res) + { + json data = oaicompat_completion_params_parse(json::parse(req.body)); + + const int task_id = llama.request_completion(data, false, false); + + if (!json_value(data, "stream", false)) { + std::string completion_text; + task_result result = llama.next_result(task_id); + + if (!result.error && result.stop) { + json oaicompat_result = format_final_response_oaicompat(data, result); + + res.set_content(oaicompat_result.dump(-1, ' ', false, + json::error_handler_t::replace), + "application/json"); + } else { + res.status = 500; + res.set_content(result.result_json["content"], "text/plain"); + return; + } + } else { + const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink &sink) { + while (true) { + task_result llama_result = llama.next_result(task_id); + if (!llama_result.error) { + std::vector<json> result_array = format_partial_response_oaicompat( llama_result); + + for (auto it = result_array.begin(); it != result_array.end(); ++it) + { + if (!it->empty()) { + const std::string str = + "data: " + + it->dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", {{"to_send", str}}); + if (!sink.write(str.c_str(), str.size())) { + return false; + } + } + } + if (llama_result.stop) { + break; + } + } else { + const std::string str = + "error: " + + llama_result.result_json.dump(-1, ' ', false, + json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", {{"to_send", str}}); + if (!sink.write(str.c_str(), str.size())) { + return false; + } + break; + } + } + sink.done(); + return true; + }; + + auto on_complete = [task_id, &llama](bool) { + // cancel request + llama.request_cancel(task_id); + }; + + res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); + } + }); + svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res) { json data = json::parse(req.body); |