diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2024-03-07 11:41:53 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-07 11:41:53 +0200 |
commit | 2002bc96bf2cbf5ab981a17d7e994d817c9801f5 (patch) | |
tree | e96b820fcd091c19ebbbae353c5358d9978cc830 /examples/server/utils.hpp | |
parent | ceca1aef0738b57951cd12c603c3477e75312dec (diff) |
server : refactor (#5882)
* server : refactoring (wip)
* server : remove llava/clip objects from build
* server : fix empty prompt handling + all slots idle logic
* server : normalize id vars
* server : code style
* server : simplify model chat template validation
* server : code style
* server : minor
* llama : llama_chat_apply_template support null buf
* server : do not process embedding requests when disabled
* server : reorganize structs and enums + naming fixes
* server : merge oai.hpp in utils.hpp
* server : refactor system prompt update at start
* server : disable cached prompts with self-extend
* server : do not process more than n_batch tokens per iter
* server: tests: embeddings use a real embeddings model (#5908)
* server, tests : bump batch to fit 1 embedding prompt
* server: tests: embeddings fix build type Debug is randomly failing (#5911)
* server: tests: embeddings, use different KV Cache size
* server: tests: embeddings, fixed prompt do not exceed n_batch, increase embedding timeout, reduce number of concurrent embeddings
* server: tests: embeddings, no need to wait for server idle as it can timout
* server: refactor: clean up http code (#5912)
* server : avoid n_available var
ggml-ci
* server: refactor: better http codes
* server : simplify json parsing + add comment about t_last
* server : rename server structs
* server : allow to override FQDN in tests
ggml-ci
* server : add comments
---------
Co-authored-by: Pierrick Hymbert <pierrick.hymbert@gmail.com>
Diffstat (limited to 'examples/server/utils.hpp')
-rw-r--r-- | examples/server/utils.hpp | 703 |
1 files changed, 307 insertions, 396 deletions
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index b6e49d8b..df0a2778 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -1,15 +1,16 @@ #pragma once -#include <string> -#include <vector> -#include <set> -#include <mutex> -#include <condition_variable> -#include <unordered_map> +#include "llama.h" +#include "common.h" #include "json.hpp" -#include "../llava/clip.h" +#include <string> +#include <vector> +#include <sstream> +#include <random> + +#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" using json = nlohmann::json; @@ -37,83 +38,35 @@ extern bool server_log_json; #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) -enum server_state { - SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet - SERVER_STATE_READY, // Server is ready and model is loaded - SERVER_STATE_ERROR // An error occurred, load_model failed -}; - -enum task_type { - TASK_TYPE_COMPLETION, - TASK_TYPE_CANCEL, - TASK_TYPE_NEXT_RESPONSE, - TASK_TYPE_METRICS -}; - -struct task_server { - int id = -1; // to be filled by llama_server_queue - int target_id; - task_type type; - json data; - bool infill_mode = false; - bool embedding_mode = false; - int multitask_id = -1; -}; - -struct task_result { - int id; - int multitask_id = -1; - bool stop; - bool error; - json result_json; -}; - -struct task_multi { - int id; - std::set<int> subtasks_remaining{}; - std::vector<task_result> results{}; -}; - -// completion token output with probabilities -struct completion_token_output { - struct token_prob - { - llama_token tok; - float prob; - }; - - std::vector<token_prob> probs; - llama_token tok; - std::string text_to_send; -}; - -struct token_translator { - llama_context * ctx; - std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); } - std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); } -}; +template <typename T> +static T json_value(const json &body, const std::string &key, const T &default_value) { + // Fallback null to default value + return body.contains(key) && !body.at(key).is_null() + ? body.value(key, default_value) + : default_value; +} static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) { std::stringstream ss_tid; ss_tid << std::this_thread::get_id(); json log = nlohmann::ordered_json{ - {"tid", ss_tid.str()}, + {"tid", ss_tid.str()}, {"timestamp", time(nullptr)}, }; if (server_log_json) { - log.merge_patch( - { - {"level", level}, - {"function", function}, - {"line", line}, - {"msg", message}, - }); + log.merge_patch( { + {"level", level}, + {"function", function}, + {"line", line}, + {"msg", message}, + }); + if (!extra.empty()) { log.merge_patch(extra); } - std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush; + printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str()); } else { char buf[1024]; snprintf(buf, 1024, "%4s [%24s] %s", level, function, message); @@ -136,22 +89,13 @@ static inline void server_log(const char *level, const char *function, int line, } // -// server utils +// chat template utils // -template <typename T> -static T json_value(const json &body, const std::string &key, const T &default_value) { - // Fallback null to default value - return body.contains(key) && !body.at(key).is_null() - ? body.value(key, default_value) - : default_value; -} - // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid inline bool verify_custom_template(const std::string & tmpl) { llama_chat_message chat[] = {{"user", "test"}}; - std::vector<char> buf(1); - int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size()); + int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0); return res >= 0; } @@ -163,7 +107,7 @@ inline std::string format_chat(const struct llama_model * model, const std::stri std::vector<llama_chat_message> chat(messages.size()); for (size_t i = 0; i < messages.size(); ++i) { - auto &curr_msg = messages[i]; + const auto & curr_msg = messages[i]; str[i*2 + 0] = json_value(curr_msg, "role", std::string("")); str[i*2 + 1] = json_value(curr_msg, "content", std::string("")); alloc_size += str[i*2 + 1].length(); @@ -183,262 +127,14 @@ inline std::string format_chat(const struct llama_model * model, const std::stri res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size()); } - std::string formatted_chat(buf.data(), res); + const std::string formatted_chat(buf.data(), res); + LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}}); return formatted_chat; } // -// work queue utils -// - -struct llama_server_queue { - int id = 0; - std::mutex mutex_tasks; - bool running; - // queues - std::vector<task_server> queue_tasks; - std::vector<task_server> queue_tasks_deferred; - std::vector<task_multi> queue_multitasks; - std::condition_variable condition_tasks; - // callback functions - std::function<void(task_server&)> callback_new_task; - std::function<void(task_multi&)> callback_finish_multitask; - std::function<void(void)> callback_run_slots; - - // Add a new task to the end of the queue - int post(task_server task) { - std::unique_lock<std::mutex> lock(mutex_tasks); - if (task.id == -1) { - task.id = id++; - LOG_VERBOSE("new task id", {{"new_id", task.id}}); - } - queue_tasks.push_back(std::move(task)); - condition_tasks.notify_one(); - return task.id; - } - - // Add a new task, but defer until one slot is available - void defer(task_server task) { - std::unique_lock<std::mutex> lock(mutex_tasks); - queue_tasks_deferred.push_back(std::move(task)); - } - - // Get the next id for creating anew task - int get_new_id() { - std::unique_lock<std::mutex> lock(mutex_tasks); - int new_id = id++; - LOG_VERBOSE("new task id", {{"new_id", new_id}}); - return new_id; - } - - // Register function to process a new task - void on_new_task(std::function<void(task_server&)> callback) { - callback_new_task = callback; - } - - // Register function to process a multitask when it is finished - void on_finish_multitask(std::function<void(task_multi&)> callback) { - callback_finish_multitask = callback; - } - - // Register the function to be called when all slots data is ready to be processed - void on_run_slots(std::function<void(void)> callback) { - callback_run_slots = callback; - } - - // Call when the state of one slot is changed - void notify_slot_changed() { - // move deferred tasks back to main loop - std::unique_lock<std::mutex> lock(mutex_tasks); - for (auto & task : queue_tasks_deferred) { - queue_tasks.push_back(std::move(task)); - } - queue_tasks_deferred.clear(); - } - - // end the start_loop routine - void terminate() { - { - std::unique_lock<std::mutex> lock(mutex_tasks); - running = false; - } - condition_tasks.notify_all(); - } - - /** - * Main loop consists of these steps: - * - Wait until a new task arrives - * - Process the task (i.e. maybe copy data into slot) - * - Check if multitask is finished - * - Run all slots - */ - void start_loop() { - running = true; - while (true) { - LOG_VERBOSE("new task may arrive", {}); - { - while (true) - { - std::unique_lock<std::mutex> lock(mutex_tasks); - if (queue_tasks.empty()) { - lock.unlock(); - break; - } - task_server task = queue_tasks.front(); - queue_tasks.erase(queue_tasks.begin()); - lock.unlock(); - LOG_VERBOSE("callback_new_task", {{"task_id", task.id}}); - callback_new_task(task); - } - LOG_VERBOSE("update_multitasks", {}); - // check if we have any finished multitasks - auto queue_iterator = queue_multitasks.begin(); - while (queue_iterator != queue_multitasks.end()) - { - if (queue_iterator->subtasks_remaining.empty()) - { - // all subtasks done == multitask is done - task_multi current_multitask = *queue_iterator; - callback_finish_multitask(current_multitask); - // remove this multitask - queue_iterator = queue_multitasks.erase(queue_iterator); - } - else - { - ++queue_iterator; - } - } - // all tasks in the current loop is processed, slots data is now ready - LOG_VERBOSE("callback_run_slots", {}); - callback_run_slots(); - } - LOG_VERBOSE("wait for new task", {}); - // wait for new task - { - std::unique_lock<std::mutex> lock(mutex_tasks); - if (queue_tasks.empty()) { - if (!running) { - LOG_VERBOSE("ending start_loop", {}); - return; - } - condition_tasks.wait(lock, [&]{ - return (!queue_tasks.empty() || !running); - }); - } - } - } - } - - // - // functions to manage multitasks - // - - // add a multitask by specifying the id of all subtask (subtask is a task_server) - void add_multitask(int multitask_id, std::vector<int>& sub_ids) - { - std::lock_guard<std::mutex> lock(mutex_tasks); - task_multi multi; - multi.id = multitask_id; - std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end())); - queue_multitasks.push_back(multi); - } - - // updatethe remaining subtasks, while appending results to multitask - void update_multitask(int multitask_id, int subtask_id, task_result& result) - { - std::lock_guard<std::mutex> lock(mutex_tasks); - for (auto& multitask : queue_multitasks) - { - if (multitask.id == multitask_id) - { - multitask.subtasks_remaining.erase(subtask_id); - multitask.results.push_back(result); - } - } - } -}; - -struct llama_server_response { - typedef std::function<void(int, int, task_result&)> callback_multitask_t; - callback_multitask_t callback_update_multitask; - // for keeping track of all tasks waiting for the result - std::set<int> waiting_task_ids; - // the main result queue - std::vector<task_result> queue_results; - std::mutex mutex_results; - std::condition_variable condition_results; - - // add the task_id to the list of tasks waiting for response - void add_waiting_task_id(int task_id) { - LOG_VERBOSE("waiting for task id", {{"task_id", task_id}}); - std::unique_lock<std::mutex> lock(mutex_results); - waiting_task_ids.insert(task_id); - } - - // when the request is finished, we can remove task associated with it - void remove_waiting_task_id(int task_id) { - LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}}); - std::unique_lock<std::mutex> lock(mutex_results); - waiting_task_ids.erase(task_id); - } - - // This function blocks the thread until there is a response for this task_id - task_result recv(int task_id) { - while (true) - { - std::unique_lock<std::mutex> lock(mutex_results); - condition_results.wait(lock, [&]{ - return !queue_results.empty(); - }); - - for (int i = 0; i < (int) queue_results.size(); i++) - { - if (queue_results[i].id == task_id) - { - assert(queue_results[i].multitask_id == -1); - task_result res = queue_results[i]; - queue_results.erase(queue_results.begin() + i); - return res; - } - } - } - - // should never reach here - } - - // Register the function to update multitask - void on_multitask_update(callback_multitask_t callback) { - callback_update_multitask = callback; - } - - // Send a new result to a waiting task_id - void send(task_result result) { - std::unique_lock<std::mutex> lock(mutex_results); - LOG_VERBOSE("send new result", {{"task_id", result.id}}); - for (auto& task_id : waiting_task_ids) { - // LOG_TEE("waiting task id %i \n", task_id); - // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result - if (result.multitask_id == task_id) - { - LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}}); - callback_update_multitask(task_id, result.id, result); - continue; - } - - if (result.id == task_id) - { - LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}}); - queue_results.push_back(result); - condition_results.notify_all(); - return; - } - } - } -}; - -// // base64 utils (TODO: move to common in the future) // @@ -447,13 +143,11 @@ static const std::string base64_chars = "abcdefghijklmnopqrstuvwxyz" "0123456789+/"; -static inline bool is_base64(uint8_t c) -{ +static inline bool is_base64(uint8_t c) { return (isalnum(c) || (c == '+') || (c == '/')); } -static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string) -{ +static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string) { int i = 0; int j = 0; int in_ = 0; @@ -465,13 +159,10 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str std::vector<uint8_t> ret; - while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) - { + while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) { char_array_4[i++] = encoded_string[in_]; in_++; - if (i == 4) - { - for (i = 0; i <4; i++) - { + if (i == 4) { + for (i = 0; i < 4; i++) { char_array_4[i] = base64_chars.find(char_array_4[i]); } @@ -479,23 +170,20 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; - for (i = 0; (i < 3); i++) - { + for (i = 0; (i < 3); i++) { ret.push_back(char_array_3[i]); } + i = 0; } } - if (i) - { - for (j = i; j <4; j++) - { + if (i) { + for (j = i; j < 4; j++) { char_array_4[j] = 0; } - for (j = 0; j <4; j++) - { + for (j = 0; j < 4; j++) { char_array_4[j] = base64_chars.find(char_array_4[j]); } @@ -503,8 +191,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; - for (j = 0; (j < i - 1); j++) - { + for (j = 0; j < i - 1; j++) { ret.push_back(char_array_3[j]); } } @@ -516,8 +203,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str // random string / id // -static std::string random_string() -{ +static std::string random_string() { static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); std::random_device rd; @@ -532,10 +218,10 @@ static std::string random_string() return result; } -static std::string gen_chatcmplid() -{ +static std::string gen_chatcmplid() { std::stringstream chatcmplid; chatcmplid << "chatcmpl-" << random_string(); + return chatcmplid.str(); } @@ -543,91 +229,316 @@ static std::string gen_chatcmplid() // other common utils // -static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b) -{ +static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) { size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) - { - } + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} + return i; } -static bool ends_with(const std::string &str, const std::string &suffix) -{ - return str.size() >= suffix.size() && - 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); +static bool ends_with(const std::string & str, const std::string & suffix) { + return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); } -static size_t find_partial_stop_string(const std::string &stop, - const std::string &text) -{ - if (!text.empty() && !stop.empty()) - { +static size_t find_partial_stop_string(const std::string &stop, const std::string &text) { + if (!text.empty() && !stop.empty()) { const char text_last_char = text.back(); - for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) - { - if (stop[char_index] == text_last_char) - { + for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) { + if (stop[char_index] == text_last_char) { const std::string current_partial = stop.substr(0, char_index + 1); - if (ends_with(text, current_partial)) - { + if (ends_with(text, current_partial)) { return text.size() - char_index - 1; } } } } + return std::string::npos; } // TODO: reuse llama_detokenize template <class Iter> -static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) -{ +static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { std::string ret; - for (; begin != end; ++begin) - { + for (; begin != end; ++begin) { ret += llama_token_to_piece(ctx, *begin); } + return ret; } // format incomplete utf-8 multibyte character for output -static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) -{ +static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) { std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token); + // if the size is 1 and first bit is 1, meaning it's a partial character // (size > 1 meaning it's already a known token) - if (out.size() == 1 && (out[0] & 0x80) == 0x80) - { + if (out.size() == 1 && (out[0] & 0x80) == 0x80) { std::stringstream ss; ss << std::hex << (out[0] & 0xff); std::string res(ss.str()); out = "byte: \\x" + res; } + return out; } +struct completion_token_output { + llama_token tok; + std::string text_to_send; + + struct token_prob { + llama_token tok; + float prob; + }; + + std::vector<token_prob> probs; +}; + // convert a vector of completion_token_output to json -static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs) -{ +static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) { json out = json::array(); - for (const auto &prob : probs) - { + + for (const auto & prob : probs) { json probs_for_token = json::array(); - for (const auto &p : prob.probs) - { - std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); - probs_for_token.push_back(json - { + + for (const auto & p : prob.probs) { + const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); + probs_for_token.push_back(json { {"tok_str", tok_str}, {"prob", p.prob}, }); } - std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); - out.push_back(json{ + + const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); + out.push_back(json { {"content", tok_str}, {"probs", probs_for_token}, }); } + return out; } + +// +// OAI utils +// + +static json oaicompat_completion_params_parse( + const struct llama_model * model, + const json & body, /* openai api json semantics */ + const std::string & chat_template) { + json llama_params; + + llama_params["__oaicompat"] = true; + + // Map OpenAI parameters to llama.cpp parameters + // + // For parameters that are defined by the OpenAI documentation (e.g. + // temperature), we explicitly specify OpenAI's intended default; we + // need to do that because sometimes OpenAI disagrees with llama.cpp + // + // https://platform.openai.com/docs/api-reference/chat/create + llama_sampling_params default_sparams; + llama_params["model"] = json_value(body, "model", std::string("unknown")); + llama_params["prompt"] = format_chat(model, chat_template, body["messages"]); + llama_params["cache_prompt"] = json_value(body, "cache_prompt", false); + llama_params["temperature"] = json_value(body, "temperature", 0.0); + llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k); + llama_params["top_p"] = json_value(body, "top_p", 1.0); + llama_params["n_predict"] = json_value(body, "max_tokens", -1); + llama_params["logit_bias"] = json_value(body, "logit_bias", json::object()); + llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0); + llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0); + llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED); + llama_params["stream"] = json_value(body, "stream", false); + llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat); + llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau); + llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta); + llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl); + llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p); + llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n); + llama_params["ignore_eos"] = json_value(body, "ignore_eos", false); + llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z); + + if (body.count("grammar") != 0) { + llama_params["grammar"] = json_value(body, "grammar", json::object()); + } + + // Handle 'stop' field + if (body.contains("stop") && body["stop"].is_string()) { + llama_params["stop"] = json::array({body["stop"].get<std::string>()}); + } else { + llama_params["stop"] = json_value(body, "stop", json::array()); + } + + // Ensure there is ChatML-specific end sequence among stop words + llama_params["stop"].push_back("<|im_end|>"); + + return llama_params; +} + +static json format_final_response_oaicompat(const json & request, json result, bool streaming = false) { + bool stopped_word = result.count("stopped_word") != 0; + bool stopped_eos = json_value(result, "stopped_eos", false); + int num_tokens_predicted = json_value(result, "tokens_predicted", 0); + int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); + std::string content = json_value(result, "content", std::string("")); + + std::string finish_reason = "length"; + if (stopped_word || stopped_eos) { + finish_reason = "stop"; + } + + json choices = + streaming ? json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}) + : json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"message", json{{"content", content}, + {"role", "assistant"}}}}}); + + std::time_t t = std::time(0); + + json res = json { + {"choices", choices}, + {"created", t}, + {"model", + json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, + {"object", streaming ? "chat.completion.chunk" : "chat.completion"}, + {"usage", json { + {"completion_tokens", num_tokens_predicted}, + {"prompt_tokens", num_prompt_tokens}, + {"total_tokens", num_tokens_predicted + num_prompt_tokens} + }}, + {"id", gen_chatcmplid()} + }; + + if (server_verbose) { + res["__verbose"] = result; + } + + if (result.contains("completion_probabilities")) { + res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); + } + + return res; +} + +// return value is vector as there is one case where we might need to generate two responses +static std::vector<json> format_partial_response_oaicompat(json result) { + if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { + return std::vector<json>({result}); + } + + bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; + std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + + bool stopped_word = json_value(result, "stopped_word", false); + bool stopped_eos = json_value(result, "stopped_eos", false); + bool stopped_limit = json_value(result, "stopped_limit", false); + std::string content = json_value(result, "content", std::string("")); + + std::string finish_reason; + if (stopped_word || stopped_eos) { + finish_reason = "stop"; + } + if (stopped_limit) { + finish_reason = "length"; + } + + std::time_t t = std::time(0); + + json choices; + + if (!finish_reason.empty()) { + choices = json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}); + } else { + if (first) { + if (content.empty()) { + choices = json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{{"role", "assistant"}}}}}); + } else { + // We have to send this as two updates to conform to openai behavior + json initial_ret = json{{"choices", json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"role", "assistant"} + }}}})}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + json second_ret = json{ + {"choices", json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"content", content}}} + }})}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + return std::vector<json>({initial_ret, second_ret}); + } + } else { + // Some idiosyncrasy in task processing logic makes several trailing calls + // with empty content, we ignore these at the calee site. + if (content.empty()) { + return std::vector<json>({json::object()}); + } + + choices = json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", + json{ + {"content", content}, + }}, + }}); + } + } + + json ret = json { + {"choices", choices}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"} + }; + + return std::vector<json>({ret}); +} + +static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) { + json res = json { + {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, + {"object", "list"}, + {"usage", json { + {"prompt_tokens", 0}, + {"total_tokens", 0} + }}, + {"data", embeddings} + }; + + return res; +} + +static json format_tokenizer_response(const std::vector<llama_token> & tokens) { + return json { + {"tokens", tokens} + }; +} + +static json format_detokenized_response(const std::string & content) { + return json { + {"content", content} + }; +} |