path: root/examples/server/utils.hpp
author     Xuan Son Nguyen <thichthat@gmail.com>       2024-02-29 21:42:11 +0100
committer  GitHub <noreply@github.com>                 2024-02-29 21:42:11 +0100
commit     052051d8ae4639a1c3c61e7da3237bcc572469d4 (patch)
tree       bfe83ce18cc75d21579b10d45d183d96bee445cf /examples/server/utils.hpp
parent     d5ab29757ebc59a30f03e408294ec20628a6374e (diff)
Server: normalize naming (#5779)
* server: normalize naming
* fix spacing
Diffstat (limited to 'examples/server/utils.hpp')
-rw-r--r--  examples/server/utils.hpp  186
1 file changed, 122 insertions(+), 64 deletions(-)
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index d7abd7cb..d98541f2 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -37,10 +37,6 @@ extern bool server_log_json;
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
-//
-// parallel
-//
-
enum server_state {
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
SERVER_STATE_READY, // Server is ready and model is loaded
@@ -78,51 +74,8 @@ struct task_multi {
std::vector<task_result> results{};
};
-// TODO: can become bool if we can't find use of more states
-enum slot_state
-{
- IDLE,
- PROCESSING,
-};
-
-enum slot_command
-{
- NONE,
- LOAD_PROMPT,
- RELEASE,
-};
-
-struct slot_params
-{
- bool stream = true;
- bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
-
- uint32_t seed = -1; // RNG seed
- int32_t n_keep = 0; // number of tokens to keep from initial prompt
- int32_t n_predict = -1; // new tokens to predict
-
- std::vector<std::string> antiprompt;
-
- json input_prefix;
- json input_suffix;
-};
-
-struct slot_image
-{
- int32_t id;
-
- bool request_encode_image = false;
- float * image_embedding = nullptr;
- int32_t image_tokens = 0;
-
- clip_image_u8 * img_data;
-
- std::string prefix_prompt; // before of this image
-};
-
// completion token output with probabilities
-struct completion_token_output
-{
+struct completion_token_output {
struct token_prob
{
llama_token tok;
@@ -134,8 +87,13 @@ struct completion_token_output
std::string text_to_send;
};
-static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra)
-{
+struct token_translator {
+ llama_context * ctx;
+ std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
+ std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
+};
+
+static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
std::stringstream ss_tid;
ss_tid << std::this_thread::get_id();
json log = nlohmann::ordered_json{
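The new token_translator functor bundles a llama_context with llama_token_to_piece so token sequences can be detokenized with standard algorithms. A minimal usage sketch; ctx and tokens are assumed to be provided by the surrounding server code:

// Detokenize a vector of llama_token with token_translator.
// Assumes <algorithm>, <iterator>, <string>, <vector> are included and that
// `ctx` (llama_context *) and `tokens` (std::vector<llama_token>) exist.
std::vector<std::string> pieces;
pieces.reserve(tokens.size());
std::transform(tokens.begin(), tokens.end(),
               std::back_inserter(pieces),
               token_translator{ctx});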
@@ -183,8 +141,7 @@ static inline void server_log(const char *level, const char *function, int line,
//
template <typename T>
-static T json_value(const json &body, const std::string &key, const T &default_value)
-{
+static T json_value(const json &body, const std::string &key, const T &default_value) {
// Fallback null to default value
return body.contains(key) && !body.at(key).is_null()
? body.value(key, default_value)
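json_value keeps its behaviour of treating both a missing key and an explicit null as "use the default". A small sketch of that fallback, with made-up request fields:

// Fallback behaviour of json_value (illustrative values only).
json body = json::parse(R"({"stream": true, "n_predict": null})");
bool    stream    = json_value(body, "stream",    false); // present       -> true
int32_t n_predict = json_value(body, "n_predict", -1);    // explicit null -> -1
int32_t n_keep    = json_value(body, "n_keep",    0);     // missing key   -> 0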
@@ -200,8 +157,7 @@ inline bool verify_custom_template(const std::string & tmpl) {
}
// Format given chat. If tmpl is empty, we take the template from model metadata
-inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages)
-{
+inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
size_t alloc_size = 0;
// vector holding all allocated string to be passed to llama_chat_apply_template
std::vector<std::string> str(messages.size() * 2);
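format_chat takes the OpenAI-style messages array used by the chat endpoint. A hedged calling sketch; the model pointer and the role/content field names are assumptions based on how the server uses this helper, not something shown in this hunk:

// Build OpenAI-style messages and format them into a single prompt.
// `model` is assumed to come from llama_load_model_from_file elsewhere;
// an empty tmpl falls back to the template stored in the model metadata.
std::vector<json> messages = {
    json{{"role", "system"}, {"content", "You are a helpful assistant."}},
    json{{"role", "user"},   {"content", "Hello!"}},
};
const std::string prompt = format_chat(model, /*tmpl=*/"", messages);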
@@ -250,7 +206,7 @@ struct llama_server_queue {
// callback functions
std::function<void(task_server&)> callback_new_task;
std::function<void(task_multi&)> callback_finish_multitask;
- std::function<void(void)> callback_all_task_finished;
+ std::function<void(void)> callback_run_slots;
// Add a new task to the end of the queue
int post(task_server task) {
@@ -283,14 +239,14 @@ struct llama_server_queue {
callback_new_task = callback;
}
- // Register function to process a multitask
+ // Register function to process a multitask when it is finished
void on_finish_multitask(std::function<void(task_multi&)> callback) {
callback_finish_multitask = callback;
}
- // Register the function to be called when the batch of tasks is finished
- void on_all_tasks_finished(std::function<void(void)> callback) {
- callback_all_task_finished = callback;
+ // Register the function to be called when all slots data is ready to be processed
+ void on_run_slots(std::function<void(void)> callback) {
+ callback_run_slots = callback;
}
// Call when the state of one slot is changed
@@ -312,7 +268,13 @@ struct llama_server_queue {
condition_tasks.notify_all();
}
- // Start the main loop.
+ /**
+ * Main loop consists of these steps:
+ * - Wait until a new task arrives
+ * - Process the task (i.e. maybe copy data into slot)
+ * - Check if multitask is finished
+ * - Run all slots
+ */
void start_loop() {
running = true;
while (true) {
@@ -331,8 +293,8 @@ struct llama_server_queue {
LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
callback_new_task(task);
}
- LOG_VERBOSE("callback_all_task_finished", {});
- // process and update all the multitasks
+ LOG_VERBOSE("update_multitasks", {});
+ // check if we have any finished multitasks
auto queue_iterator = queue_multitasks.begin();
while (queue_iterator != queue_multitasks.end())
{
@@ -349,8 +311,9 @@ struct llama_server_queue {
++queue_iterator;
}
}
- // all tasks in the current loop is finished
- callback_all_task_finished();
+ // all tasks in the current loop is processed, slots data is now ready
+ LOG_VERBOSE("callback_run_slots", {});
+ callback_run_slots();
}
LOG_VERBOSE("wait for new task", {});
// wait for new task
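start_loop drives the three callbacks registered above (on_new_task, on_finish_multitask, and the renamed on_run_slots). A wiring sketch with placeholder lambda bodies; the real handlers live in server.cpp:

// Registering the queue callbacks (placeholder lambdas, not the real logic).
llama_server_queue queue_tasks;
queue_tasks.on_new_task([&](task_server & task) {
    // placeholder: assign the task to a free slot and copy its data
});
queue_tasks.on_finish_multitask([&](task_multi & multitask) {
    // placeholder: aggregate sub-task results into one response
});
queue_tasks.on_run_slots([&]() {
    // placeholder: batch and decode all slots that have pending work
});
queue_tasks.start_loop(); // blocks and processes tasks on this thread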
@@ -408,12 +371,14 @@ struct llama_server_response {
std::mutex mutex_results;
std::condition_variable condition_results;
+ // add the task_id to the list of tasks waiting for response
void add_waiting_task_id(int task_id) {
LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.insert(task_id);
}
+ // when the request is finished, we can remove task associated with it
void remove_waiting_task_id(int task_id) {
LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
std::unique_lock<std::mutex> lock(mutex_results);
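These two helpers bracket the lifetime of an HTTP request. A sketch of the intended call order, assuming the blocking recv(task_id) defined elsewhere in this header and a task whose id is already set:

// Typical request lifecycle (queue_tasks, queue_results and task are assumed
// to be set up by the caller; recv() is defined elsewhere in utils.hpp).
queue_results.add_waiting_task_id(task.id);        // start tracking results for this task
queue_tasks.post(task);                            // enqueue the work
task_result result = queue_results.recv(task.id);  // block until a result arrives
queue_results.remove_waiting_task_id(task.id);     // request finished, stop tracking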
@@ -574,3 +539,96 @@ static std::string gen_chatcmplid()
chatcmplid << "chatcmpl-" << random_string();
return chatcmplid.str();
}
+
+//
+// other common utils
+//
+
+static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
+{
+ size_t i;
+ for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
+ {
+ }
+ return i;
+}
+
+static bool ends_with(const std::string &str, const std::string &suffix)
+{
+ return str.size() >= suffix.size() &&
+ 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
+}
+
+static size_t find_partial_stop_string(const std::string &stop,
+ const std::string &text)
+{
+ if (!text.empty() && !stop.empty())
+ {
+ const char text_last_char = text.back();
+ for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
+ {
+ if (stop[char_index] == text_last_char)
+ {
+ const std::string current_partial = stop.substr(0, char_index + 1);
+ if (ends_with(text, current_partial))
+ {
+ return text.size() - char_index - 1;
+ }
+ }
+ }
+ }
+ return std::string::npos;
+}
+
+// TODO: reuse llama_detokenize
+template <class Iter>
+static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
+{
+ std::string ret;
+ for (; begin != end; ++begin)
+ {
+ ret += llama_token_to_piece(ctx, *begin);
+ }
+ return ret;
+}
+
+// format incomplete utf-8 multibyte character for output
+static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
+{
+ std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
+ // if the size is 1 and first bit is 1, meaning it's a partial character
+ // (size > 1 meaning it's already a known token)
+ if (out.size() == 1 && (out[0] & 0x80) == 0x80)
+ {
+ std::stringstream ss;
+ ss << std::hex << (out[0] & 0xff);
+ std::string res(ss.str());
+ out = "byte: \\x" + res;
+ }
+ return out;
+}
+
+// convert a vector of completion_token_output to json
+static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
+{
+ json out = json::array();
+ for (const auto &prob : probs)
+ {
+ json probs_for_token = json::array();
+ for (const auto &p : prob.probs)
+ {
+ std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
+ probs_for_token.push_back(json
+ {
+ {"tok_str", tok_str},
+ {"prob", p.prob},
+ });
+ }
+ std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
+ out.push_back(json{
+ {"content", tok_str},
+ {"probs", probs_for_token},
+ });
+ }
+ return out;
+}
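For reference, a standalone sketch of how the stop-string helpers behave (illustrative values only; assumes ends_with and find_partial_stop_string above are in scope):

#include <cassert>
#include <string>

int main() {
    assert(ends_with("Hello world", "world"));

    // "Hello ##" ends with "##", a prefix of the stop string "###", so the
    // partial match starts at index 6 and those characters should be held
    // back from streaming until the match resolves.
    assert(find_partial_stop_string("###", "Hello ##") == 6);

    // No suffix of the text is a prefix of the stop string.
    assert(find_partial_stop_string("###", "Hello") == std::string::npos);
    return 0;
}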