summaryrefslogtreecommitdiff
path: root/examples
diff options
context:
space:
mode:
author sasha0552 <admin@sasha0552.org> 2024-06-08 07:50:31 +0000
committer GitHub <noreply@github.com> 2024-06-08 10:50:31 +0300
commit 7a16ce7db2a74a223f0f3b9cee66d4539c5bce8f (patch)
tree f1235f9d8ee68d4c39403bd2bca1078062cab2d7 /examples
parent da799b41891e34aac86ce4e173f9c4c0afd4fab3 (diff)
server : smart slot selection using Longest Common Prefix (#7728)
* server : Smart selection of available slot using Longest Common Substring
* add usage
* remove trailing whitespaces
* Use Longest Common Prefix (LCP) instead of LCS
* Rename argument
Diffstat (limited to 'examples')
-rw-r--r-- examples/server/server.cpp 134
-rw-r--r-- examples/server/utils.hpp 7
2 files changed, 126 insertions, 15 deletions
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 52822060..6ffaa8d9 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -647,6 +647,9 @@ struct server_context {
server_metrics metrics;
+ // Necessary similarity of prompt for slot selection
+ float slot_prompt_similarity = 0.0f;
+
~server_context() {
if (ctx) {
llama_free(ctx);
@@ -795,24 +798,88 @@ struct server_context {
return prompt_tokens;
}
- server_slot * get_slot(int id) {
- int64_t t_last = ggml_time_us();
-
- server_slot * last_used = nullptr;
-
+ server_slot * get_slot_by_id(int id) { // returns the slot whose id equals `id`; availability is not checked here
for (server_slot & slot : slots) {
- if (slot.id == id && slot.available()) {
+ if (slot.id == id) {
return &slot;
}
+ }
+
+ return nullptr; // no slot in `slots` has the requested id
+ }
+
+ // Pick an available slot for a new request, or return nullptr if none is available.
+ // Stage 1 (only when slot_prompt_similarity != 0 and the prompt is non-empty): among available
+ // slots holding a string prompt, take the one with the longest common prefix with `prompt`,
+ // provided that prefix covers more than slot_prompt_similarity of the slot's own prompt.
+ // Stage 2 (fallback): take the available slot with the smallest t_last_used.
+ server_slot * get_available_slot(const std::string & prompt) {
+ server_slot * ret = nullptr;
+
+ // find the slot whose prompt similarity exceeds the configured fraction
+ if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) {
+ int max_lcp_len = 0;
+ // similarity of the currently selected slot (kept in sync with ret so the log below is accurate)
+ float similarity = 0;
+
+ for (server_slot & slot : slots) {
+ // skip the slot if it is not available
+ if (!slot.available()) {
+ continue;
+ }
+
+ // skip the slot if it does not contain a string prompt
+ if (!slot.prompt.is_string()) {
+ continue;
+ }
+
+ // current slot's prompt
+ std::string slot_prompt = slot.prompt.get<std::string>();
+
+ // length of the current slot's prompt
+ int slot_prompt_len = slot_prompt.size();
+
+ // skip the slot if its prompt is empty: it shares no prefix and would divide by zero below
+ if (slot_prompt_len == 0) {
+ continue;
+ }
+
+ // length of the Longest Common Prefix between the current slot's prompt and the input prompt
+ int lcp_len = common_part(slot_prompt, prompt);
+
+ // fraction of the common prefix length compared to the current slot's prompt length
+ const float cur_similarity = static_cast<float>(lcp_len) / slot_prompt_len;
+
+ // select the current slot if the criteria match
+ if (lcp_len > max_lcp_len && cur_similarity > slot_prompt_similarity) {
+ max_lcp_len = lcp_len;
+ similarity = cur_similarity;
+ ret = &slot;
+ }
+ }
- // among all available slots, find the one that has been least recently used
- if (slot.available() && slot.t_last_used < t_last) {
- last_used = &slot;
- t_last = slot.t_last_used;
+ if (ret != nullptr) {
+ LOG_VERBOSE("selected slot by lcp similarity", {
+ {"id_slot", ret->id},
+ {"max_lcp_len", max_lcp_len},
+ {"similarity", similarity},
+ });
}
}
- return last_used;
+ // find the slot that has been least recently used
+ if (ret == nullptr) {
+ int64_t t_last = ggml_time_us();
+ for (server_slot & slot : slots) {
+ // skip the slot if it is not available
+ if (!slot.available()) {
+ continue;
+ }
+
+ // select the current slot if the criteria match
+ if (slot.t_last_used < t_last) {
+ t_last = slot.t_last_used;
+ ret = &slot;
+ }
+ }
+
+ if (ret != nullptr) {
+ LOG_VERBOSE("selected slot by lru", {
+ {"id_slot", ret->id},
+ {"t_last", t_last},
+ });
+ }
+ }
+
+ return ret;
}
bool launch_slot_with_task(server_slot & slot, const server_task & task) {
@@ -1515,13 +1582,29 @@ struct server_context {
switch (task.type) {
case SERVER_TASK_TYPE_COMPLETION:
{
- server_slot * slot = get_slot(json_value(task.data, "id_slot", -1));
+ int id_slot = json_value(task.data, "id_slot", -1);
+ std::string prompt = json_value(task.data, "prompt", std::string());
+
+ server_slot * slot;
+
+ if (id_slot != -1) {
+ slot = get_slot_by_id(id_slot);
+ } else {
+ slot = get_available_slot(prompt);
+ }
+
if (slot == nullptr) {
// if no slot is available, we defer this task for processing later
LOG_VERBOSE("no slot is available", {{"id_task", task.id}});
queue_tasks.defer(task);
break;
}
+ if (!slot->available()) {
+ // if requested slot is unavailable, we defer this task for processing later
+ LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ queue_tasks.defer(task);
+ break;
+ }
if (task.data.contains("system_prompt")) {
std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
@@ -1638,11 +1721,17 @@ struct server_context {
case SERVER_TASK_TYPE_SLOT_SAVE:
{
int id_slot = task.data.at("id_slot");
- server_slot * slot = get_slot(id_slot);
+ server_slot * slot = get_slot_by_id(id_slot);
if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break;
}
+ if (!slot->available()) {
+ // if requested slot is unavailable, we defer this task for processing later
+ LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ queue_tasks.defer(task);
+ break;
+ }
const size_t token_count = slot->cache_tokens.size();
const int64_t t_start = ggml_time_us();
@@ -1673,11 +1762,17 @@ struct server_context {
case SERVER_TASK_TYPE_SLOT_RESTORE:
{
int id_slot = task.data.at("id_slot");
- server_slot * slot = get_slot(id_slot);
+ server_slot * slot = get_slot_by_id(id_slot);
if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break;
}
+ if (!slot->available()) {
+ // if requested slot is unavailable, we defer this task for processing later
+ LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ queue_tasks.defer(task);
+ break;
+ }
const int64_t t_start = ggml_time_us();
@@ -1715,11 +1810,17 @@ struct server_context {
case SERVER_TASK_TYPE_SLOT_ERASE:
{
int id_slot = task.data.at("id_slot");
- server_slot * slot = get_slot(id_slot);
+ server_slot * slot = get_slot_by_id(id_slot);
if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break;
}
+ if (!slot->available()) {
+ // if requested slot is unavailable, we defer this task for processing later
+ LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ queue_tasks.defer(task);
+ break;
+ }
// Erase token cache
const size_t n_erased = slot->cache_tokens.size();
@@ -2467,6 +2568,9 @@ int main(int argc, char ** argv) {
log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded";
}
+ // Necessary similarity of prompt for slot selection
+ ctx_server.slot_prompt_similarity = params.slot_prompt_similarity;
+
// load the model
if (!ctx_server.load_model(params)) {
state.store(SERVER_STATE_ERROR);
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index b7bfb41d..63fde9c9 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -253,6 +253,13 @@ static size_t common_part(const std::vector<llama_token> & a, const std::vector<
return i;
}
+static size_t common_part(const std::string & a, const std::string & b) { // length, in char elements, of the longest common prefix of a and b
+ size_t i;
+ for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} // empty body: the loop condition itself advances past matching chars
+
+ return i; // index of the first mismatch, or the shorter string's size if one is a prefix of the other
+}
+
static bool ends_with(const std::string & str, const std::string & suffix) {
return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
}