author    Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>  2023-11-16 19:14:37 -0700
committer GitHub <noreply@github.com>  2023-11-16 19:14:37 -0700
commit    91f6499393d2d999331fbfdba47a7f8b9f913f0d (patch)
tree      27caf3ad0b9cec979bb5ed3317b5334bdcd9470c /examples/server
parent    8da46278e1a57107591653275f8e03a281de94f0 (diff)
Respect tokenizer.ggml.add_bos_token value when tokenizing (#4040)
* gguf-py: gguf-dump: Respect --no-tensor flag in JSON mode.
* Respect add_bos_token GGUF metadata value
* gguf-py: Try to fix SpecialVocab giving up too easily for the Nth time
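The change in a nutshell: instead of hard-coding true as the add-BOS flag, the server asks the model once at load time and reuses that answer for every tokenization call. Below is a minimal sketch of the pattern, using only the llama_should_add_bos_token() and ::llama_tokenize() calls that appear in the diff; names such as slot_prompt are placeholders for illustration, not the server's actual variables.

    // Sketch only: read the model's BOS preference once after loading,
    // then thread it through every tokenization call instead of passing true.
    bool add_bos_token = llama_should_add_bos_token(model); // reflects tokenizer.ggml.add_bos_token

    // System prompt: add BOS only if the model asks for it.
    std::vector<llama_token> system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);

    // Per-slot prompt: add BOS only when there is no system prompt AND the model wants one.
    std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, slot_prompt, system_prompt.empty() && add_bos_token);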
Diffstat (limited to 'examples/server')
-rw-r--r--  examples/server/server.cpp  9
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 46862a84..bb87b532 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -501,6 +501,7 @@ struct llama_server_context
bool multimodal = false;
bool clean_kv_cache = true;
bool all_slots_are_idle = false;
+ bool add_bos_token = true;
int32_t id_gen;
int32_t n_ctx; // total context for all clients / slots
@@ -573,6 +574,8 @@ struct llama_server_context
n_ctx = llama_n_ctx(ctx);
+ add_bos_token = llama_should_add_bos_token(model);
+
return true;
}
@@ -864,7 +867,7 @@ struct llama_server_context
}
void update_system_prompt() {
- system_tokens = ::llama_tokenize(ctx, system_prompt, true);
+ system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
llama_batch_clear(batch);
@@ -1552,7 +1555,7 @@ struct llama_server_context
}
else
{
- prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
+ prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
}
slot.num_prompt_tokens = prompt_tokens.size();
@@ -1629,7 +1632,7 @@ struct llama_server_context
const bool has_images = process_images(slot);
// process the prefix of first image
- std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, true) : prompt_tokens;
+ std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
{
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot.n_past, { slot.id }, false);