Diffstat (limited to 'examples')
 examples/batched-bench/batched-bench.cpp | 13
 examples/batched/batched.cpp             |  3
 examples/parallel/parallel.cpp           | 20
 examples/perplexity/perplexity.cpp       |  9
 examples/server/server.cpp               | 51
 5 files changed, 65 insertions(+), 31 deletions(-)
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 19aff18a..dff6c68e 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -105,6 +105,9 @@ int main(int argc, char ** argv) {
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+ // ensure enough sequences are available
+ ctx_params.n_parallel = *std::max_element(n_pl.begin(), n_pl.end());
+
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (ctx == NULL) {
@@ -174,10 +177,10 @@ int main(int argc, char ** argv) {
llama_batch_clear(batch);
- const int n_tokens = is_pp_shared ? pp : pl*pp;
-
- for (int i = 0; i < n_tokens; ++i) {
- llama_batch_add(batch, 0, i, { 0 }, false);
+ for (int i = 0; i < pp; ++i) {
+ for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
+ llama_batch_add(batch, 0, i, { j }, false);
+ }
}
batch.logits[batch.n_tokens - 1] = true;
@@ -192,7 +195,7 @@ int main(int argc, char ** argv) {
if (is_pp_shared) {
for (int32_t i = 1; i < pl; ++i) {
- llama_kv_cache_seq_cp(ctx, 0, i, 0, pp);
+ llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
}
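
The reordered loop above interleaves the prompt tokens by position across the pl sequences instead of emitting each sequence back to back. A minimal sketch of the same batch construction, using the llama_batch_init/llama_batch_clear/llama_batch_add helpers seen in the hunk (the function name and the dummy token id 0 are illustrative):

#include "common.h"
#include "llama.h"

// Sketch: build a prompt batch that interleaves pp prompt positions across
// pl parallel sequences, mirroring the loop order used in batched-bench.
// Token id 0 stands in for a real prompt token.
static llama_batch make_interleaved_prompt_batch(int pp, int pl) {
    llama_batch batch = llama_batch_init(pp*pl, 0, pl);

    llama_batch_clear(batch);
    for (int i = 0; i < pp; ++i) {        // prompt position
        for (int j = 0; j < pl; ++j) {    // sequence id
            llama_batch_add(batch, 0, i, { j }, false);
        }
    }

    // logits are only needed for the last token of the batch
    batch.logits[batch.n_tokens - 1] = true;

    return batch;
}
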
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index 9be7eb56..dde4d5a0 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -80,6 +80,7 @@ int main(int argc, char ** argv) {
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_req;
ctx_params.n_batch = std::max(n_len, n_parallel);
+ ctx_params.n_parallel = n_parallel;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -132,7 +133,7 @@ int main(int argc, char ** argv) {
// assign the system KV cache to all parallel sequences
// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
for (int32_t i = 1; i < n_parallel; ++i) {
- llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens);
+ llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
if (n_parallel > 1) {
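
Two changes matter here: the new n_parallel field in the context params reserves enough sequence ids up front, and passing -1, -1 to llama_kv_cache_seq_cp copies the whole of the source sequence rather than an explicit [p0, p1) range. A minimal sketch of that setup, assuming a model handle has already been loaded and using only the context-param fields that appear in the hunk above:

#include "llama.h"

// Sketch: create a context sized for n_parallel sequences and let every
// sequence reuse the shared prompt that was decoded into sequence 0.
static llama_context * make_parallel_context(llama_model * model, int n_parallel) {
    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.n_ctx      = 2048;        // example value
    ctx_params.n_batch    = 512;         // example value
    ctx_params.n_parallel = n_parallel;  // reserve enough sequence ids

    return llama_new_context_with_model(model, ctx_params);
}

// after decoding the shared prompt into sequence 0:
static void share_prompt(llama_context * ctx, int n_parallel) {
    for (int32_t i = 1; i < n_parallel; ++i) {
        // p0 = -1, p1 = -1: copy the entire sequence 0 into sequence i
        llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
    }
}
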
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 7d11fcd5..a2ef0fb0 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -107,6 +107,9 @@ int main(int argc, char ** argv) {
// number of simultaneous "clients" to simulate
const int32_t n_clients = params.n_parallel;
+ // dedicate one sequence to the system prompt
+ params.n_parallel += 1;
+
// requests to simulate
const int32_t n_seq = params.n_sequences;
@@ -196,8 +199,8 @@ int main(int argc, char ** argv) {
}
// assign the system KV cache to all parallel sequences
- for (int32_t i = 1; i < n_clients; ++i) {
- llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system);
+ for (int32_t i = 1; i <= n_clients; ++i) {
+ llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
LOG_TEE("\n");
@@ -221,15 +224,17 @@ int main(int argc, char ** argv) {
client.i_batch = batch.n_tokens;
- llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id }, true);
+ llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
client.n_decoded += 1;
}
if (batch.n_tokens == 0) {
// all sequences have ended - clear the entire KV cache
- for (int i = 0; i < n_clients; ++i) {
- llama_kv_cache_seq_rm(ctx, i, n_tokens_system, -1);
+ for (int i = 1; i <= n_clients; ++i) {
+ llama_kv_cache_seq_rm(ctx, i, -1, -1);
+ // but keep the system prompt
+ llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
LOG_TEE("%s: clearing the KV cache\n", __func__);
@@ -255,7 +260,7 @@ int main(int argc, char ** argv) {
tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
for (size_t i = 0; i < tokens_prompt.size(); ++i) {
- llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id }, false);
+ llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
}
// extract the logits only for the last token
@@ -366,7 +371,8 @@ int main(int argc, char ** argv) {
}
// delete only the generated part of the sequence, i.e. keep the system prompt in the cache
- llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, -1);
+ llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1);
+ llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);
const auto t_main_end = ggml_time_us();
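
Since sequence 0 is now dedicated to the system prompt, client i always addresses sequence i + 1, and "resetting" a client means wiping its sequence and then re-sharing the system prompt. A small helper sketching that convention (the function name is illustrative):

#include "llama.h"

// Sketch: reset one client's sequence while keeping the shared system prompt.
// Sequence 0 holds the system prompt; client i uses sequence i + 1.
static void reset_client_sequence(llama_context * ctx, int32_t client_id) {
    const llama_seq_id seq = client_id + 1;

    // drop everything this client has generated or cached ...
    llama_kv_cache_seq_rm(ctx, seq, -1, -1);

    // ... then copy the system prompt back in from sequence 0
    llama_kv_cache_seq_cp(ctx, 0, seq, -1, -1);
}
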
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 9ec98938..52789ee6 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -809,7 +809,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
const int n_batch = params.n_batch;
const int max_tasks_per_batch = 32;
- const int max_seq = 4*max_tasks_per_batch;
+ const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_max_seq(ctx));
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
@@ -1086,7 +1086,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
const int n_batch = params.n_batch;
const int max_tasks_per_batch = 128;
- const int max_seq = 2*max_tasks_per_batch;
+ const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_max_seq(ctx));
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
@@ -1438,7 +1438,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params) {
const int n_batch = params.n_batch;
const int max_tasks_per_batch = 32;
- const int max_seq = 4*max_tasks_per_batch;
+ const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_max_seq(ctx));
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
@@ -1815,6 +1815,9 @@ int main(int argc, char ** argv) {
llama_model * model;
llama_context * ctx;
+ // ensure there's at least enough seq_ids for HellaSwag
+ params.n_parallel = std::max(4, params.n_parallel);
+
// load the model and apply lora adapter, if any
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
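
The three scores now cap the number of sequences they request at what the context actually supports, instead of assuming dozens of sequence ids are always available. A sketch of the same guard, assuming llama_n_max_seq(ctx) reports the n_parallel value the context was created with, as the calls above suggest (the helper name is illustrative):

#include <algorithm>
#include <cstdio>

#include "llama.h"

// Sketch: clamp a desired sequence count to what the context can address,
// so llama_batch entries never carry an out-of-range seq_id.
static int clamp_n_seq(llama_context * ctx, int wanted) {
    const int max_seq = (int) llama_n_max_seq(ctx);
    if (wanted > max_seq) {
        fprintf(stderr, "warning: requested %d sequences, context supports %d - clamping\n",
                wanted, max_seq);
    }
    return std::min(wanted, max_seq);
}
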
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 109ff717..59a59d56 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -659,7 +659,11 @@ struct server_context {
bool load_model(const gpt_params & params_) {
params = params_;
+ // dedicate one sequence to the system prompt
+ params.n_parallel += 1;
+
std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ params.n_parallel -= 1; // but be sneaky about it
if (model == nullptr) {
LOG_ERROR("unable to load model", {{"model", params.model}});
return false;
@@ -1018,8 +1022,8 @@ struct server_context {
}
// assign the system KV cache to all parallel sequences
- for (int32_t i = 1; i < params.n_parallel; ++i) {
- llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
+ for (int32_t i = 1; i <= params.n_parallel; ++i) {
+ llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
}
@@ -1306,7 +1310,7 @@ struct server_context {
const int n_embd = llama_n_embd(model);
for (int i = 0; i < batch.n_tokens; ++i) {
- if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
+ if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
continue;
}
@@ -1633,8 +1637,8 @@ struct server_context {
{"n_cache_tokens", slot.cache_tokens.size()}
});
- llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
- llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
+ llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard);
+ llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
if (slot.params.cache_prompt) {
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@@ -1666,7 +1670,7 @@ struct server_context {
// TODO: we always have to take into account the "system_tokens"
// this is not great and needs to be improved somehow
- llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
+ llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
slot.n_past += 1;
@@ -1804,9 +1808,6 @@ struct server_context {
// reuse any previously computed tokens that are common with the new prompt
slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
- // remove the non-common part from the cache
- slot.cache_tokens.resize(slot.n_past);
-
// push the prompt into the sampling context (do not apply grammar)
for (int i = 0; i < slot.n_past; ++i) {
llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false);
@@ -1837,8 +1838,28 @@ struct server_context {
}
}
- const int p0 = (int) system_tokens.size() + slot.n_past;
- llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
+ // keep only the common part
+ int p0 = (int) system_tokens.size() + slot.n_past;
+ if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
+ // could not partially delete (likely using a non-Transformer model)
+ llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
+
+ p0 = (int) system_tokens.size();
+ if (p0 != 0) {
+ // copy over the system prompt when there is one
+ llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1);
+ }
+
+ // there is no common part left (except for the system prompt)
+ slot.n_past = 0;
+ slot.n_past_se = 0;
+ slot.ga_i = 0;
+ // TODO: is the system prompt ever in the sampling context?
+ llama_sampling_reset(slot.ctx_sampling);
+ }
+
+ // remove the non-common part from the cache
+ slot.cache_tokens.resize(slot.n_past);
LOG_INFO("kv cache rm [p0, end)", {
{ "id_slot", slot.id },
@@ -1863,7 +1884,7 @@ struct server_context {
}
}
- llama_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id }, false);
+ llama_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
if (slot.params.cache_prompt) {
slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
@@ -1937,9 +1958,9 @@ struct server_context {
LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
- llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
- llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
- llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);
+ llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd);
+ llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
+ llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);
slot.n_past_se -= bd;
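
All of the server hunks follow the same convention: the context is created with params.n_parallel + 1 sequences, sequence 0 carries the system prompt, and slot i always operates on sequence i + 1, including the self-extend shifts above. A condensed sketch of that bookkeeping (both helper names are illustrative):

#include "llama.h"

// Sketch: the slot-to-sequence mapping the server changes rely on.
// Sequence 0 is reserved for the system prompt; slot i maps to sequence i + 1.
static llama_seq_id slot_seq_id(int slot_id) {
    return slot_id + 1;
}

// share the (re)computed system prompt in sequence 0 with every slot's sequence
static void share_system_prompt(llama_context * ctx, int n_slots) {
    for (int i = 0; i < n_slots; ++i) {
        llama_kv_cache_seq_cp(ctx, 0, slot_seq_id(i), -1, -1);
    }
}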