author     slaren <slarengh@gmail.com>      2024-03-13 18:54:21 +0100
committer  GitHub <noreply@github.com>      2024-03-13 18:54:21 +0100
commit     f30ea47a87ed4446ad55adb265755dc9102956a2 (patch)
tree       fc885962ca3d537cfdfbd6b4a2820b7c864b1ee0 /examples/server
parent     d8fd0ccf6ac8b07791ffd1575eed436930854ae3 (diff)
llama : add pipeline parallelism support (#6017)
* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs (ggml-ci)
* server : add -ub, --ubatch-size parameter
* fix server embedding test
* llama : fix Mamba inference for pipeline parallelism
  (tested to work correctly with both `main` and `parallel` examples)
* llama : limit max batch size to n_batch
* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism;
  default increased to 4 (from 2); changing this value may improve performance for some systems, but increases memory usage
* fix hip build
* fix sycl build (disable cpy_tensor_async)
* fix hip build
* llama : limit n_batch and n_ubatch to n_ctx during context creation
* llama : fix norm backend
* batched-bench : sync after decode
* swiftui : sync after decode
* ggml : allow ggml_get_rows to use multiple threads if they are available
* check n_ubatch >= n_tokens with non-causal attention
* llama : do not limit n_batch to n_ctx with non-causal attn
* server : construct batch with size of llama_n_batch
* ggml_backend_cpu_graph_compute : fix return value when alloc fails
* llama : better n_batch and n_ubatch comment
* fix merge
* small fix
* reduce default n_batch to 2048

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
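As a quick orientation for the change, here is a minimal sketch (not part of this commit) of how a llama.cpp client of this period might set the logical batch size (n_batch) and the new physical micro-batch size (n_ubatch), then read back the effective values. The field and function names are taken from llama.h around this PR (llama_context_params::n_ubatch, llama_n_ubatch) and should be treated as assumptions rather than a definitive API reference.

    // hedged sketch, not the server code: configure logical vs physical batch sizes
    #include <cstdio>
    #include "llama.h"

    int main(int argc, char ** argv) {
        if (argc < 2) return 1;

        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file(argv[1], mparams);

        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx    = 2048;
        cparams.n_batch  = 2048; // logical maximum: tokens submitted per llama_decode call
        cparams.n_ubatch = 512;  // physical maximum: micro-batch actually run on the backend(s)

        llama_context * ctx = llama_new_context_with_model(model, cparams);

        // the effective values may have been clamped during context creation (see the notes above)
        const int32_t n_batch  = llama_n_batch(ctx);
        const int32_t n_ubatch = llama_n_ubatch(ctx);
        printf("n_batch = %d, n_ubatch = %d\n", n_batch, n_ubatch);

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }

Conceptually, the logical batch is what callers hand to llama_decode at once, while the physical micro-batch is what the backends execute in one step; pipeline parallelism keeps several such micro-batches in flight across GPUs, which is why the two sizes are now configured separately.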
Diffstat (limited to 'examples/server')
-rw-r--r--  examples/server/server.cpp                          | 32
-rw-r--r--  examples/server/tests/features/embeddings.feature   |  1
-rw-r--r--  examples/server/tests/features/steps/steps.py       |  8
3 files changed, 33 insertions(+), 8 deletions(-)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3172d96d..895d608f 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -147,7 +147,7 @@ struct server_slot {
int32_t n_decoded = 0;
int32_t n_remaining = -1;
int32_t i_batch = -1;
- int32_t n_predict = -1;
+ int32_t n_predict = -1; // TODO: disambiguate from params.n_predict
int32_t n_prompt_tokens = 0;
int32_t n_prompt_tokens_processed = 0;
@@ -739,7 +739,13 @@ struct server_context {
default_generation_settings_for_props = get_formated_generation(slots.front());
default_generation_settings_for_props["seed"] = -1;
- batch = llama_batch_init(n_ctx, 0, params.n_parallel);
+ // the update_slots() logic will always submit a maximum of n_batch tokens
+ // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
+ {
+ const int32_t n_batch = llama_n_batch(ctx);
+
+ batch = llama_batch_init(n_batch, 0, params.n_parallel);
+ }
metrics.init();
}
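A short hedged sketch of the allocation pattern this hunk adopts, i.e. sizing the reusable server batch by the effective llama_n_batch() rather than by n_ctx. llama_batch_init/llama_batch_free come from llama.h and llama_batch_add is the common-library helper also used in the next hunk; treat the helper name and parameters below as illustrative placeholders.

    #include "llama.h"

    // hedged sketch: allocate one reusable batch sized for the logical batch limit
    static llama_batch make_server_batch(llama_context * ctx, int32_t n_parallel) {
        const int32_t n_batch = llama_n_batch(ctx);
        // embd = 0: the batch carries token ids, not embeddings
        // n_seq_max = one sequence id slot per parallel client
        return llama_batch_init(n_batch, /*embd*/ 0, /*n_seq_max*/ n_parallel);
    }
    // tokens are then appended with llama_batch_add(batch, token, pos, { seq_id }, need_logits)
    // and the batch is released with llama_batch_free(batch) on shutdown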
@@ -1036,8 +1042,10 @@ struct server_context {
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
}
- for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch) {
- const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i));
+ const int32_t n_batch = llama_n_batch(ctx);
+
+ for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
+ const int32_t n_tokens = std::min(params.n_batch, batch.n_tokens - i);
llama_batch batch_view = {
n_tokens,
batch.token + i,
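The chunked decode that the hunk above switches to can be sketched in isolation as follows; this is a hedged illustration assuming the llama_batch field order of this period (the same order the view above is built with), not the literal server code.

    #include <algorithm>
    #include "llama.h"

    // hedged sketch: decode an already-populated batch in views of at most n_batch tokens
    static int decode_in_chunks(llama_context * ctx, const llama_batch & batch) {
        const int32_t n_batch = llama_n_batch(ctx);

        for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
            const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);

            // a view into the original batch, no copying
            llama_batch batch_view = {
                n_tokens,
                batch.token    + i,
                nullptr,                // embd
                batch.pos      + i,
                batch.n_seq_id + i,
                batch.seq_id   + i,
                batch.logits   + i,
                0, 0, 0,                // all_pos_0, all_pos_1, all_seq_id (unused here)
            };

            if (llama_decode(ctx, batch_view) != 0) {
                return 1; // decode failed or no KV slot was found
            }
        }
        return 0;
    }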
@@ -1226,7 +1234,7 @@ struct server_context {
{"mirostat_eta", slot.sparams.mirostat_eta},
{"penalize_nl", slot.sparams.penalize_nl},
{"stop", slot.params.antiprompt},
- {"n_predict", slot.params.n_predict},
+ {"n_predict", slot.params.n_predict}, // TODO: fix duplicate key n_predict
{"n_keep", params.n_keep},
{"ignore_eos", ignore_eos},
{"stream", slot.params.stream},
@@ -1738,7 +1746,8 @@ struct server_context {
}
// process in chunks of params.n_batch
- int32_t n_batch = params.n_batch;
+ int32_t n_batch = llama_n_batch(ctx);
+ int32_t n_ubatch = llama_n_ubatch(ctx);
// next, batch any pending prompts without exceeding n_batch
if (params.cont_batching || batch.n_tokens == 0) {
@@ -1811,7 +1820,7 @@ struct server_context {
if (slot.embedding) {
// this prompt is too large to process - discard it
- if (slot.n_prompt_tokens > n_batch) {
+ if (slot.n_prompt_tokens > n_ubatch) {
slot.state = SLOT_STATE_PROCESSING;
slot.command = SLOT_COMMAND_NONE;
slot.release();
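The move from n_batch to n_ubatch in this check follows the commit notes: a non-causal embedding prompt must be evaluated in a single physical micro-batch, so the physical limit is the one that matters. A minimal hedged sketch of the same guard, with a hypothetical helper name:

    #include "llama.h"

    // hedged sketch: an embedding prompt must fit in one physical micro-batch
    static bool embedding_prompt_fits(llama_context * ctx, int32_t n_prompt_tokens) {
        return n_prompt_tokens <= (int32_t) llama_n_ubatch(ctx);
    }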
@@ -2157,7 +2166,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
printf(" --pooling {none,mean,cls} pooling type for embeddings, use model default if unspecified\n");
printf(" -dt N, --defrag-thold N\n");
printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
- printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
+ printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch);
+ printf(" -ub N, --ubatch-size N physical maximum batch size (default: %d)\n", params.n_ubatch);
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
if (llama_supports_mlock()) {
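The updated help text separates the logical (-b) and physical (-ub) limits. Per the commit notes, n_batch is capped at n_ctx only for causal-attention models, and n_ubatch is always capped at n_batch; the sketch below restates that clamping with a hypothetical helper, not the actual context-creation code.

    #include <algorithm>
    #include <cstdint>

    // hedged sketch of the clamping described in the commit notes
    static void clamp_batch_sizes(uint32_t & n_batch, uint32_t & n_ubatch, uint32_t n_ctx, bool causal_attn) {
        // only causal models are bound by the KV cache; non-causal ones (e.g. BERT) may exceed n_ctx
        if (causal_attn) {
            n_batch = std::min(n_batch, n_ctx);
        }
        // the physical micro-batch can never exceed the logical batch
        n_ubatch = std::min(n_ubatch, n_batch);
    }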
@@ -2424,6 +2434,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
break;
}
params.n_batch = std::stoi(argv[i]);
+ } else if (arg == "-ub" || arg == "--ubatch-size") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.n_ubatch = std::stoi(argv[i]);
} else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature
index b47661e9..57359b26 100644
--- a/examples/server/tests/features/embeddings.feature
+++ b/examples/server/tests/features/embeddings.feature
@@ -9,6 +9,7 @@ Feature: llama.cpp server
And 42 as server seed
And 2 slots
And 1024 as batch size
+ And 1024 as ubatch size
And 2048 KV cache size
And embeddings extraction
Then the server is starting
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 98c2b617..cfa9f96e 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -33,6 +33,7 @@ def step_server_config(context, server_fqdn, server_port):
context.model_alias = None
context.n_batch = None
+ context.n_ubatch = None
context.n_ctx = None
context.n_ga = None
context.n_ga_w = None
@@ -278,6 +279,11 @@ def step_n_batch(context, n_batch):
context.n_batch = n_batch
+@step('{n_ubatch:d} as ubatch size')
+def step_n_ubatch(context, n_ubatch):
+ context.n_ubatch = n_ubatch
+
+
@step('{seed:d} as seed')
def step_seed(context, seed):
context.seed = seed
@@ -1029,6 +1035,8 @@ def start_server_background(context):
]
if context.n_batch:
server_args.extend(['--batch-size', context.n_batch])
+ if context.n_ubatch:
+ server_args.extend(['--ubatch-size', context.n_ubatch])
if context.n_gpu_layer:
server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
if context.server_continuous_batching: