author     slaren <slarengh@gmail.com>  2024-03-13 18:54:21 +0100
committer  GitHub <noreply@github.com>  2024-03-13 18:54:21 +0100
commit     f30ea47a87ed4446ad55adb265755dc9102956a2 (patch)
tree       fc885962ca3d537cfdfbd6b4a2820b7c864b1ee0 /examples/server/tests/features/steps/steps.py
parent     d8fd0ccf6ac8b07791ffd1575eed436930854ae3 (diff)
llama : add pipeline parallelism support (#6017)
* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs
  ggml-ci
* server : add -ub, --ubatch-size parameter
* fix server embedding test
* llama : fix Mamba inference for pipeline parallelism
  Tested to work correctly with both `main` and `parallel` examples.
* llama : limit max batch size to n_batch
* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
  default increased to 4 (from 2)
  changing this value may improve performance for some systems, but increases memory usage
* fix hip build
* fix sycl build (disable cpy_tensor_async)
* fix hip build
* llama : limit n_batch and n_ubatch to n_ctx during context creation
* llama : fix norm backend
* batched-bench : sync after decode
* swiftui : sync after decode
* ggml : allow ggml_get_rows to use multiple threads if they are available
* check n_ubatch >= n_tokens with non-causal attention
* llama : do not limit n_batch to n_ctx with non-causal attn
* server : construct batch with size of llama_n_batch
* ggml_backend_cpu_graph_compute : fix return value when alloc fails
* llama : better n_batch and n_ubatch comment
* fix merge
* small fix
* reduce default n_batch to 2048
---------
Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
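
For context on the new server flag: the test changes below exercise it by passing extra arguments when spawning the server binary. A minimal Python sketch of such an invocation, with placeholder binary and model paths that are not taken from this patch:

import subprocess

# Sketch only: start the server with a logical batch size (--batch-size / -b)
# and the new physical micro-batch size (--ubatch-size / -ub) added by this commit.
server = subprocess.Popen([
    './server',                 # placeholder path to the server binary
    '--model', 'model.gguf',    # placeholder model path
    '--batch-size', '2048',     # logical batch size (new default per the commit message)
    '--ubatch-size', '512',     # physical micro-batch size introduced by this change
])
# ... run requests against the server, then shut it down
server.terminate()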
Diffstat (limited to 'examples/server/tests/features/steps/steps.py')
-rw-r--r--  examples/server/tests/features/steps/steps.py  8
1 file changed, 8 insertions, 0 deletions
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 98c2b617..cfa9f96e 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -33,6 +33,7 @@ def step_server_config(context, server_fqdn, server_port):
     context.model_alias = None
     context.n_batch = None
+    context.n_ubatch = None
     context.n_ctx = None
     context.n_ga = None
     context.n_ga_w = None
@@ -278,6 +279,11 @@ def step_n_batch(context, n_batch):
     context.n_batch = n_batch
 
 
+@step('{n_ubatch:d} as ubatch size')
+def step_n_ubatch(context, n_ubatch):
+    context.n_ubatch = n_ubatch
+
+
 @step('{seed:d} as seed')
 def step_seed(context, seed):
     context.seed = seed
@@ -1029,6 +1035,8 @@ def start_server_background(context):
     ]
     if context.n_batch:
         server_args.extend(['--batch-size', context.n_batch])
+    if context.n_ubatch:
+        server_args.extend(['--ubatch-size', context.n_ubatch])
    if context.n_gpu_layer:
        server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
    if context.server_continuous_batching:
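
As a usage note, the added step definition lets a scenario set the micro-batch size with a Gherkin line such as "And 64 as ubatch size" (illustrative, not taken from this patch). A hedged sketch of that wiring, assuming the behave harness used by steps.py:

from behave import step

# Same pattern as the step added by the patch: behave parses the integer
# from the step text and stores it on the shared context object.
@step('{n_ubatch:d} as ubatch size')
def step_n_ubatch(context, n_ubatch):
    context.n_ubatch = n_ubatch

# Hypothetical helper mirroring the start_server_background logic above:
# the flag is only appended when a scenario actually set a value.
def ubatch_args(context):
    return ['--ubatch-size', str(context.n_ubatch)] if context.n_ubatch else []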