From 28103f4832e301a9c84d44ff0df9d75d46ab6c76 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Wed, 24 Apr 2024 11:08:36 +0200
Subject: Server: fix seed for multiple slots (#6835)

* Server: add tests for consistent results

* sampling: separate rng per sampling context
---
 examples/server/tests/features/results.feature | 57 ++++++++++++++++++++++++++
 examples/server/tests/features/steps/steps.py  | 34 +++++++++++++++
 2 files changed, 91 insertions(+)
 create mode 100644 examples/server/tests/features/results.feature

(limited to 'examples/server/tests')

diff --git a/examples/server/tests/features/results.feature b/examples/server/tests/features/results.feature
new file mode 100644
index 00000000..f17120f7
--- /dev/null
+++ b/examples/server/tests/features/results.feature
@@ -0,0 +1,57 @@
+@llama.cpp
+@results
+Feature: Results
+
+  Background: Server startup
+    Given a server listening on localhost:8080
+    And   a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
+    And   a model file test-model-00001-of-00003.gguf
+    And   128 as batch size
+    And   256 KV cache size
+    And   128 max tokens to predict
+
+  Scenario Outline: Multi users completion
+    Given <n_slots> slots
+    And   continuous batching
+    Then  the server is starting
+    Then  the server is healthy
+
+    Given 42 as seed
+    And a prompt:
+      """
+      Write a very long story about AI.
+      """
+
+    Given 42 as seed
+    And a prompt:
+      """
+      Write a very long story about AI.
+      """
+
+    Given 42 as seed
+    And a prompt:
+      """
+      Write a very long story about AI.
+      """
+
+    Given 42 as seed
+    And a prompt:
+      """
+      Write a very long story about AI.
+      """
+
+    Given 42 as seed
+    And a prompt:
+      """
+      Write a very long story about AI.
+      """
+
+    Given concurrent completion requests
+    Then the server is busy
+    Then the server is idle
+    And  all slots are idle
+    Then all predictions are equal
+    Examples:
+      | n_slots |
+      | 1       |
+      | 2       |
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index ca400efa..f71e0d70 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -61,6 +61,7 @@ def step_server_config(context, server_fqdn, server_port):
     context.server_metrics = False
     context.server_process = None
     context.seed = None
+    context.draft = None
     context.server_seed = None
     context.user_api_key = None
     context.response_format = None
@@ -107,6 +108,11 @@ def step_n_gpu_layer(context, ngl):
     context.n_gpu_layer = ngl
 
 
+@step('{draft:d} as draft')
+def step_draft(context, draft):
+    context.draft = draft
+
+
 @step('{n_ctx:d} KV cache size')
 def step_n_ctx(context, n_ctx):
     context.n_ctx = n_ctx
@@ -254,6 +260,15 @@ def step_n_tokens_predicted(context, predicted_n):
     assert_n_tokens_predicted(context.completion, predicted_n)
 
 
+@step('all predictions are equal')
+@async_run_until_complete
+async def step_predictions_equal(context):
+    n_completions = await gather_tasks_results(context)
+    assert n_completions >= 2, "need at least 2 completions"
+    assert_all_predictions_equal(context.tasks_result)
+    context.tasks_result = []
+
+
 @step('the completion is truncated')
 def step_assert_completion_truncated(context):
     step_assert_completion_truncated(context, '')
@@ -1020,6 +1035,23 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
     assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
                                                  f' {n_predicted} <> {expected_predicted_n}')
 
 
+def assert_all_predictions_equal(completion_responses):
+    content_0 = completion_responses[0]['content']
+
+    if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
+        print(f"content 0: {content_0}")
+
+    i = 1
+    for response in completion_responses[1:]:
+        content = response['content']
+
+        if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
+            print(f"content {i}: {content}")
+
+        assert content == content_0, "contents not equal"
+
+        i += 1
+
 async def gather_tasks_results(context):
     n_tasks = len(context.concurrent_tasks)
@@ -1148,6 +1180,8 @@ def start_server_background(context):
         server_args.extend(['--ubatch-size', context.n_ubatch])
     if context.n_gpu_layer:
         server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
+    if context.draft is not None:
+        server_args.extend(['--draft', context.draft])
     if context.server_continuous_batching:
         server_args.append('--cont-batching')
     if context.server_embeddings:
-- 
cgit v1.2.3
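
Note: the server-side half of this commit is not visible above, because the cgit
view is limited to 'examples/server/tests'. Per the subject line, that half gives
each sampling context a separate rng instead of one shared generator, so two
slots handed the same seed draw the same random sequence regardless of how their
requests interleave. A minimal Python sketch of the idea, with hypothetical
names (the real change lives in llama.cpp's C++ sampling code):

    # Hypothetical sketch, not llama.cpp's actual API: each sampling context
    # owns its own RNG, seeded per request, rather than sharing a global one.
    import random

    class SamplingContext:
        def __init__(self, seed):
            # separate rng per sampling context
            self.rng = random.Random(seed)

        def sample(self, n_vocab):
            # draw the next token id from this context's private RNG
            return self.rng.randrange(n_vocab)

    # Two slots seeded with 42 now produce identical draws no matter how
    # their sampling calls interleave under continuous batching, which is
    # what the 'all predictions are equal' step asserts end to end.
    slot_a = SamplingContext(42)
    slot_b = SamplingContext(42)
    assert [slot_a.sample(32000) for _ in range(8)] == \
           [slot_b.sample(32000) for _ in range(8)]

Since the new scenario carries the @results tag, it should be selectable on its
own via behave's standard tag filter (e.g. behave --tags=results) from the
examples/server/tests directory, with DEBUG=ON enabling the print statements in
assert_all_predictions_equal.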