author    Johannes Gäßler <johannesg@5d6.de>  2024-04-24 11:08:36 +0200
committer GitHub <noreply@github.com>         2024-04-24 11:08:36 +0200
commit    28103f4832e301a9c84d44ff0df9d75d46ab6c76 (patch)
tree      8ba391e3a7e0ce9a20d4b41782ef133bd7e32738 /examples/server/tests/features
parent    c0d1b3e03e27634ac2871761f5033cf9324d472d (diff)
Server: fix seed for multiple slots (#6835)
* Server: add tests for consistent results
* sampling: separate rng per sampling context
Diffstat (limited to 'examples/server/tests/features')
-rw-r--r--  examples/server/tests/features/results.feature      57
-rw-r--r--  examples/server/tests/features/steps/steps.py        34
2 files changed, 91 insertions, 0 deletions
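The commit message points at the underlying problem: if every slot samples from one shared RNG, the tokens a request draws depend on how many other requests happen to be interleaved with it, so a fixed seed no longer pins down the completion. Giving each sampling context its own RNG, seeded per request, restores reproducibility. A minimal Python sketch of the idea follows; the class and method names are invented for illustration, and the actual change lives in the C++ sampling code rather than in these tests.

```python
import random


class SamplingContext:
    """Stand-in for one server slot's sampling state: it owns its own RNG
    instead of drawing from a process-wide one (hypothetical names, for
    illustration only)."""

    def __init__(self, seed: int):
        self.rng = random.Random(seed)  # separate rng per sampling context

    def sample(self, probs: list[float]) -> int:
        # Draw one token index from the probability distribution.
        return self.rng.choices(range(len(probs)), weights=probs, k=1)[0]


# With a single shared RNG, a slot's draws would be interleaved with every
# other busy slot's draws, so the same seed could yield different tokens.
# With one RNG per context, two requests with seed 42 sample identically
# no matter what the other slots are doing.
probs = [0.1, 0.2, 0.3, 0.4]
a, b = SamplingContext(42), SamplingContext(42)
assert [a.sample(probs) for _ in range(8)] == [b.sample(probs) for _ in range(8)]
```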
diff --git a/examples/server/tests/features/results.feature b/examples/server/tests/features/results.feature
new file mode 100644
index 00000000..f17120f7
--- /dev/null
+++ b/examples/server/tests/features/results.feature
@@ -0,0 +1,57 @@
+@llama.cpp
+@results
+Feature: Results
+
+ Background: Server startup
+ Given a server listening on localhost:8080
+ And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
+ And a model file test-model-00001-of-00003.gguf
+ And 128 as batch size
+ And 256 KV cache size
+ And 128 max tokens to predict
+
+ Scenario Outline: Multi users completion
+ Given <n_slots> slots
+ And continuous batching
+ Then the server is starting
+ Then the server is healthy
+
+ Given 42 as seed
+ And a prompt:
+ """
+ Write a very long story about AI.
+ """
+
+ Given 42 as seed
+ And a prompt:
+ """
+ Write a very long story about AI.
+ """
+
+ Given 42 as seed
+ And a prompt:
+ """
+ Write a very long story about AI.
+ """
+
+ Given 42 as seed
+ And a prompt:
+ """
+ Write a very long story about AI.
+ """
+
+ Given 42 as seed
+ And a prompt:
+ """
+ Write a very long story about AI.
+ """
+
+ Given concurrent completion requests
+ Then the server is busy
+ Then the server is idle
+ And all slots are idle
+ Then all predictions are equal
+ Examples:
+ | n_slots |
+ | 1 |
+ | 2 |
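The scenario fires the same prompt with the same seed from several slots at once and requires identical outputs. Outside the behave harness, the same property can be sketched directly against a running server; this is only an illustration, assuming a server listening on localhost:8080 (as in the Background) and the /completion endpoint's prompt, seed and n_predict request fields.

```python
import asyncio

import aiohttp

PROMPT = "Write a very long story about AI."


async def completion(session: aiohttp.ClientSession, seed: int) -> str:
    # One /completion request with a fixed seed; n_predict mirrors the
    # "128 max tokens to predict" line from the Background above.
    async with session.post("http://localhost:8080/completion",
                            json={"prompt": PROMPT,
                                  "seed": seed,
                                  "n_predict": 128}) as resp:
        return (await resp.json())["content"]


async def main() -> None:
    async with aiohttp.ClientSession() as session:
        # Fire several identical requests at once so multiple slots are busy.
        contents = await asyncio.gather(*(completion(session, 42) for _ in range(4)))
    assert all(c == contents[0] for c in contents), "completions differ"


asyncio.run(main())
```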
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index ca400efa..f71e0d70 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -61,6 +61,7 @@ def step_server_config(context, server_fqdn, server_port):
context.server_metrics = False
context.server_process = None
context.seed = None
+ context.draft = None
context.server_seed = None
context.user_api_key = None
context.response_format = None
@@ -107,6 +108,11 @@ def step_n_gpu_layer(context, ngl):
context.n_gpu_layer = ngl
+@step('{draft:d} as draft')
+def step_draft(context, draft):
+ context.draft = draft
+
+
@step('{n_ctx:d} KV cache size')
def step_n_ctx(context, n_ctx):
context.n_ctx = n_ctx
@@ -254,6 +260,15 @@ def step_n_tokens_predicted(context, predicted_n):
assert_n_tokens_predicted(context.completion, predicted_n)
+@step('all predictions are equal')
+@async_run_until_complete
+async def step_predictions_equal(context):
+ n_completions = await gather_tasks_results(context)
+ assert n_completions >= 2, "need at least 2 completions"
+ assert_all_predictions_equal(context.tasks_result)
+ context.tasks_result = []
+
+
@step('the completion is truncated')
def step_assert_completion_truncated(context):
step_assert_completion_truncated(context, '')
@@ -1020,6 +1035,23 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
f' {n_predicted} <> {expected_predicted_n}')
+def assert_all_predictions_equal(completion_responses):
+    content_0 = completion_responses[0]['content']
+
+    if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
+        print(f"content 0: {content_0}")
+
+    i = 1
+    for response in completion_responses[1:]:
+        content = response['content']
+
+        if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
+            print(f"content {i}: {content}")
+
+        assert content == content_0, "contents not equal"
+
+        i += 1
+
async def gather_tasks_results(context):
n_tasks = len(context.concurrent_tasks)
@@ -1148,6 +1180,8 @@ def start_server_background(context):
server_args.extend(['--ubatch-size', context.n_ubatch])
if context.n_gpu_layer:
    server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
+if context.draft is not None:
+    server_args.extend(['--draft', context.draft])
if context.server_continuous_batching:
    server_args.append('--cont-batching')
if context.server_embeddings:
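As a usage note: these step definitions are behave steps, so the new scenario can typically be run on its own from examples/server/tests by filtering on its @results tag with behave's --tags option, and exporting DEBUG=ON makes assert_all_predictions_equal print every completion's content, which is the quickest way to see where two slots diverged when the equality assertion fails.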