author:    Johannes Gäßler <johannesg@5d6.de>    2024-05-19 16:26:02 +0200
committer: GitHub <noreply@github.com>           2024-05-19 16:26:02 +0200
commit:    1b01f06db0cff5f5f600bb754fc39fde565ed56a (patch)
tree:      22b3cf37c4527ec24f5e2b94fb2087056b5966c2 /examples/server/tests/features
parent:    41858392e17abead21735309bf17cb55183d8c31 (diff)
server: add test for token probs (#7347)
Diffstat (limited to 'examples/server/tests/features')
-rw-r--r--  examples/server/tests/features/results.feature  | 52
-rw-r--r--  examples/server/tests/features/steps/steps.py    | 37
2 files changed, 80 insertions, 9 deletions
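
In plain terms, the new scenario sends the same seeded prompt several times and requires the returned per-token probabilities to be identical across requests. The sketch below is a minimal standalone illustration of that check, not part of the test suite; the /completion endpoint and the n_probs, seed, and completion_probabilities fields are taken from the diff and the server's request format, while the host/port, prompt, and use of the requests library are placeholders.

# Illustrative sketch: issue the same seeded request twice and compare the
# per-position probability lists, mirroring the new behave step
# "all token probabilities are equal".
import requests

SERVER = "http://localhost:8080"  # assumed server address; adjust to your setup

def get_probs(prompt: str, seed: int):
    payload = {
        "prompt": prompt,
        "seed": seed,
        "temperature": 1.0,
        "n_predict": 1,
        "n_probs": 2,   # ask for the top-2 token probabilities per position
    }
    response = requests.post(f"{SERVER}/completion", json=payload, timeout=3600)
    response.raise_for_status()
    return [tok["probs"] for tok in response.json()["completion_probabilities"]]

first = get_probs("The meaning of life is", seed=42)
second = get_probs("The meaning of life is", seed=42)
assert first == second, "probs not equal"
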
diff --git a/examples/server/tests/features/results.feature b/examples/server/tests/features/results.feature
index aa0b8d0c..5deb278c 100644
--- a/examples/server/tests/features/results.feature
+++ b/examples/server/tests/features/results.feature
@@ -70,12 +70,48 @@ Feature: Results
Then all predictions are equal
Examples:
| n_parallel | temp |
- | 1 | 0.0 |
- | 2 | 0.0 |
- | 4 | 0.0 |
- | 1 | 1.0 |
- # FIXME: These tests fail on master. The problem seems to be the unified KV cache.
+ | 1 | 0.0 |
+ | 2 | 0.0 |
+ | 4 | 0.0 |
+ | 1 | 1.0 |
+ # FIXME: These tests fail on master.
+ # Problems: unified KV cache (except for CPU backend with LLAMA_NO_LLAMAFILE=1), SIMD nondeterminism.
# See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
- # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574 .
- # | 2 | 1.0 |
- # | 4 | 1.0 |
+ # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
+ # and https://github.com/ggerganov/llama.cpp/pull/7347 .
+ # | 2 | 1.0 |
+ # | 4 | 1.0 |
+
+ Scenario Outline: consistent token probs with same seed and prompt
+ Given <n_slots> slots
+ And <n_kv> KV cache size
+ And 1.0 temperature
+ And <n_predict> max tokens to predict
+ Then the server is starting
+ Then the server is healthy
+
+ Given 1 prompts "The meaning of life is" with seed 42
+ And concurrent completion requests
+ # Then the server is busy # Not all slots will be utilized.
+ Then the server is idle
+ And all slots are idle
+
+ Given <n_parallel> prompts "The meaning of life is" with seed 42
+ And concurrent completion requests
+ # Then the server is busy # Not all slots will be utilized.
+ Then the server is idle
+ And all slots are idle
+
+ Then all token probabilities are equal
+ Examples:
+ | n_slots | n_kv | n_predict | n_parallel |
+ | 4 | 1024 | 1 | 1 |
+ | 4 | 1024 | 1 | 4 |
+ # FIXME: These tests fail on master.
+ # Problems: unified KV cache (except for CPU backend with LLAMA_NO_LLAMAFILE=1), SIMD nondeterminism.
+ # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
+ # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
+ # and https://github.com/ggerganov/llama.cpp/pull/7347 .
+ # | 4 | 1024 | 100 | 1 |
+ # This test still fails even with the above patches; the first token probabilities are already different.
+ # | 4 | 1024 | 100 | 4 |
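
The "all token probabilities are equal" step introduced above compares the completion_probabilities field of each response, position by position. For orientation, this is roughly the shape of a single response the step consumes; the values are invented and the field names inside each probs entry are assumed from the server's output format at the time, only the structure matters.

# Assumed shape of one completion response with n_probs=2 and n_predict=1.
example_response = {
    "content": " to",
    "completion_probabilities": [
        {
            "content": " to",
            "probs": [
                {"tok_str": " to", "prob": 0.57},
                {"tok_str": " a", "prob": 0.12},
            ],
        },
    ],
}
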
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 577b87af..7da503f2 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -23,6 +23,7 @@ from prometheus_client import parser
def step_server_config(context, server_fqdn, server_port):
context.server_fqdn = server_fqdn
context.server_port = int(server_port)
+ context.n_threads = None
context.n_gpu_layer = None
if 'PORT' in os.environ:
context.server_port = int(os.environ['PORT'])
@@ -109,6 +110,11 @@ def step_n_gpu_layer(context, ngl):
context.n_gpu_layer = ngl
+@step('{n_threads:d} threads')
+def step_n_threads(context, n_threads):
+ context.n_threads = n_threads
+
+
@step('{draft:d} as draft')
def step_draft(context, draft):
context.draft = draft
@@ -274,13 +280,22 @@ async def step_predictions_equal(context):
@step('all predictions are different')
@async_run_until_complete
-async def step_predictions_equal(context):
+async def step_predictions_different(context):
n_completions = await gather_tasks_results(context)
assert n_completions >= 2, "need at least 2 completions"
assert_all_predictions_different(context.tasks_result)
context.tasks_result = []
+@step('all token probabilities are equal')
+@async_run_until_complete
+async def step_token_probabilities_equal(context):
+ n_completions = await gather_tasks_results(context)
+ assert n_completions >= 2, "need at least 2 completions"
+ assert_all_token_probabilities_equal(context.tasks_result)
+ context.tasks_result = []
+
+
@step('the completion is truncated')
def step_assert_completion_truncated(context):
step_assert_completion_truncated(context, '')
@@ -869,6 +884,7 @@ async def request_completion(prompt,
"id_slot": id_slot,
"seed": seed if seed is not None else 42,
"temperature": temperature if temperature is not None else "0.8f",
+ "n_probs": 2,
},
headers=headers,
timeout=3600) as response:
@@ -1123,6 +1139,23 @@ def assert_all_predictions_different(completion_responses):
assert content_i != content_j, "contents not different"
+def assert_all_token_probabilities_equal(completion_responses):
+ n_predict = len(completion_responses[0]['completion_probabilities'])
+ if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
+ for pos in range(n_predict):
+ for i, response_i in enumerate(completion_responses):
+ probs_i = response_i['completion_probabilities'][pos]['probs']
+ print(f"pos {pos}, probs {i}: {probs_i}")
+ for pos in range(n_predict):
+ for i, response_i in enumerate(completion_responses):
+ probs_i = response_i['completion_probabilities'][pos]['probs']
+ for j, response_j in enumerate(completion_responses):
+ if i == j:
+ continue
+ probs_j = response_j['completion_probabilities'][pos]['probs']
+ assert probs_i == probs_j, "probs not equal"
+
+
async def gather_tasks_results(context):
n_tasks = len(context.concurrent_tasks)
if context.debug:
@@ -1261,6 +1294,8 @@ def start_server_background(context):
server_args.extend(['--batch-size', context.n_batch])
if context.n_ubatch:
server_args.extend(['--ubatch-size', context.n_ubatch])
+ if context.n_threads:
+ server_args.extend(['--threads', context.n_threads])
if context.n_gpu_layer:
server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
if context.draft is not None:
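
As a quick self-check of the comparison logic added in steps.py, the following standalone sketch re-creates assert_all_token_probabilities_equal and runs it against mocked responses in the shape shown earlier; it passes for identical probability lists and raises as soon as any position differs.

# Standalone self-check with mocked data; mirrors the helper added above.
def assert_all_token_probabilities_equal(completion_responses):
    n_predict = len(completion_responses[0]['completion_probabilities'])
    for pos in range(n_predict):
        for i, response_i in enumerate(completion_responses):
            probs_i = response_i['completion_probabilities'][pos]['probs']
            for j, response_j in enumerate(completion_responses):
                if i == j:
                    continue
                probs_j = response_j['completion_probabilities'][pos]['probs']
                assert probs_i == probs_j, "probs not equal"

same_a = {"completion_probabilities": [{"probs": [{"tok_str": " to", "prob": 0.57}]}]}
same_b = {"completion_probabilities": [{"probs": [{"tok_str": " to", "prob": 0.57}]}]}
different = {"completion_probabilities": [{"probs": [{"tok_str": " a", "prob": 0.40}]}]}

assert_all_token_probabilities_equal([same_a, same_b])        # identical probs: passes
try:
    assert_all_token_probabilities_equal([same_a, different])  # differing probs
except AssertionError:
    print("mismatch detected, as expected")
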