author    Pierrick Hymbert <pierrick.hymbert@gmail.com>  2024-03-09 10:30:04 +0100
committer GitHub <noreply@github.com>                    2024-03-09 11:30:04 +0200
commit    fd72d2d2a5e79d61ccef6af3d15f16e5e5cbc352 (patch)
tree      3a8912adc90c34470fa1e3acb6dd861990159ec7 /examples/server
parent    c2101a2e909ac7c08976d414e64e96c90ee5fa9e (diff)
server: tests: add truncated prompt tests, better kv cache size (#5933)
* server: tests: add truncated prompt tests, better size

* server, tests : update regex

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'examples/server')
-rw-r--r--  examples/server/server.cpp                       | 23
-rw-r--r--  examples/server/tests/features/parallel.feature  |  5
-rw-r--r--  examples/server/tests/features/server.feature    | 41
-rw-r--r--  examples/server/tests/features/steps/steps.py    | 35
4 files changed, 81 insertions, 23 deletions
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 59a59d56..6f444998 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1128,6 +1128,7 @@ struct server_context {
LOG_VERBOSE("stopped by limit", {
{"id_slot", slot.id},
+ {"id_task", slot.id_task},
{"n_decoded", slot.n_decoded},
{"n_predict", slot.params.n_predict},
});
@@ -1141,6 +1142,8 @@ struct server_context {
}
LOG_VERBOSE("next token", {
+ {"id_slot", slot.id},
+ {"id_task", slot.id_task},
{"token", result.tok},
{"token_text", tokens_to_output_formatted_string(ctx, result.tok)},
{"has_next_token", slot.has_next_token},
@@ -1750,6 +1753,15 @@ struct server_context {
slot.n_past = 0;
slot.n_prompt_tokens = prompt_tokens.size();
+ LOG_VERBOSE("prompt tokenized", {
+ {"id_slot", slot.id},
+ {"id_task", slot.id_task},
+ {"n_ctx", slot.n_ctx},
+ {"n_keep", slot.params.n_keep},
+ {"n_prompt_tokens", slot.n_prompt_tokens},
+ {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
+ });
+
if (slot.embedding) {
// this prompt is too large to process - discard it
if (slot.n_prompt_tokens > n_batch) {
@@ -1788,10 +1800,13 @@ struct server_context {
slot.n_prompt_tokens = prompt_tokens.size();
LOG_VERBOSE("input truncated", {
- {"n_ctx", slot.n_ctx},
- {"n_keep", slot.params.n_keep},
- {"n_left", n_left},
- {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
+ {"id_slot", slot.id},
+ {"id_task", slot.id_task},
+ {"n_ctx", slot.n_ctx},
+ {"n_keep", slot.params.n_keep},
+ {"n_left", n_left},
+ {"n_prompt_tokens", slot.n_prompt_tokens},
+ {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
});
GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
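
The server.cpp hunks above only touch logging: the "stopped by limit" and "next token" events gain id_slot and id_task fields, and a new "prompt tokenized" event reports n_ctx, n_keep, n_prompt_tokens and the tokenized prompt before any truncation. A minimal sketch (not part of this commit) of consuming those fields, assuming the server is run with verbose, JSON-formatted logs, one object per line; the only field names relied on are the ones added in the hunks above:

    import json
    import sys

    def records_for_task(lines, id_task):
        """Yield parsed verbose-log records that belong to a single task."""
        for line in lines:
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue  # skip anything that is not a JSON log line
            if record.get("id_task") == id_task:
                yield record

    if __name__ == "__main__":
        # usage (hypothetical): pipe the server's verbose log output into this script
        for record in records_for_task(sys.stdin, id_task=0):
            print(record)
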
diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature
index 066698c8..a66fed62 100644
--- a/examples/server/tests/features/parallel.feature
+++ b/examples/server/tests/features/parallel.feature
@@ -6,8 +6,8 @@ Feature: Parallel
Given a server listening on localhost:8080
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And 42 as server seed
- And 512 as batch size
- And 64 KV cache size
+ And 128 as batch size
+ And 256 KV cache size
And 2 slots
And continuous batching
Then the server is starting
@@ -76,6 +76,7 @@ Feature: Parallel
| disabled | 128 |
| enabled | 64 |
+
Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
Given a prompt:
"""
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index 878ac136..aa132fa3 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -10,11 +10,10 @@ Feature: llama.cpp server
# KV Cache corresponds to the total amount of tokens
# that can be stored across all independent sequences: #4130
# see --ctx-size and #5568
- And 32 KV cache size
- And 512 as batch size
- And 1 slots
- And embeddings extraction
- And 32 server max tokens to predict
+ And 256 KV cache size
+ And 32 as batch size
+ And 2 slots
+ And 64 server max tokens to predict
And prometheus compatible metrics exposed
Then the server is starting
Then the server is healthy
@@ -23,18 +22,35 @@ Feature: llama.cpp server
Then the server is ready
And all slots are idle
+
Scenario Outline: Completion
Given a prompt <prompt>
And <n_predict> max tokens to predict
And a completion request with no api error
Then <n_predicted> tokens are predicted matching <re_content>
+ And the completion is <truncated> truncated
+ And <n_prompt> prompt tokens are processed
And prometheus metrics are exposed
And metric llamacpp:tokens_predicted is <n_predicted>
Examples: Prompts
- | prompt | n_predict | re_content | n_predicted |
- | I believe the meaning of life is | 8 | (read\|going)+ | 8 |
- | Write a joke about AI | 64 | (park\|friends\|scared\|always)+ | 32 |
+ | prompt | n_predict | re_content | n_prompt | n_predicted | truncated |
+ | I believe the meaning of life is | 8 | (read\|going)+ | 18 | 8 | not |
+ | Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids)+ | 46 | 64 | not |
+
+ Scenario: Completion prompt truncated
+ Given a prompt:
+ """
+ Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+ Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+ Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+ Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+ """
+ And a completion request with no api error
+ Then 64 tokens are predicted matching fun|Annaks|popcorns
+ And the completion is truncated
+ And 109 prompt tokens are processed
+
Scenario Outline: OAI Compatibility
Given a model <model>
@@ -44,11 +60,14 @@ Feature: llama.cpp server
And streaming is <enable_streaming>
Given an OAI compatible chat completions request with no api error
Then <n_predicted> tokens are predicted matching <re_content>
+ And <n_prompt> prompt tokens are processed
+ And the completion is <truncated> truncated
Examples: Prompts
- | model | system_prompt | user_prompt | max_tokens | re_content | n_predicted | enable_streaming |
- | llama-2 | Book | What is the best book | 8 | (Mom\|what)+ | 8 | disabled |
- | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks\|happy\|bird)+ | 32 | enabled |
+ | model | system_prompt | user_prompt | max_tokens | re_content | n_prompt | n_predicted | enable_streaming | truncated |
+ | llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 77 | 8 | disabled | not |
+ | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird)+ | -1 | 64 | enabled | |
+
Scenario: Tokenize / Detokenize
When tokenizing:
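
server.feature now runs with 2 slots, a 256-token KV cache and a 64-token prediction cap, and every completion scenario additionally checks the truncated flag and the number of processed prompt tokens (a negative n_prompt, as in the streaming OAI row, skips that check). A hedged sketch of the request the new "Completion prompt truncated" scenario boils down to, assuming a server started with the Background settings above and the requests package; the values in the comments (truncated, 109 prompt tokens, 64 predicted) are the ones asserted by the scenario, not measured here:

    import requests

    # The full Lorem-ipsum prompt from the scenario; shortened here for brevity.
    prompt = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, ..."

    resp = requests.post("http://localhost:8080/completion",
                         json={"prompt": prompt, "n_predict": 64}).json()

    print(resp["truncated"])               # expected True: prompt exceeds the per-slot context
    print(resp["timings"]["prompt_n"])     # expected 109 prompt tokens after truncation
    print(resp["timings"]["predicted_n"])  # expected 64 tokens predicted
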
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index d7f00583..0076f805 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -196,12 +196,30 @@ async def step_request_completion(context, api_error):
@step(u'{predicted_n:d} tokens are predicted matching {re_content}')
def step_n_tokens_predicted_with_content(context, predicted_n, re_content):
- assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n, re_content)
+ context.completion = context.tasks_result.pop()
+ assert_n_tokens_predicted(context.completion, predicted_n, re_content)
@step(u'{predicted_n:d} tokens are predicted')
def step_n_tokens_predicted(context, predicted_n):
- assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n)
+ context.completion = context.tasks_result.pop()
+ assert_n_tokens_predicted(context.completion, predicted_n)
+
+
+@step(u'the completion is truncated')
+def step_assert_completion_truncated(context):
+ step_assert_completion_truncated(context, '')
+
+
+@step(u'the completion is {truncated} truncated')
+def step_assert_completion_truncated(context, truncated):
+ truncated = truncated != "not"
+ assert context.completion['truncated'] == truncated, f'{context.completion}'
+
+
+@step(u'{n_prompt:d} prompt tokens are processed')
+def step_impl(context, n_prompt):
+ assert n_prompt < 0 or n_prompt == context.completion['timings']['prompt_n'], f"n_prompt={context.completion['timings']['prompt_n']}"
@step(u'a user prompt {user_prompt}')
@@ -722,7 +740,8 @@ async def oai_chat_completions(user_prompt,
completion_response = {
'content': '',
'timings': {
- 'predicted_n': 0
+ 'predicted_n': 0,
+ 'prompt_n': 0
}
}
if async_client:
@@ -763,7 +782,8 @@ async def oai_chat_completions(user_prompt,
completion_response = {
'content': chat_completion_raw['choices'][0]['message'],
'timings': {
- 'predicted_n': chat_completion_raw['usage']['completion_tokens']
+ 'predicted_n': chat_completion_raw['usage']['completion_tokens'],
+ 'prompt_n': chat_completion_raw['usage']['prompt_tokens']
}
}
else:
@@ -792,13 +812,16 @@ async def oai_chat_completions(user_prompt,
if 'content' in delta:
completion_response['content'] += delta['content']
completion_response['timings']['predicted_n'] += 1
+ completion_response['truncated'] = chunk.choices[0].finish_reason != 'stop'
else:
assert len(chat_completion.choices) == 1
completion_response = {
'content': chat_completion.choices[0].message.content,
'timings': {
- 'predicted_n': chat_completion.usage.completion_tokens
- }
+ 'predicted_n': chat_completion.usage.completion_tokens,
+ 'prompt_n': chat_completion.usage.prompt_tokens
+ },
+ 'truncated': chat_completion.choices[0].finish_reason != 'stop'
}
if debug:
print("OAI response formatted to llama.cpp:", completion_response)