Diffstat (limited to 'examples/server/tests/features')
 examples/server/tests/features/environment.py |  2 ++
 examples/server/tests/features/server.feature |  2 ++
 examples/server/tests/features/steps/steps.py | 27 +++++++++++++++++++++++++++
 3 files changed, 31 insertions(+), 0 deletions(-)
diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py
index 13cc8410..09e82674 100644
--- a/examples/server/tests/features/environment.py
+++ b/examples/server/tests/features/environment.py
@@ -16,6 +16,8 @@ def before_scenario(context, scenario):
 def after_scenario(context, scenario):
+    if context.server_process is None:
+        return
     if scenario.status == "failed":
         if 'GITHUB_ACTIONS' in os.environ:
             print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index 5f81d256..0139f89d 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -13,6 +13,7 @@ Feature: llama.cpp server
     And 1 slots
     And embeddings extraction
     And 32 server max tokens to predict
+    And prometheus compatible metrics exposed
     Then the server is starting
     Then the server is healthy
@@ -25,6 +26,7 @@ Feature: llama.cpp server
     And <n_predict> max tokens to predict
     And a completion request with no api error
     Then <n_predicted> tokens are predicted matching <re_content>
+    And prometheus metrics are exposed

     Examples: Prompts
       | prompt | n_predict | re_content | n_predicted |
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 9c825fdb..051fd440 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -13,6 +13,7 @@ import aiohttp
 import openai
 from behave import step
 from behave.api.async_step import async_run_until_complete
+from prometheus_client import parser


 @step(u"a server listening on {server_fqdn}:{server_port}")
@@ -34,6 +35,8 @@ def step_server_config(context, server_fqdn, server_port):
     context.server_api_key = None
     context.server_continuous_batching = False
     context.server_embeddings = False
+    context.server_metrics = False
+    context.server_process = None
     context.server_seed = None
     context.user_api_key = None
@@ -82,6 +85,11 @@ def step_server_embeddings(context):
     context.server_embeddings = True


+@step(u'prometheus compatible metrics exposed')
+def step_server_metrics(context):
+    context.server_metrics = True
+
+
 @step(u"the server is starting")
 def step_start_server(context):
     start_server_background(context)
@@ -424,6 +432,23 @@ def step_check_options_header_value(context, cors_header, cors_header_value):
     assert context.options_response.headers[cors_header] == cors_header_value


+@step(u'prometheus metrics are exposed')
+@async_run_until_complete
+async def step_prometheus_metrics_exported(context):
+    async with aiohttp.ClientSession() as session:
+        async with await session.get(f'{context.base_url}/metrics') as metrics_response:
+            assert metrics_response.status == 200
+            assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
+            metrics_raw = await metrics_response.text()
+            metric_exported = False
+            for metric in parser.text_string_to_metric_families(metrics_raw):
+                match metric.name:
+                    case "llamacpp:kv_cache_usage_ratio":
+                        assert len(metric.samples) > 0
+                        metric_exported = True
+            assert metric_exported, "No metrics exported"
+
+
 async def concurrent_requests(context, f_completion, *args, **kwargs):
     n_prompts = len(context.prompts)
     if context.debug:
@@ -753,6 +778,8 @@ def start_server_background(context):
         server_args.append('--cont-batching')
     if context.server_embeddings:
         server_args.append('--embedding')
+    if context.server_metrics:
+        server_args.append('--metrics')
     if context.model_alias is not None:
         server_args.extend(['--alias', context.model_alias])
     if context.n_ctx is not None:
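
For reference, a minimal standalone sketch of what the new behave step exercises: start the server with the --metrics flag introduced above, fetch /metrics, and parse it with prometheus_client. The endpoint path, content type, and the llamacpp:kv_cache_usage_ratio family name come from the diff; the localhost:8080 address and the check_metrics helper name are assumptions for illustration, not part of the commit.

    # sketch only, not part of the commit: assumes a llama.cpp server started
    # with `--metrics` is already listening on base_url
    import asyncio

    import aiohttp
    from prometheus_client import parser


    async def check_metrics(base_url='http://localhost:8080'):
        async with aiohttp.ClientSession() as session:
            async with session.get(f'{base_url}/metrics') as response:
                assert response.status == 200
                assert response.headers['Content-Type'] == "text/plain; version=0.0.4"
                text = await response.text()
        # parse the Prometheus text exposition format and look for a llama.cpp family
        families = {f.name: f for f in parser.text_string_to_metric_families(text)}
        assert 'llamacpp:kv_cache_usage_ratio' in families, "No metrics exported"
        return families


    if __name__ == '__main__':
        asyncio.run(check_metrics())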