| author    | Pierrick Hymbert <pierrick.hymbert@gmail.com> | 2024-02-25 13:49:43 +0100 |
|-----------|-----------------------------------------------|---------------------------|
| committer | GitHub <noreply@github.com>                   | 2024-02-25 13:49:43 +0100 |
| commit    | d52d7819b8ced70c642a88a59da8c78208dc58ec (patch) | |
| tree      | 07841f1c5b7ab748bac463e62f3fb7ce0b7f96e9 /examples/server/tests | |
| parent    | 12894088170f62e4cad4f8d6a3043c185b414bab (diff) | |
server: concurrency fix + monitoring - add /metrics prometheus compatible endpoint (#5708)
* server: monitoring - add /metrics prometheus compatible endpoint
* server: concurrency issue: when 2 tasks are waiting for results, only one calling thread is notified (see the sketch below)
* server: metrics - move to a dedicated struct
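The concurrency bullet above describes a classic lost-wakeup problem. Below is a minimal Python sketch of that pattern, assuming a producer that publishes results for two waiting tasks on a shared condition variable. It is illustrative only: the actual fix lives in the server's C++ code (presumably by notifying all waiters instead of one), and this diff only touches the Python test harness. The names `waiter`, `results`, and `cond` are made up for the example.

```python
import threading

# Shared state: results published by a "producer" and consumed by waiting tasks.
results = []
cond = threading.Condition()


def waiter(task_id: int) -> None:
    with cond:
        # Each task blocks until a result addressed to it has been published.
        while not any(r["task_id"] == task_id for r in results):
            cond.wait()
    print(f"task {task_id} received its result")


threads = [threading.Thread(target=waiter, args=(i,)) for i in range(2)]
for t in threads:
    t.start()

with cond:
    results.append({"task_id": 0})
    results.append({"task_id": 1})
    # cond.notify() would wake only ONE of the two waiting tasks, leaving the
    # other blocked forever; notify_all() wakes both.
    cond.notify_all()

for t in threads:
    t.join()
```

With `notify()` only one of the two waiters is woken, which is the symptom the commit message describes; `notify_all()` wakes both.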
Diffstat (limited to 'examples/server/tests')
| -rw-r--r-- | examples/server/tests/features/environment.py | 2  |
| -rw-r--r-- | examples/server/tests/features/server.feature | 2  |
| -rw-r--r-- | examples/server/tests/features/steps/steps.py | 27 |
| -rw-r--r-- | examples/server/tests/requirements.txt        | 1  |
4 files changed, 32 insertions, 0 deletions
diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py
index 13cc8410..09e82674 100644
--- a/examples/server/tests/features/environment.py
+++ b/examples/server/tests/features/environment.py
@@ -16,6 +16,8 @@ def before_scenario(context, scenario):
 
 
 def after_scenario(context, scenario):
+    if context.server_process is None:
+        return
     if scenario.status == "failed":
         if 'GITHUB_ACTIONS' in os.environ:
             print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index 5f81d256..0139f89d 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -13,6 +13,7 @@ Feature: llama.cpp server
     And   1 slots
     And   embeddings extraction
     And   32 server max tokens to predict
+    And   prometheus compatible metrics exposed
     Then  the server is starting
     Then  the server is healthy
@@ -25,6 +26,7 @@ Feature: llama.cpp server
     And   <n_predict> max tokens to predict
     And   a completion request with no api error
     Then  <n_predicted> tokens are predicted matching <re_content>
+    And   prometheus metrics are exposed
 
     Examples: Prompts
       | prompt | n_predict | re_content | n_predicted |
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 9c825fdb..051fd440 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -13,6 +13,7 @@ import aiohttp
 import openai
 from behave import step
 from behave.api.async_step import async_run_until_complete
+from prometheus_client import parser
 
 
 @step(u"a server listening on {server_fqdn}:{server_port}")
@@ -34,6 +35,8 @@ def step_server_config(context, server_fqdn, server_port):
     context.server_api_key = None
     context.server_continuous_batching = False
     context.server_embeddings = False
+    context.server_metrics = False
+    context.server_process = None
     context.server_seed = None
     context.user_api_key = None
@@ -82,6 +85,11 @@ def step_server_embeddings(context):
     context.server_embeddings = True
 
 
+@step(u'prometheus compatible metrics exposed')
+def step_server_metrics(context):
+    context.server_metrics = True
+
+
 @step(u"the server is starting")
 def step_start_server(context):
     start_server_background(context)
@@ -424,6 +432,23 @@ def step_check_options_header_value(context, cors_header, cors_header_value):
     assert context.options_response.headers[cors_header] == cors_header_value
 
 
+@step(u'prometheus metrics are exposed')
+@async_run_until_complete
+async def step_prometheus_metrics_exported(context):
+    async with aiohttp.ClientSession() as session:
+        async with await session.get(f'{context.base_url}/metrics') as metrics_response:
+            assert metrics_response.status == 200
+            assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
+            metrics_raw = await metrics_response.text()
+            metric_exported = False
+            for metric in parser.text_string_to_metric_families(metrics_raw):
+                match metric.name:
+                    case "llamacpp:kv_cache_usage_ratio":
+                        assert len(metric.samples) > 0
+                        metric_exported = True
+            assert metric_exported, "No metrics exported"
+
+
 async def concurrent_requests(context, f_completion, *args, **kwargs):
     n_prompts = len(context.prompts)
     if context.debug:
@@ -753,6 +778,8 @@ def start_server_background(context):
         server_args.append('--cont-batching')
     if context.server_embeddings:
         server_args.append('--embedding')
+    if context.server_metrics:
+        server_args.append('--metrics')
     if context.model_alias is not None:
         server_args.extend(['--alias', context.model_alias])
     if context.n_ctx is not None:
diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt
index 3e51b12d..334fa4a7 100644
--- a/examples/server/tests/requirements.txt
+++ b/examples/server/tests/requirements.txt
@@ -1,3 +1,4 @@
 aiohttp~=3.9.3
 behave~=1.2.6
 openai~=0.25.0
+prometheus-client~=0.20.0
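For a quick manual check of the new endpoint outside the behave suite, the same prometheus-client parser used by the test step can be pointed at a running server. This is a minimal sketch, assuming the server is listening on http://localhost:8080 (adjust the URL to your setup); the `llamacpp:` metric prefix is inferred from the `llamacpp:kv_cache_usage_ratio` family the test asserts on.

```python
import asyncio

import aiohttp
from prometheus_client import parser


async def dump_llamacpp_metrics(base_url: str) -> None:
    """Fetch /metrics and print every sample of the llamacpp:* families."""
    async with aiohttp.ClientSession() as session:
        async with session.get(f"{base_url}/metrics") as response:
            assert response.status == 200
            text = await response.text()
    # text_string_to_metric_families() yields one object per metric family,
    # each carrying a list of (name, labels, value, ...) samples.
    for family in parser.text_string_to_metric_families(text):
        if family.name.startswith("llamacpp:"):
            for sample in family.samples:
                print(sample.name, sample.labels, sample.value)


if __name__ == "__main__":
    # Hypothetical host/port; point this at wherever the server was started
    # with --metrics enabled.
    asyncio.run(dump_llamacpp_metrics("http://localhost:8080"))
```

Both aiohttp and prometheus-client are pinned in examples/server/tests/requirements.txt, so installing that requirements file is enough to run the sketch.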