Diffstat (limited to 'examples/server/tests/features')
 examples/server/tests/features/lora.feature   | 36
 examples/server/tests/features/steps/steps.py | 21
 2 files changed, 57 insertions(+), 0 deletions(-)
diff --git a/examples/server/tests/features/lora.feature b/examples/server/tests/features/lora.feature
new file mode 100644
index 00000000..7b85988a
--- /dev/null
+++ b/examples/server/tests/features/lora.feature
@@ -0,0 +1,36 @@
+@llama.cpp
+@lora
+Feature: llama.cpp server
+
+  Background: Server startup
+    Given a server listening on localhost:8080
+    And a model url https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/stories15M_MOE-F16.gguf
+    And a model file stories15M_MOE-F16.gguf
+    And a model alias stories15M_MOE
+    And a lora adapter file from https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf
+    And 42 as server seed
+    And 1024 as batch size
+    And 1024 as ubatch size
+    And 2048 KV cache size
+    And 64 max tokens to predict
+    And 0.0 temperature
+    Then the server is starting
+    Then the server is healthy
+
+  Scenario: Completion LoRA disabled
+    Given switch off lora adapter 0
+    Given a prompt:
+    """
+    Look in thy glass
+    """
+    And a completion request with no api error
+    Then 64 tokens are predicted matching little|girl|three|years|old
+
+  Scenario: Completion LoRA enabled
+    Given switch on lora adapter 0
+    Given a prompt:
+    """
+    Look in thy glass
+    """
+    And a completion request with no api error
+    Then 64 tokens are predicted matching eye|love|glass|sun
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index df0814cc..6705a34f 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -7,6 +7,7 @@ import subprocess
 import sys
 import threading
 import time
+import requests
 from collections.abc import Sequence
 from contextlib import closing
 from re import RegexFlag
@@ -70,6 +71,7 @@ def step_server_config(context, server_fqdn: str, server_port: str):
     context.user_api_key = None
     context.response_format = None
     context.temperature = None
+    context.lora_file = None
 
     context.tasks_result = []
     context.concurrent_tasks = []
@@ -82,6 +84,12 @@ def step_download_hf_model(context, hf_file: str, hf_repo: str):
     context.model_hf_file = hf_file
     context.model_file = os.path.basename(hf_file)
 
+@step('a lora adapter file from {lora_file_url}')
+def step_download_lora_file(context, lora_file_url: str):
+    file_name = lora_file_url.split('/').pop()
+    context.lora_file = f'../../../{file_name}'
+    with open(context.lora_file, 'wb') as f:
+        f.write(requests.get(lora_file_url).content)
 
 @step('a model file {model_file}')
 def step_model_file(context, model_file: str):
@@ -849,6 +857,17 @@ async def step_erase_slot(context, slot_id):
             context.response = response
 
 
+@step('switch {on_or_off} lora adapter {lora_id:d}')
+@async_run_until_complete
+async def toggle_lora_adapter(context, on_or_off: str, lora_id: int):
+    async with aiohttp.ClientSession() as session:
+        async with session.post(f'{context.base_url}/lora-adapters',
+                                json=[{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}],
+                                headers={"Content-Type": "application/json"}) as response:
+            context.response = response
+            print([{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}])
+
+
 @step('the server responds with status code {status_code:d}')
 def step_server_responds_with_status_code(context, status_code):
     assert context.response.status == status_code
@@ -1326,6 +1345,8 @@ def start_server_background(context):
         server_args.extend(['--grp-attn-w', context.n_ga_w])
     if context.debug:
         server_args.append('--verbose')
+    if context.lora_file:
+        server_args.extend(['--lora', context.lora_file])
     if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
         server_args.extend(['--log-format', "text"])
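
Note: the scenarios above toggle the adapter by POSTing a list of {id, scale} objects to the server's /lora-adapters endpoint, with scale 0 disabling the adapter and scale 1 applying it. A minimal synchronous sketch of the same call outside the behave suite follows; it assumes a server started with --lora is reachable on localhost:8080, and BASE_URL / set_lora_scale are illustrative names, not part of the patch.

    import requests

    BASE_URL = 'http://localhost:8080'

    def set_lora_scale(lora_id: int, scale: float) -> None:
        # POST a list of {id, scale} objects, mirroring toggle_lora_adapter above;
        # scale 0.0 disables the adapter, scale 1.0 applies it at full strength.
        response = requests.post(f'{BASE_URL}/lora-adapters',
                                 json=[{'id': lora_id, 'scale': scale}],
                                 headers={'Content-Type': 'application/json'})
        response.raise_for_status()

    set_lora_scale(0, 1.0)  # "switch on lora adapter 0"
    set_lora_scale(0, 0.0)  # "switch off lora adapter 0"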