author | Pierrick Hymbert <pierrick.hymbert@gmail.com> | 2024-03-17 19:12:37 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-17 19:12:37 +0100 |
commit | d01b3c4c32357567f3531d4e6ceffc5d23e87583 (patch) | |
tree | 80e0a075a8b120d6b5b095a73cc36cb2a4535aed /examples/server | |
parent | cd776c37c945bf58efc8fe44b370456680cb1b59 (diff) | |
common: llama_load_model_from_url using --model-url (#6098)
* common: llama_load_model_from_url with libcurl dependency
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
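For readers skimming the diff, the behaviour added by `llama_load_model_from_url` amounts to: fetch the GGUF file from a remote URL to a local path (done in the commit via libcurl on the C/C++ side), then load that local file as before. Below is a rough Python sketch of the same idea, not the commit's implementation; the cache filename and the simple skip-if-present check are assumptions for illustration.

```python
# Illustrative sketch only (Python), not the libcurl-based code in this commit.
import os
import urllib.request

MODEL_URL = "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf"
MODEL_PATH = "ggml-model-q4_0.gguf"  # assumed local cache name

def fetch_model(url: str, path: str) -> str:
    """Download the model file once, then reuse the local copy."""
    if not os.path.exists(path):
        print(f"downloading {url} -> {path}")
        urllib.request.urlretrieve(url, path)
    return path

local_path = fetch_model(MODEL_URL, MODEL_PATH)
# local_path can now be passed to the loader as a plain model path, exactly as before.
```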
Diffstat (limited to 'examples/server')
-rw-r--r-- | examples/server/README.md | 1
-rw-r--r-- | examples/server/server.cpp | 8
-rw-r--r-- | examples/server/tests/README.md | 2
-rw-r--r-- | examples/server/tests/features/embeddings.feature | 3
-rw-r--r-- | examples/server/tests/features/environment.py | 97
-rw-r--r-- | examples/server/tests/features/server.feature | 3
-rw-r--r-- | examples/server/tests/features/steps/steps.py | 37
-rw-r--r-- | examples/server/tests/requirements.txt | 1 |
8 files changed, 99 insertions, 53 deletions
diff --git a/examples/server/README.md b/examples/server/README.md
index 8f8454af..755e1d53 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -20,6 +20,7 @@ The project is under active development, and we are [looking for feedback and co
 - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
 - `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
+- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
 - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 895d608f..d2a8e541 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2195,6 +2195,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     }
     printf("  -m FNAME, --model FNAME\n");
     printf("                            model path (default: %s)\n", params.model.c_str());
+    printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
+    printf("                            model download url (default: %s)\n", params.model_url.c_str());
     printf("  -a ALIAS, --alias ALIAS\n");
     printf("                            set an alias for the model, will be added as `model` field in completion response\n");
     printf("  --lora FNAME              apply LoRA adapter (implies --no-mmap)\n");
@@ -2317,6 +2319,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                 break;
             }
             params.model = argv[i];
+        } else if (arg == "-mu" || arg == "--model-url") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model_url = argv[i];
         } else if (arg == "-a" || arg == "--alias") {
             if (++i >= argc) {
                 invalid_param = true;
diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md
index 95a0353b..feb2b1d6 100644
--- a/examples/server/tests/README.md
+++ b/examples/server/tests/README.md
@@ -57,7 +57,7 @@ Feature or Scenario must be annotated with `@llama.cpp` to be included in the de
 To run a scenario annotated with `@bug`, start:
 
 ```shell
-DEBUG=ON ./tests.sh --no-skipped --tags bug
+DEBUG=ON ./tests.sh --no-skipped --tags bug --stop
 ```
 
 After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated.
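The README and `server.cpp` hunks above add the `-mu`/`--model-url` flag. As a usage illustration (not part of the commit), here is a Python sketch that launches the server with `--model-url` and polls `/health` until the download and model load have finished; the binary path, port, and timeout are assumptions.

```python
# Minimal sketch: start the llama.cpp server with the new --model-url flag and
# wait for it to report healthy. Paths/ports below are assumptions for the demo.
import subprocess
import time
import urllib.request

SERVER_BIN = "./server"   # assumed path to the built server binary
MODEL_URL = "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf"

proc = subprocess.Popen([
    SERVER_BIN,
    "--model-url", MODEL_URL,   # new in this commit: download, then load
    "--host", "127.0.0.1",
    "--port", "8080",
])

# Poll /health until the server has downloaded and loaded the model.
for _ in range(600):            # allow up to ~10 minutes
    try:
        with urllib.request.urlopen("http://127.0.0.1:8080/health", timeout=5) as resp:
            if resp.status == 200:
                print("server healthy, pid:", proc.pid)
                break
    except OSError:
        pass                    # not listening yet, or still loading
    time.sleep(1)
else:
    proc.terminate()
    raise SystemExit("server did not become healthy in time")
```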
diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature
index 57359b26..dcf1434f 100644
--- a/examples/server/tests/features/embeddings.feature
+++ b/examples/server/tests/features/embeddings.feature
@@ -4,7 +4,8 @@ Feature: llama.cpp server
 
   Background: Server startup
     Given a server listening on localhost:8080
-    And   a model file bert-bge-small/ggml-model-f16.gguf from HF repo ggml-org/models
+    And   a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf
+    And   a model file ggml-model-f16.gguf
     And   a model alias bert-bge-small
     And   42 as server seed
     And   2 slots
diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py
index 8ad987e1..82104e92 100644
--- a/examples/server/tests/features/environment.py
+++ b/examples/server/tests/features/environment.py
@@ -1,10 +1,12 @@
-import errno
 import os
+import signal
 import socket
-import subprocess
+import sys
 import time
+import traceback
 from contextlib import closing
-import signal
+
+import psutil
 
 
 def before_scenario(context, scenario):
@@ -20,33 +22,40 @@ def before_scenario(context, scenario):
 
 
 def after_scenario(context, scenario):
-    if context.server_process is None:
-        return
-    if scenario.status == "failed":
-        if 'GITHUB_ACTIONS' in os.environ:
-            print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
-            if os.path.isfile('llama.log'):
-                with closing(open('llama.log', 'r')) as f:
-                    for line in f:
-                        print(line)
-        if not is_server_listening(context.server_fqdn, context.server_port):
-            print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n")
-
-    if not pid_exists(context.server_process.pid):
-        assert False, f"Server not running pid={context.server_process.pid} ..."
-
-    server_graceful_shutdown(context)
-
-    # Wait few for socket to free up
-    time.sleep(0.05)
-
-    attempts = 0
-    while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port):
-        server_kill(context)
-        time.sleep(0.1)
-        attempts += 1
-        if attempts > 5:
-            server_kill_hard(context)
+    try:
+        if 'server_process' not in context or context.server_process is None:
+            return
+        if scenario.status == "failed":
+            if 'GITHUB_ACTIONS' in os.environ:
+                print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
+                if os.path.isfile('llama.log'):
+                    with closing(open('llama.log', 'r')) as f:
+                        for line in f:
+                            print(line)
+            if not is_server_listening(context.server_fqdn, context.server_port):
+                print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n")
+
+        if not pid_exists(context.server_process.pid):
+            assert False, f"Server not running pid={context.server_process.pid} ..."
+
+        server_graceful_shutdown(context)
+
+        # Wait few for socket to free up
+        time.sleep(0.05)
+
+        attempts = 0
+        while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port):
+            server_kill(context)
+            time.sleep(0.1)
+            attempts += 1
+            if attempts > 5:
+                server_kill_hard(context)
+    except:
+        exc = sys.exception()
+        print("error in after scenario: \n")
+        print(exc)
+        print("*** print_tb: \n")
+        traceback.print_tb(exc.__traceback__, file=sys.stdout)
 
 
 def server_graceful_shutdown(context):
@@ -67,11 +76,11 @@ def server_kill_hard(context):
     path = context.server_path
 
     print(f"Server dangling exits, hard killing force {pid}={path}...\n")
-    if os.name == 'nt':
-        process = subprocess.check_output(['taskkill', '/F', '/pid', str(pid)]).decode()
-        print(process)
-    else:
-        os.kill(-pid, signal.SIGKILL)
+    try:
+        psutil.Process(pid).kill()
+    except psutil.NoSuchProcess:
+        return False
+    return True
 
 
 def is_server_listening(server_fqdn, server_port):
@@ -84,17 +93,9 @@ def is_server_listening(server_fqdn, server_port):
 
 
 def pid_exists(pid):
-    """Check whether pid exists in the current process table."""
-    if pid < 0:
+    try:
+        psutil.Process(pid)
+    except psutil.NoSuchProcess:
         return False
-    if os.name == 'nt':
-        output = subprocess.check_output(['TASKLIST', '/FI', f'pid eq {pid}']).decode()
-        print(output)
-        return "No tasks are running" not in output
-    else:
-        try:
-            os.kill(pid, 0)
-        except OSError as e:
-            return e.errno == errno.EPERM
-        else:
-            return True
+    return True
+
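The `environment.py` hunks above replace the platform-specific `taskkill`/`os.kill` branches with `psutil`, giving a single code path on Windows and POSIX. Here is a self-contained sketch of that pattern; the throwaway child process it spawns is only for the demo.

```python
# Sketch of the cross-platform liveness/kill helpers enabled by psutil.
import subprocess
import sys

import psutil

def pid_exists(pid: int) -> bool:
    """True if a process with this pid is present in the process table."""
    try:
        psutil.Process(pid)
    except psutil.NoSuchProcess:
        return False
    return True

def kill_hard(pid: int) -> bool:
    """Force-kill (SIGKILL / TerminateProcess); False if the process is already gone."""
    try:
        psutil.Process(pid).kill()
    except psutil.NoSuchProcess:
        return False
    return True

if __name__ == "__main__":
    # Throwaway child standing in for a dangling server process.
    child = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(60)"])
    assert pid_exists(child.pid)
    kill_hard(child.pid)
    child.wait()                       # reap so the pid disappears from the table
    print("child gone:", not pid_exists(child.pid))
```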
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index 5014f326..7448986e 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -4,7 +4,8 @@ Feature: llama.cpp server
 
   Background: Server startup
     Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And   a model url https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf
+    And   a model file stories260K.gguf
     And   a model alias tinyllama-2
     And   42 as server seed
     # KV Cache corresponds to the total amount of tokens
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index a59a52d2..9e348d5f 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -5,6 +5,8 @@ import os
 import re
 import socket
 import subprocess
+import sys
+import threading
 import time
 from contextlib import closing
 from re import RegexFlag
@@ -32,6 +34,8 @@ def step_server_config(context, server_fqdn, server_port):
     context.base_url = f'http://{context.server_fqdn}:{context.server_port}'
 
     context.model_alias = None
+    context.model_file = None
+    context.model_url = None
     context.n_batch = None
     context.n_ubatch = None
     context.n_ctx = None
@@ -65,6 +69,16 @@ def step_download_hf_model(context, hf_file, hf_repo):
     print(f"model file: {context.model_file}\n")
 
 
+@step('a model file {model_file}')
+def step_model_file(context, model_file):
+    context.model_file = model_file
+
+
+@step('a model url {model_url}')
+def step_model_url(context, model_url):
+    context.model_url = model_url
+
+
 @step('a model alias {model_alias}')
 def step_model_alias(context, model_alias):
     context.model_alias = model_alias
@@ -141,7 +155,8 @@ def step_start_server(context):
 async def step_wait_for_the_server_to_be_started(context, expecting_status):
     match expecting_status:
         case 'healthy':
-            await wait_for_health_status(context, context.base_url, 200, 'ok')
+            await wait_for_health_status(context, context.base_url, 200, 'ok',
+                                         timeout=30)
 
         case 'ready' | 'idle':
             await wait_for_health_status(context, context.base_url, 200, 'ok',
@@ -1038,8 +1053,11 @@ def start_server_background(context):
     server_args = [
         '--host', server_listen_addr,
         '--port', context.server_port,
-        '--model', context.model_file
     ]
+    if context.model_file:
+        server_args.extend(['--model', context.model_file])
+    if context.model_url:
+        server_args.extend(['--model-url', context.model_url])
     if context.n_batch:
         server_args.extend(['--batch-size', context.n_batch])
     if context.n_ubatch:
@@ -1079,8 +1097,23 @@ def start_server_background(context):
 
     pkwargs = {
         'creationflags': flags,
+        'stdout': subprocess.PIPE,
+        'stderr': subprocess.PIPE
     }
     context.server_process = subprocess.Popen(
         [str(arg) for arg in [context.server_path, *server_args]],
         **pkwargs)
+
+    def log_stdout(process):
+        for line in iter(process.stdout.readline, b''):
+            print(line.decode('utf-8'), end='')
+    thread_stdout = threading.Thread(target=log_stdout, args=(context.server_process,))
+    thread_stdout.start()
+
+    def log_stderr(process):
+        for line in iter(process.stderr.readline, b''):
+            print(line.decode('utf-8'), end='', file=sys.stderr)
+    thread_stderr = threading.Thread(target=log_stderr, args=(context.server_process,))
+    thread_stderr.start()
+
     print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
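The last `steps.py` hunk pipes the server's stdout/stderr and drains both streams on background threads, so the child never blocks on a full pipe and its logs still reach the test output. A stripped-down sketch of that pattern follows; the Python one-liner used as the child is only a placeholder for the server process.

```python
# Sketch of the pipe-draining pattern used by start_server_background above.
import subprocess
import sys
import threading

def pump(stream, sink):
    """Echo every line from a child pipe to the given sink until EOF."""
    for line in iter(stream.readline, b''):
        print(line.decode('utf-8'), end='', file=sink)

# Placeholder child; in the test harness this is the llama.cpp server process.
child = subprocess.Popen(
    [sys.executable, "-c", "import sys; print('out'); print('err', file=sys.stderr)"],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)

# One drain thread per stream, mirroring log_stdout/log_stderr in the diff.
t_out = threading.Thread(target=pump, args=(child.stdout, sys.stdout))
t_err = threading.Thread(target=pump, args=(child.stderr, sys.stderr))
t_out.start()
t_err.start()

child.wait()
t_out.join()
t_err.join()
```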
diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt
index 2e4f42ad..c2c96010 100644
--- a/examples/server/tests/requirements.txt
+++ b/examples/server/tests/requirements.txt
@@ -3,4 +3,5 @@ behave~=1.2.6
 huggingface_hub~=0.20.3
 numpy~=1.24.4
 openai~=0.25.0
+psutil~=5.9.8
 prometheus-client~=0.20.0