Diffstat (limited to 'examples')
-rw-r--r--   examples/server/bench/README.md  | 42
-rw-r--r--   examples/server/bench/bench.py   | 11
-rw-r--r--   examples/server/bench/script.js  | 66
-rw-r--r--   examples/server/utils.hpp        |  9
4 files changed, 98 insertions, 30 deletions
diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md
index a53ad64d..23a3ec97 100644
--- a/examples/server/bench/README.md
+++ b/examples/server/bench/README.md
@@ -2,13 +2,15 @@
 
 Benchmark is using [k6](https://k6.io/).
 
-##### Install k6
+##### Install k6 and sse extension
 
-Follow instruction from: https://k6.io/docs/get-started/installation/
+SSE is not supported by default in k6, you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.
 
-Example for ubuntu:
+Example:
 ```shell
-snap install k6
+go install go.k6.io/xk6/cmd/xk6@latest
+xk6 build master \
+--with github.com/phymbert/xk6-sse
 ```
 
 #### Download a dataset
@@ -46,7 +48,7 @@ server --host localhost --port 8080 \
 For 500 chat completions request with 8 concurrent users during maximum 10 minutes, run:
 
 ```shell
-k6 run script.js --duration 10m --iterations 500 --vus 8
+./k6 run script.js --duration 10m --iterations 500 --vus 8
 ```
 
 The benchmark values can be overridden with:
@@ -86,3 +88,33 @@ K6 metrics might be compared against [server metrics](../README.md), with:
 ```shell
 curl http://localhost:8080/metrics
 ```
+
+### Using the CI python script
+The `bench.py` script does several steps:
+- start the server
+- define good variable for k6
+- run k6 script
+- extract metrics from prometheus
+
+It aims to be used in the CI, but you can run it manually:
+
+```shell
+LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \
+    --runner-label local \
+    --name local \
+    --branch `git rev-parse --abbrev-ref HEAD` \
+    --commit `git rev-parse HEAD` \
+    --scenario script.js \
+    --duration 5m \
+    --hf-repo ggml-org/models \
+    --hf-file phi-2/ggml-model-q4_0.gguf \
+    --model-path-prefix models \
+    --parallel 4 \
+    -ngl 33 \
+    --batch-size 2048 \
+    --ubatch-size 256 \
+    --ctx-size 4096 \
+    --n-prompts 200 \
+    --max-prompt-tokens 256 \
+    --max-tokens 256
+```
diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py
index 86eeeccf..6ca637bd 100644
--- a/examples/server/bench/bench.py
+++ b/examples/server/bench/bench.py
@@ -76,7 +76,6 @@ def main(args_in: list[str] | None = None) -> None:
                     data['metrics'][metric_name][metric_metric]=value
                     github_env.write(
                         f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n")
-        token_seconds = data['metrics']['llamacpp_tokens_second']['avg']
         iterations = data['root_group']['checks']['success completion']['passes']
 
     except Exception:
@@ -181,16 +180,16 @@ xychart-beta
     bench_results = {
         "i": iterations,
         "req": {
-            "p90": round(data['metrics']["http_req_duration"]["p(90)"], 2),
+            "p95": round(data['metrics']["http_req_duration"]["p(95)"], 2),
             "avg": round(data['metrics']["http_req_duration"]["avg"], 2),
         },
         "pp": {
-            "p90": round(data['metrics']["llamacpp_prompt_tokens"]["p(90)"], 2),
-            "avg": round(data['metrics']["llamacpp_prompt_tokens"]["avg"], 2),
+            "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
+            "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
             "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
         },
         "tg": {
-            "p90": round(data['metrics']["llamacpp_tokens_second"]["p(90)"], 2),
+            "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
             "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
             "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
         },
@@ -206,7 +205,7 @@ xychart-beta
 
 
 def start_benchmark(args):
-    k6_path = 'k6'
+    k6_path = './k6'
     if 'BENCH_K6_BIN_PATH' in os.environ:
         k6_path = os.environ['BENCH_K6_BIN_PATH']
     k6_args = [
diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js
index dc41e8d9..c4c486cd 100644
--- a/examples/server/bench/script.js
+++ b/examples/server/bench/script.js
@@ -1,4 +1,4 @@
-import http from 'k6/http'
+import sse from 'k6/x/sse'
 import {check, sleep} from 'k6'
 import {SharedArray} from 'k6/data'
 import {Counter, Rate, Trend} from 'k6/metrics'
@@ -53,7 +53,9 @@ const data = new SharedArray('conversations', function () {
 
 const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
 const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
+
 const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
+const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
 
 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -86,36 +88,62 @@ export default function () {
             }
         ],
         "model": model,
-        "stream": false,
+        "stream": true,
         "seed": 42,
         "max_tokens": max_tokens
     }
 
-    const body = JSON.stringify(payload)
+    const params = {method: 'POST', body: JSON.stringify(payload)};
+
+    const startTime = new Date()
+    let promptEvalEndTime = null
+    let prompt_tokens = 0
+    let completions_tokens = 0
+    let finish_reason = null
+    const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
+        client.on('event', function (event) {
+            if (promptEvalEndTime == null) {
+                promptEvalEndTime = new Date()
+            }
 
-    let res = http.post(`${server_url}/chat/completions`, body, {
-        headers: {'Content-Type': 'application/json'},
-        timeout: '300s'
-    })
+            let chunk = JSON.parse(event.data)
+            let choice = chunk.choices[0]
+            if (choice.finish_reason) {
+                finish_reason = choice.finish_reason
+            }
 
-    check(res, {'success completion': (r) => r.status === 200})
+            if (chunk.usage) {
+                prompt_tokens = chunk.usage.prompt_tokens
+                llamacpp_prompt_tokens.add(prompt_tokens)
+                llamacpp_prompt_tokens_total_counter.add(prompt_tokens)
+
+                completions_tokens = chunk.usage.completion_tokens
+                llamacpp_completion_tokens.add(completions_tokens)
+                llamacpp_completion_tokens_total_counter.add(completions_tokens)
+            }
+        })
 
-    if (res.status === 200) {
-        const completions = res.json()
+        client.on('error', function (e) {
+            console.log('An unexpected error occurred: ', e.error());
+            throw e;
+        })
+    })
 
-        llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
-        llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)
+    check(res, {'success completion': (r) => r.status === 200})
 
-        llamacpp_completion_tokens.add(completions.usage.completion_tokens)
-        llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
+    const endTime = new Date()
 
-        llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
-        llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
+    const promptEvalTime = promptEvalEndTime - startTime
+    if (promptEvalTime > 0) {
+        llamacpp_prompt_processing_second.add(prompt_tokens / (promptEvalEndTime - startTime) * 1.e3)
+    }
 
-        llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3)
-    } else {
-        console.error(`response: ${res.body} request=${payload}`)
+    const completion_time = endTime - promptEvalEndTime
+    if (completions_tokens > 0 && completion_time > 0) {
+        llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3)
     }
+    llamacpp_completions_truncated_rate.add(finish_reason === 'length')
+    llamacpp_completions_stop_rate.add(finish_reason === 'stop')
 
     sleep(0.3)
 }
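For clarity, here is a minimal standalone sketch of the throughput math the streaming path above records. Timestamps come from JavaScript `Date` arithmetic and are therefore in milliseconds, so the `1.e3` factor converts tokens per millisecond into tokens per second. All numbers below are hypothetical, chosen only to illustrate the two new Trend metrics.

```js
// Hypothetical per-iteration timings, for illustration only (values in ms).
const startTime = 0              // request sent
const promptEvalEndTime = 250    // first SSE event received => prompt fully processed
const endTime = 2250             // stream finished

const prompt_tokens = 128
const completions_tokens = 256

// Same formulas as in script.js above: tokens / elapsed_ms * 1e3 = tokens per second.
const promptProcessingPerSecond = prompt_tokens / (promptEvalEndTime - startTime) * 1.e3     // 512
const generatedTokensPerSecond = completions_tokens / (endTime - promptEvalEndTime) * 1.e3   // 128

console.log({promptProcessingPerSecond, generatedTokensPerSecond})
```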
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 47cc53c2..a8d43ac6 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -567,6 +567,15 @@ static std::vector<json> format_partial_response_oaicompat(json result, const st
         {"model", modelname},
         {"object", "chat.completion.chunk"}
     };
+    if (!finish_reason.empty()) {
+        int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
+        int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
+        ret.push_back({"usage", json {
+            {"completion_tokens", num_tokens_predicted},
+            {"prompt_tokens", num_prompt_tokens},
+            {"total_tokens", num_tokens_predicted + num_prompt_tokens}
+        }});
+    }
 
     return std::vector<json>({ret});
 }
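To tie the two changes together: the `usage` object emitted by `utils.hpp` on the final chunk is what `script.js` picks up from `chunk.usage`. Below is a hedged sketch of such a final `chat.completion.chunk` event; only the `usage` fields come from the diff above, the remaining fields and all values are illustrative.

```js
// Illustrative final SSE event as the k6 script would see it in event.data.
// Only the "usage" object is defined by the utils.hpp change above; the other
// fields follow the usual OpenAI-compatible chunk layout and the values are made up.
const finalChunk = {
    choices: [{index: 0, delta: {}, finish_reason: 'stop'}],
    model: 'phi-2/ggml-model-q4_0.gguf',
    object: 'chat.completion.chunk',
    usage: {
        completion_tokens: 256,  // tokens_predicted
        prompt_tokens: 128,      // tokens_evaluated
        total_tokens: 384
    }
}

// script.js reads chunk.usage and chunk.choices[0].finish_reason from each event.
```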