| author | Pierrick Hymbert <pierrick.hymbert@gmail.com> | 2024-04-06 05:40:47 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-04-06 05:40:47 +0200 |
| commit | 75cd4c77292034ecec587ecb401366f57338f7c0 (patch) | |
| tree | de137718780505410bc75ce219f4bc164961c4fd /examples/server/bench/script.js | |
| parent | a8bd14d55717754a1f48313a846a2b16fa998ad2 (diff) | |
ci: bench: support sse and fix prompt processing time / server: add tokens usage in stream OAI response (#6495)
* ci: bench: support sse and fix prompt processing time
server: add tokens usage in stream mode
* ci: bench: README.md EOL
* ci: bench: remove total pp and tg as it is not accurate
* ci: bench: fix case when there is no token generated
* ci: bench: change to the 95th percentile for pp and tg as it is closer to what the server exports in metrics (see the k6 p95 sketch after this list)
* ci: bench: fix finish reason rate
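The percentile change only affects how the benchmark's Trend metrics are summarized. As a rough illustration of wiring p95 reporting into a k6 script, the sketch below uses k6's standard summaryTrendStats and thresholds options; the metric names mirror script.js, but the stat list and the threshold floor are assumed values, not taken from this PR.

import { Trend } from 'k6/metrics'

// Sketch only: metric names match the benchmark script, but the summary stats
// and the threshold floor (10 tokens/s) are assumed, not part of this commit.
const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')

export const options = {
    // Report p(95) alongside the usual stats in the end-of-test summary.
    summaryTrendStats: ['avg', 'min', 'med', 'max', 'p(90)', 'p(95)'],
    thresholds: {
        // Fail the run if the 95th-percentile generation rate drops below the assumed floor.
        'llamacpp_tokens_second': ['p(95)>10'],
    },
}

export default function () {
    // The real benchmark iteration adds samples to the trends here.
    llamacpp_tokens_second.add(42)
    llamacpp_prompt_processing_second.add(100)
}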
Diffstat (limited to 'examples/server/bench/script.js')
| -rw-r--r-- | examples/server/bench/script.js | 66 |
1 file changed, 47 insertions, 19 deletions
diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js
index dc41e8d9..c4c486cd 100644
--- a/examples/server/bench/script.js
+++ b/examples/server/bench/script.js
@@ -1,4 +1,4 @@
-import http from 'k6/http'
+import sse from 'k6/x/sse'
 import {check, sleep} from 'k6'
 import {SharedArray} from 'k6/data'
 import {Counter, Rate, Trend} from 'k6/metrics'
@@ -53,7 +53,9 @@ const data = new SharedArray('conversations', function () {
 const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
 const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
+
 const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
+const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
 
 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -86,36 +88,62 @@ export default function () {
             }
         ],
         "model": model,
-        "stream": false,
+        "stream": true,
         "seed": 42,
         "max_tokens": max_tokens
     }
 
-    const body = JSON.stringify(payload)
+    const params = {method: 'POST', body: JSON.stringify(payload)};
+
+    const startTime = new Date()
+    let promptEvalEndTime = null
+    let prompt_tokens = 0
+    let completions_tokens = 0
+    let finish_reason = null
+    const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
+        client.on('event', function (event) {
+            if (promptEvalEndTime == null) {
+                promptEvalEndTime = new Date()
+            }
 
-    let res = http.post(`${server_url}/chat/completions`, body, {
-        headers: {'Content-Type': 'application/json'},
-        timeout: '300s'
-    })
+            let chunk = JSON.parse(event.data)
+            let choice = chunk.choices[0]
+            if (choice.finish_reason) {
+                finish_reason = choice.finish_reason
+            }
 
-    check(res, {'success completion': (r) => r.status === 200})
+            if (chunk.usage) {
+                prompt_tokens = chunk.usage.prompt_tokens
+                llamacpp_prompt_tokens.add(prompt_tokens)
+                llamacpp_prompt_tokens_total_counter.add(prompt_tokens)
+
+                completions_tokens = chunk.usage.completion_tokens
+                llamacpp_completion_tokens.add(completions_tokens)
+                llamacpp_completion_tokens_total_counter.add(completions_tokens)
+            }
+        })
 
-    if (res.status === 200) {
-        const completions = res.json()
+        client.on('error', function (e) {
+            console.log('An unexpected error occurred: ', e.error());
+            throw e;
+        })
+    })
 
-        llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
-        llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)
+    check(res, {'success completion': (r) => r.status === 200})
 
-        llamacpp_completion_tokens.add(completions.usage.completion_tokens)
-        llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
+    const endTime = new Date()
 
-        llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
-        llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
+    const promptEvalTime = promptEvalEndTime - startTime
+    if (promptEvalTime > 0) {
+        llamacpp_prompt_processing_second.add(prompt_tokens / (promptEvalEndTime - startTime) * 1.e3)
+    }
 
-        llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3)
-    } else {
-        console.error(`response: ${res.body} request=${payload}`)
+    const completion_time = endTime - promptEvalEndTime
+    if (completions_tokens > 0 && completion_time > 0) {
+        llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3)
     }
+
+    llamacpp_completions_truncated_rate.add(finish_reason === 'length')
+    llamacpp_completions_stop_rate.add(finish_reason === 'stop')
 
     sleep(0.3)
 }
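For context on what the streamed response is expected to carry: the script above reads only choices[0].finish_reason from each chunk and, once the server includes it, usage.prompt_tokens and usage.completion_tokens. A minimal sketch of that parsing follows; the literal chunk is a made-up example of an OpenAI-style final streaming message, not output captured from llama.cpp's server.

// Hypothetical final SSE chunk (illustrative values only).
const event = {
    data: JSON.stringify({
        choices: [{delta: {}, finish_reason: 'stop'}],
        usage: {prompt_tokens: 512, completion_tokens: 128},
    }),
}

// Same field accesses as the benchmark script's 'event' handler.
const chunk = JSON.parse(event.data)
const finish_reason = chunk.choices[0].finish_reason // 'stop' or 'length'
const prompt_tokens = chunk.usage ? chunk.usage.prompt_tokens : 0
const completion_tokens = chunk.usage ? chunk.usage.completion_tokens : 0
console.log(finish_reason, prompt_tokens, completion_tokens)

Because usage arrives only once per response, the script keeps the latest values in prompt_tokens / completions_tokens and converts them to per-second rates from three wall-clock timestamps: startTime (before the request), promptEvalEndTime (first received event), and endTime (after the stream closes).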