diff options
author | Kawrakow <48489457+ikawrakow@users.noreply.github.com> | 2024-07-27 07:55:01 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-07-27 07:55:01 +0200 |
commit | 154e0d75fccf1784fe9ff6fd76a630b66563da3d (patch) | |
tree | 81ce6dbb5b1900c1aa78a879f0593c694cab9d27 /examples/server | |
parent | 0684c3e9c70d49323b4fc517128cbe222cab7f96 (diff) |
Merge mainline llama.cpp (#3)
* Merging mainline - WIP
* Merging mainline - WIP
AVX2 and CUDA appear to work.
CUDA performance seems slightly (~1-2%) lower, as is so often
the case with llama.cpp/ggml after some "improvements" have been made.
* Merging mainline - fix Metal
* Remove check
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'examples/server')
-rw-r--r-- | examples/server/CMakeLists.txt | 15 | ||||
-rw-r--r-- | examples/server/README.md | 347 | ||||
-rw-r--r-- | examples/server/bench/bench.py | 11 | ||||
-rw-r--r-- | examples/server/public/completion.js | 4 | ||||
-rw-r--r-- | examples/server/public/index-new.html | 12 | ||||
-rw-r--r-- | examples/server/public/index.html | 182 | ||||
-rw-r--r-- | examples/server/public/json-schema-to-grammar.mjs | 306 | ||||
-rw-r--r-- | examples/server/public_simplechat/readme.md | 37 | ||||
-rw-r--r-- | examples/server/public_simplechat/simplechat.js | 79 | ||||
-rw-r--r-- | examples/server/public_simplechat/simplechat_screens.webp | bin | 0 -> 21376 bytes | |||
-rw-r--r-- | examples/server/server.cpp | 86 | ||||
-rw-r--r-- | examples/server/tests/features/passkey.feature | 1 | ||||
-rw-r--r-- | examples/server/tests/features/server.feature | 2 | ||||
-rw-r--r-- | examples/server/tests/features/steps/steps.py | 100 | ||||
-rw-r--r-- | examples/server/tests/requirements.txt | 4 | ||||
-rw-r--r-- | examples/server/themes/buttons-top/index.html | 1 | ||||
-rw-r--r-- | examples/server/themes/wild/index.html | 1 | ||||
-rw-r--r-- | examples/server/utils.hpp | 43 |
18 files changed, 969 insertions, 262 deletions
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 8365f951..dbe41f1f 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -1,7 +1,14 @@ set(TARGET llama-server) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) -option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF) +option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF) + include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + +if (MINGW) + # fix: https://github.com/ggerganov/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006 + add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER}) +endif() + set(TARGET_SRCS server.cpp utils.hpp @@ -24,6 +31,7 @@ set(PUBLIC_ASSETS prompt-formats.js json-schema-to-grammar.mjs ) + foreach(asset ${PUBLIC_ASSETS}) set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}") set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp") @@ -34,18 +42,23 @@ foreach(asset ${PUBLIC_ASSETS}) COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake" ) endforeach() + add_executable(${TARGET} ${TARGET_SRCS}) install(TARGETS ${TARGET} RUNTIME) target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}> ) + target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) + if (LLAMA_SERVER_SSL) find_package(OpenSSL REQUIRED) target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto) target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT) endif() + if (WIN32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) endif() + target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/server/README.md b/examples/server/README.md index e7fb0bf6..33a2b95c 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -5,7 +5,7 @@ Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/ Set of LLM REST APIs and a 
simple web front end to interact with llama.cpp. **Features:** - * LLM inference of F16 and quantum models on GPU and CPU + * LLM inference of F16 and quantized models on GPU and CPU * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes * Parallel decoding with multi-user support * Continuous batching @@ -15,68 +15,281 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp. The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216). -**Command line options:** - -- `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response. -- `-t N`, `--threads N`: Set the number of threads to use by CPU layers during generation. Not used by model layers that are offloaded to GPU. This option has no effect when using the maximum number of GPU layers. Default: `std::thread::hardware_concurrency()` (number of CPU cores). -- `-tb N, --threads-batch N`: Set the number of threads to use by CPU layers during batch and prompt processing (>= 32 tokens). This option has no effect if a GPU is available. Default: `--threads`. -- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)` -- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). -- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused -- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository. Default: unused -- `-hff FILE, --hf-file FILE`: Hugging Face model file. Default: unused -- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. -- `-c N`, `--ctx-size N`: Set the size of the prompt context. 
The default is `512`, but LLaMA models were built with a context of `2048`, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of `4096`. -- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance. -- `-mg i, --main-gpu i`: When using multiple GPUs, this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default, GPU `0` is used. -- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs, this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default, the data is split in proportion to VRAM, but this may not be optimal for performance. -- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048` -- `-ub N`, `--ubatch-size N`: Physical maximum batch size. Default: `512` -- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. -- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. -- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems -- `--numa distribute`: Spread execution evenly over all nodes -- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on -- `--numa numactl`: Use the CPU map provided by numactl. 
If run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437 -- `--numa`: Attempt optimizations that may help on some NUMA systems. -- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. -- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. -- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600` -- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1` -- `--port`: Set the port to listen. Default: `8080` -- `--path`: Path from which to serve static files. Default: disabled -- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys. -- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s. -- `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled -- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`. Values > 1 will allow for higher throughput with multiple parallel requests but the results will **not** be deterministic due to differences in rounding error. -- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled -- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). 
This is useful for chat applications. [See more](#change-system-prompt-on-runtime) -- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. -- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend. Used together with group attention width `--grp-attn-w`. Default: `1`, which is disabled. -- `--grp-attn-w`: Set the group attention width to extend context size through self-extend. Used together with group attention factor `--grp-attn-n`. Default: `512` -- `-n N, --n-predict N`: Set the maximum tokens to predict. Default: `-1` -- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included. -- `--metrics`: enable prometheus `/metrics` compatible endpoint. Default: disabled -- `--slot-save-path PATH`: Specifies the path where the state of slots (the prompt cache) can be stored. If not provided, the slot management endpoints will be disabled. -- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) -- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled -- `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json` -- `--rope-scaling` : RoPE scaling method. Defaults to linear unless otherwise specified by the model. Options are `none`, `linear`, `yarn` -- `--rope-freq-base N` : RoPE frequency base (default: loaded from model) -- `--rope-freq-scale N`: RoPE frequency scaling factor, expands context by a factor of 1/N (e.g. 
0.25) -- `--yarn-ext-factor N` : YaRN: extrapolation mix factor (Default: 1.0, 0.0 = full interpolation) -- `--yarn-attn-factor N` : YaRN: scale sqrt(t) or attention magnitude (default: 1.0) -- `--yarn-beta-slow N`: YaRN: High correction dim or alpha (default: 1.0) -- `--yarn-beta-fast N`: YaRN: low correction dim or beta (default: 32.0) -- `--pooling` : Pooling type for embeddings, use model default if unspecified. Options are `none`, `mean`, `cls` -- `-dt N`, `--defrag-thold N`: KV cache defragmentation threshold (default: -1.0, < 0 = disabled) -- `-fa`, `--flash-attn` : enable flash attention (default: disabled). -- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`) -- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options) - -**If compiled with `LLAMA_SERVER_SSL=ON`** -- `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key -- `--ssl-cert-file FNAME`: path to file a PEM-encoded SSL certificate +## Usage + +``` +usage: ./llama-server [options] + +general: + + -h, --help, --usage print usage and exit + --version show version and build info + -v, --verbose print verbose information + --verbosity N set specific verbosity level (default: 0) + --verbose-prompt print a verbose prompt before generation (default: false) + --no-display-prompt don't print prompt at generation (default: false) + -co, --color colorise output to distinguish prompt and user input from generations (default: false) + -s, --seed SEED RNG seed (default: -1, use random seed for < 0) + -t, --threads N number of threads to use during generation (default: 8) + -tb, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads) + -td, --threads-draft N number of threads to use during generation (default: same as --threads) + -tbd, --threads-batch-draft N number of threads to use during batch and prompt 
processing (default: same as --threads-draft) + --draft N number of tokens to draft for speculative decoding (default: 5) + -ps, --p-split N speculative decoding split probability (default: 0.1) + -lcs, --lookup-cache-static FNAME + path to static lookup cache to use for lookup decoding (not updated by generation) + -lcd, --lookup-cache-dynamic FNAME + path to dynamic lookup cache to use for lookup decoding (updated by generation) + -c, --ctx-size N size of the prompt context (default: 0, 0 = loaded from model) + -n, --predict N number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled) + -b, --batch-size N logical maximum batch size (default: 2048) + -ub, --ubatch-size N physical maximum batch size (default: 512) + --keep N number of tokens to keep from the initial prompt (default: 0, -1 = all) + --chunks N max number of chunks to process (default: -1, -1 = all) + -fa, --flash-attn enable Flash Attention (default: disabled) + -p, --prompt PROMPT prompt to start generation with + in conversation mode, this will be used as system prompt + (default: '') + -f, --file FNAME a file containing the prompt (default: none) + --in-file FNAME an input file (repeat to specify multiple files) + -bf, --binary-file FNAME binary file containing the prompt (default: none) + -e, --escape process escapes sequences (\n, \r, \t, \', \", \\) (default: true) + --no-escape do not process escape sequences + -ptc, --print-token-count N print token count every N tokens (default: -1) + --prompt-cache FNAME file to cache prompt state for faster startup (default: none) + --prompt-cache-all if specified, saves user input and generations to cache as well + not supported with --interactive or other interactive options + --prompt-cache-ro if specified, uses the prompt cache but does not update it + -r, --reverse-prompt PROMPT halt generation at PROMPT, return control in interactive mode + can be specified more than once for multiple prompts + -sp, --special special tokens 
output enabled (default: false) + -cnv, --conversation run in conversation mode, does not print special tokens and suffix/prefix + if suffix/prefix are not specified, default chat template will be used + (default: false) + -i, --interactive run in interactive mode (default: false) + -if, --interactive-first run in interactive mode and wait for input right away (default: false) + -mli, --multiline-input allows you to write or paste multiple lines without ending each in '\' + --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string + --in-prefix STRING string to prefix user inputs with (default: empty) + --in-suffix STRING string to suffix after user inputs with (default: empty) + --spm-infill use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) + +sampling: + + --samplers SAMPLERS samplers that will be used for generation in the order, separated by ';' + (default: top_k;tfs_z;typical_p;top_p;min_p;temperature) + --sampling-seq SEQUENCE simplified sequence for samplers that will be used (default: kfypmt) + --ignore-eos ignore end of stream token and continue generating (implies --logit-bias EOS-inf) + --penalize-nl penalize newline tokens (default: false) + --temp N temperature (default: 0.8) + --top-k N top-k sampling (default: 40, 0 = disabled) + --top-p N top-p sampling (default: 0.9, 1.0 = disabled) + --min-p N min-p sampling (default: 0.1, 0.0 = disabled) + --tfs N tail free sampling, parameter z (default: 1.0, 1.0 = disabled) + --typical N locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) + --repeat-last-n N last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) + --repeat-penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) + --presence-penalty N repeat alpha presence penalty (default: 0.0, 0.0 = disabled) + --frequency-penalty N repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) + 
--dynatemp-range N dynamic temperature range (default: 0.0, 0.0 = disabled) + --dynatemp-exp N dynamic temperature exponent (default: 1.0) + --mirostat N use Mirostat sampling. + Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used. + (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) + --mirostat-lr N Mirostat learning rate, parameter eta (default: 0.1) + --mirostat-ent N Mirostat target entropy, parameter tau (default: 5.0) + -l TOKEN_ID(+/-)BIAS modifies the likelihood of token appearing in the completion, + i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello', + or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' + --cfg-negative-prompt PROMPT + negative prompt to use for guidance (default: '') + --cfg-negative-prompt-file FNAME + negative prompt file to use for guidance + --cfg-scale N strength of guidance (default: 1.0, 1.0 = disable) + --chat-template JINJA_TEMPLATE + set custom jinja chat template (default: template taken from model's metadata) + if suffix/prefix are specified, template will be disabled + only commonly used templates are accepted: + https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template + +grammar: + + --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') + --grammar-file FNAME file to read grammar from + -j, --json-schema SCHEMA JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object + For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead + +embedding: + + --pooling {none,mean,cls,last} + pooling type for embeddings, use model default if unspecified + --attention {causal,non-causal} + attention type for embeddings, use model default if unspecified + +context hacking: + + --rope-scaling {none,linear,yarn} + RoPE frequency scaling method, defaults to linear unless specified by the model + --rope-scale N RoPE context scaling factor, expands context by a factor of N + --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model) + --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N + --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size) + --yarn-ext-factor N YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation) + --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0) + --yarn-beta-slow N YaRN: high correction dim or alpha (default: 1.0) + --yarn-beta-fast N YaRN: low correction dim or beta (default: 32.0) + -gan, --grp-attn-n N group-attention factor (default: 1) + -gaw, --grp-attn-w N group-attention width (default: 512.0) + -dkvc, --dump-kv-cache verbose print of the KV cache + -nkvo, --no-kv-offload disable KV offload + -ctk, --cache-type-k TYPE KV cache data type for K (default: f16) + -ctv, --cache-type-v TYPE KV cache data type for V (default: f16) + +perplexity: + + --all-logits return logits for all tokens in the batch (default: false) + --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f + --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: 400) + --winogrande compute Winogrande score over random tasks from datafile supplied with -f + --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: 0) + --multiple-choice compute multiple 
choice score over random tasks from datafile supplied with -f + --multiple-choice-tasks N + number of tasks to use when computing the multiple choice score (default: 0) + --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base + --ppl-stride N stride for perplexity calculation (default: 0) + --ppl-output-type {0,1} output type for perplexity calculation (default: 0) + +parallel: + + -dt, --defrag-thold N KV cache defragmentation threshold (default: -1.0, < 0 - disabled) + -np, --parallel N number of parallel sequences to decode (default: 1) + -ns, --sequences N number of sequences to decode (default: 1) + -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled) + +multi-modality: + + --mmproj FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md + --image FILE path to an image file. use with multimodal models. Specify multiple times for batching + +backend: + + --rpc SERVERS comma separated list of RPC servers + --mlock force system to keep model in RAM rather than swapping or compressing + --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock) + --numa TYPE attempt optimizations that help on some NUMA systems + - distribute: spread execution evenly over all nodes + - isolate: only spawn threads on CPUs on the node that execution started on + - numactl: use the CPU map provided by numactl + if run without this previously, it is recommended to drop the system page cache before using this + see https://github.com/ggerganov/llama.cpp/issues/1437 + +model: + + --check-tensors check model tensor data for invalid values (default: false) + --override-kv KEY=TYPE:VALUE + advanced option to override model metadata by key. may be specified multiple times. + types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false + --lora FNAME apply LoRA adapter (implies --no-mmap) + --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap) + --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter + --control-vector FNAME add a control vector + note: this argument can be repeated to add multiple control vectors + --control-vector-scaled FNAME SCALE + add a control vector with user defined scaling SCALE + note: this argument can be repeated to add multiple scaled control vectors + --control-vector-layer-range START END + layer range to apply the control vector(s) to, start and end inclusive + -m, --model FNAME model path (default: models/$filename with filename from --hf-file + or --model-url if set, otherwise models/7B/ggml-model-f16.gguf) + -md, --model-draft FNAME draft model for speculative decoding (default: unused) + -mu, --model-url MODEL_URL model download url (default: unused) + -hfr, --hf-repo REPO Hugging Face model repository (default: unused) + -hff, --hf-file FILE Hugging Face model file (default: unused) + -hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment variable) + +retrieval: + + --context-file FNAME file to load context from (repeat to specify multiple files) + --chunk-size N minimum length of embedded text chunks (default: 64) + --chunk-separator STRING + separator between chunks (default: ' + ') + +passkey: + + --junk N number of times to repeat the junk text (default: 250) + --pos N position of the passkey in the junk text (default: -1) + +imatrix: + + -o, --output FNAME output file (default: 'imatrix.dat') + --output-frequency N output the imatrix every N iterations (default: 10) + --save-frequency N save an imatrix copy every N iterations (default: 0) + --process-output collect data for the output tensor (default: false) + --no-ppl do not compute perplexity (default: true) + --chunk N start 
processing the input from chunk N (default: 0) + +bench: + + -pps is the prompt shared across parallel sequences (default: false) + -npp n0,n1,... number of prompt tokens + -ntg n0,n1,... number of text generation tokens + -npl n0,n1,... number of parallel prompts + +embedding: + + --embd-normalize normalisation for embendings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) + --embd-output-format empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix + --embd-separator separator of embendings (default \n) for example "<#sep#>" + +server: + + --host HOST ip address to listen (default: 127.0.0.1) + --port PORT port to listen (default: 8080) + --path PATH path to serve static files from (default: ) + --embedding(s) enable embedding endpoint (default: disabled) + --api-key KEY API key to use for authentication (default: none) + --api-key-file FNAME path to file containing API keys (default: none) + --ssl-key-file FNAME path to file a PEM-encoded SSL private key + --ssl-cert-file FNAME path to file a PEM-encoded SSL certificate + --timeout N server read/write timeout in seconds (default: 600) + --threads-http N number of threads used to process HTTP requests (default: -1) + --system-prompt-file FNAME + set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications + --log-format {text,json} + log output format: json or text (default: json) + --metrics enable prometheus compatible metrics endpoint (default: disabled) + --no-slots disables slots monitoring endpoint (default: enabled) + --slot-save-path PATH path to save slot kv cache (default: disabled) + --chat-template JINJA_TEMPLATE + set custom jinja chat template (default: template taken from model's metadata) + only commonly used templates are accepted: + https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template + -sps, --slot-prompt-similarity SIMILARITY 
+ how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled) + + +logging: + + --simple-io use basic IO for better compatibility in subprocesses and limited consoles + -ld, --logdir LOGDIR path under which to save YAML logs (no logging if unset) + --log-test Run simple logging test + --log-disable Disable trace logs + --log-enable Enable trace logs + --log-file FNAME Specify a log filename (without extension) + --log-new Create a separate new log file on start. Each log file will have unique name: "<name>.<ID>.log" + --log-append Don't truncate the old log file. + +cvector: + + -o, --output FNAME output file (default: 'control_vector.gguf') + --positive-file FNAME positive prompts file, one prompt per line (default: 'examples/cvector-generator/positive.txt') + --negative-file FNAME negative prompts file, one prompt per line (default: 'examples/cvector-generator/negative.txt') + --pca-batch N batch size used for PCA. Larger batch runs faster, but uses more memory (default: 100) + --pca-iter N number of iterations used for PCA (default: 1000) + --method {pca,mean} dimensionality reduction method to be used (default: pca) +``` + ## Build @@ -231,7 +444,7 @@ node index.js `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity. - `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. + `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token. By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt. 
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. @@ -365,7 +578,8 @@ Notice that each `probs` is an array of length `n_probs`. "assistant_name": "", "user_name": "", "default_generation_settings": { ... }, - "total_slots": 1 + "total_slots": 1, + "chat_template": "" } ``` @@ -373,8 +587,9 @@ Notice that each `probs` is an array of length `n_probs`. - `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots. - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint. - `total_slots` - the total number of slots for process requests (defined by `--parallel` option) +- `chat_template` - the model's original Jinja2 prompt template -- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only model with [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, ChatML template will be used. +- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. 
Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. *Options:* diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py index 4fbbb203..2daac088 100644 --- a/examples/server/bench/bench.py +++ b/examples/server/bench/bench.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import argparse import json import os @@ -59,10 +61,11 @@ def main(args_in: list[str] | None = None) -> None: sys.exit(1) # start the benchmark + iterations = 0 + data = {} try: start_benchmark(args) - iterations = 0 with open("results.github.env", 'w') as github_env: # parse output with open('k6-results.json', 'r') as bench_results: @@ -129,7 +132,7 @@ def main(args_in: list[str] | None = None) -> None: timestamps, metric_values = zip(*values) metric_values = [float(value) for value in metric_values] prometheus_metrics[metric] = metric_values - timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps] + timestamps_dt = [str(datetime.fromtimestamp(int(ts))) for ts in timestamps] plt.figure(figsize=(16, 10), dpi=80) plt.plot(timestamps_dt, metric_values, label=metric) plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7) @@ -156,7 +159,7 @@ def main(args_in: list[str] | None = None) -> None: plt.close() # Mermaid format in case images upload failed - with (open(f"{metric}.mermaid", 'w') as mermaid_f): + with open(f"{metric}.mermaid", 'w') as mermaid_f: mermaid = ( f"""--- config: @@ -278,7 +281,7 @@ def start_server_background(args): } server_process = subprocess.Popen( args, - **pkwargs) + **pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue] def server_log(in_stream, out_stream): for line in iter(in_stream.readline, b''): diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js index 987b9a3b..36818f76 100644 --- 
a/examples/server/public/completion.js +++ b/examples/server/public/completion.js @@ -21,7 +21,7 @@ let generation_settings = null; // export async function* llama(prompt, params = {}, config = {}) { let controller = config.controller; - const api_url = config.api_url || ""; + const api_url = config.api_url?.replace(/\/+$/, '') || ""; if (!controller) { controller = new AbortController(); @@ -196,7 +196,7 @@ export const llamaComplete = async (params, controller, callback) => { // Get the model info from the server. This is useful for getting the context window and so on. export const llamaModelInfo = async (config = {}) => { if (!generation_settings) { - const api_url = config.api_url || ""; + const api_url = config.api_url?.replace(/\/+$/, '') || ""; const props = await fetch(`${api_url}/props`).then(r => r.json()); generation_settings = props.default_generation_settings; } diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html index 19c9f643..c87dd8f1 100644 --- a/examples/server/public/index-new.html +++ b/examples/server/public/index-new.html @@ -14,10 +14,10 @@ <script type="module"> import { html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component - } from '/index.js'; + } from './index.js'; - import { llama } from '/completion.js'; - import { SchemaConverter } from '/json-schema-to-grammar.mjs'; + import { llama } from './completion.js'; + import { SchemaConverter } from './json-schema-to-grammar.mjs'; import { promptFormats } from './prompt-formats.js'; import { systemPrompts } from './system-prompts.js'; // multilingual is wip let selected_image = false; @@ -225,7 +225,7 @@ throw new Error("already running"); } controller.value = new AbortController(); - for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) { + for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: new URL('.', document.baseURI).href })) { const data = 
chunk.data; if (data.stop) { while ( @@ -634,12 +634,12 @@ return html` <div> <div class="grammar"> <label for="template"></label> - <textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON-Scheme + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/> + <textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON Schema + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/> </div> <div class="grammar-columns"> <div class="json-schema-controls"> <input type="text" name="prop-order" placeholder="Order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} /> - <button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON-Scheme</button> + <button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button> </div> </div> </div> diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 2f60a76e..07fec6a3 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -1,5 +1,4 @@ <html> - <head> <meta charset="UTF-8"> <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" /> @@ -132,12 +131,20 @@ align-items: stretch; } - .right { + .message-controls { display: flex; - flex-direction: row; - gap: 0.5em; justify-content: flex-end; } + .message-controls > div:nth-child(2) { + display: flex; + flex-direction: column; + gap: 0.5em; + } + .message-controls > div:nth-child(2) > div { + display: flex; + margin-left: auto; + gap: 0.5em; + } fieldset { border: none; @@ -276,6 +283,7 @@ import { llama } from './completion.js'; import { SchemaConverter } from './json-schema-to-grammar.mjs'; + let selected_image = false; var slot_id = -1; @@ -447,6 +455,9 @@ /* END: Support for storing prompt templates and parameters in browsers LocalStorage */ + const tts = window.speechSynthesis; + const ttsVoice = signal(null) + const llamaStats = signal(null) const 
controller = signal(null) @@ -479,7 +490,7 @@ throw new Error("already running"); } controller.value = new AbortController(); - for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: location.pathname.replace(/\/+$/, '') })) { + for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: new URL('.', document.baseURI).href })) { const data = chunk.data; if (data.stop) { @@ -596,8 +607,51 @@ }); } + const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition; + const talkRecognition = SpeechRecognition ? new SpeechRecognition() : null; function MessageInput() { - const message = useSignal("") + const message = useSignal(""); + + const talkActive = useSignal(false); + const sendOnTalk = useSignal(false); + const talkStop = (e) => { + if (e) e.preventDefault(); + + talkActive.value = false; + talkRecognition?.stop(); + } + const talk = (e) => { + e.preventDefault(); + + if (talkRecognition) + talkRecognition.start(); + else + alert("Speech recognition is not supported by this browser."); + } + if(talkRecognition) { + talkRecognition.onstart = () => { + talkActive.value = true; + } + talkRecognition.onresult = (e) => { + if (event.results.length > 0) { + message.value = event.results[0][0].transcript; + if (sendOnTalk.value) { + submit(e); + } + } + } + talkRecognition.onspeechend = () => { + talkStop(); + } + } + + const ttsVoices = useSignal(tts?.getVoices() || []); + const ttsVoiceDefault = computed(() => ttsVoices.value.find(v => v.default)); + if (tts) { + tts.onvoiceschanged = () => { + ttsVoices.value = tts.getVoices(); + } + } const submit = (e) => { stop(e); @@ -624,11 +678,45 @@ value="${message}" /> </div> - <div class="right"> - <button type="submit" disabled=${generating.value}>Send</button> - <button onclick=${uploadImage}>Upload Image</button> - <button onclick=${stop} disabled=${!generating.value}>Stop</button> - <button onclick=${reset}>Reset</button> + 
<div class="message-controls"> + <div> </div> + <div> + <div> + <button type="submit" disabled=${generating.value || talkActive.value}>Send</button> + <button disabled=${generating.value || talkActive.value} onclick=${uploadImage}>Upload Image</button> + <button onclick=${stop} disabled=${!generating.value}>Stop</button> + <button onclick=${reset}>Reset</button> + </div> + <div> + <a href="#" style="cursor: help;" title="Help" onclick=${e => { + e.preventDefault(); + alert(`STT supported by your browser: ${SpeechRecognition ? 'Yes' : 'No'}\n` + + `(TTS and speech recognition are not provided by llama.cpp)\n` + + `Note: STT requires HTTPS to work.`); + }}>[?]</a> + <button disabled=${generating.value} onclick=${talkActive.value ? talkStop : talk}>${talkActive.value ? "Stop Talking" : "Talk"}</button> + <div> + <input type="checkbox" id="send-on-talk" name="send-on-talk" checked="${sendOnTalk}" onchange=${(e) => sendOnTalk.value = e.target.checked} /> + <label for="send-on-talk" style="line-height: initial;">Send after talking</label> + </div> + </div> + <div> + <a href="#" style="cursor: help;" title="Help" onclick=${e => { + e.preventDefault(); + alert(`TTS supported by your browser: ${tts ? 'Yes' : 'No'}\n(TTS and speech recognition are not provided by llama.cpp)`); + }}>[?]</a> + <label for="tts-voices" style="line-height: initial;">Bot Voice:</label> + <select id="tts-voices" name="tts-voices" onchange=${(e) => ttsVoice.value = e.target.value} style="max-width: 100px;"> + <option value="" selected="${!ttsVoice.value}">None</option> + ${[ + ...(ttsVoiceDefault.value ? [ttsVoiceDefault.value] : []), + ...ttsVoices.value.filter(v => !v.default), + ].map( + v => html`<option value="${v.name}" selected="${ttsVoice.value === v.name}">${v.name} (${v.lang}) ${v.default ? 
'(default)' : ''}</option>` + )} + </select> + </div> + </div> </div> </form> ` @@ -659,26 +747,86 @@ } }, [messages]) + const ttsChatLineActiveIx = useSignal(undefined); + const ttsChatLine = (e, ix, msg) => { + if (e) e.preventDefault(); + + if (!tts || !ttsVoice.value || !('SpeechSynthesisUtterance' in window)) return; + + const ttsVoices = tts.getVoices(); + const voice = ttsVoices.find(v => v.name === ttsVoice.value); + if (!voice) return; + + if (ttsChatLineActiveIx.value !== undefined) { + tts.cancel(); + if (ttsChatLineActiveIx.value === ix) { + ttsChatLineActiveIx.value = undefined; + return; + } + } + + ttsChatLineActiveIx.value = ix; + let ttsUtter = new SpeechSynthesisUtterance(msg); + ttsUtter.voice = voice; + ttsUtter.onend = e => { + ttsChatLineActiveIx.value = undefined; + }; + tts.speak(ttsUtter); + } + const isCompletionMode = session.value.type === 'completion' + + // Try play the last bot message + const lastCharChatLinesIxs = useSignal([]); + const lastCharChatLinesIxsOld = useSignal([]); + useEffect(() => { + if ( + !isCompletionMode + && lastCharChatLinesIxs.value.length !== lastCharChatLinesIxsOld.value.length + && !generating.value + ) { + const ix = lastCharChatLinesIxs.value[lastCharChatLinesIxs.value.length - 1]; + if (ix !== undefined) { + const msg = messages[ix]; + ttsChatLine(null, ix, Array.isArray(msg) ? msg[1].map(m => m.content).join('') : msg); + } + + lastCharChatLinesIxsOld.value = structuredClone(lastCharChatLinesIxs.value); + } + }, [generating.value]); + const chatLine = ([user, data], index) => { let message - const isArrayMessage = Array.isArray(data) + const isArrayMessage = Array.isArray(data); + const text = isArrayMessage ? + data.map(msg => msg.content).join('') : + data; if (params.value.n_probs > 0 && isArrayMessage) { message = html`<${Probabilities} data=${data} />` } else { - const text = isArrayMessage ? - data.map(msg => msg.content).join('') : - data; message = isCompletionMode ? 
text : html`<${Markdownish} text=${template(text)} />` } + + const fromBot = user && user === '{{char}}'; + if (fromBot && !lastCharChatLinesIxs.value.includes(index)) + lastCharChatLinesIxs.value.push(index); + if (user) { - return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>` + return html` + <div> + <p key=${index}><strong>${template(user)}:</strong> ${message}</p> + ${ + fromBot && ttsVoice.value + && html`<button disabled=${generating.value} onclick=${e => ttsChatLine(e, index, text)} aria-label=${ttsChatLineActiveIx.value === index ? 'Pause' : 'Play'}>${ ttsChatLineActiveIx.value === index ? '⏸️' : '▶️' }</div>` + } + </div> + `; } else { return isCompletionMode ? html`<span key=${index}>${message}</span>` : - html`<p key=${index}>${message}</p>` + html`<div><p key=${index}>${message}</p></div>` } }; diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs index faed6a32..7267f3f9 100644 --- a/examples/server/public/json-schema-to-grammar.mjs +++ b/examples/server/public/json-schema-to-grammar.mjs @@ -24,6 +24,201 @@ function _buildRepetition(itemRule, minItems, maxItems, opts={}) { return minItems === 0 ? 
`(${result})?` : result; } +function _generateMinMaxInt(minValue, maxValue, out, decimalsLeft = 16, topLevel = true) { + const hasMin = minValue !== null; + const hasMax = maxValue !== null; + + function digitRange(fromChar, toChar) { + out.push("["); + if (fromChar === toChar) { + out.push(fromChar); + } else { + out.push(fromChar); + out.push("-"); + out.push(toChar); + } + out.push("]"); + } + + function moreDigits(minDigits, maxDigits) { + out.push("[0-9]"); + if (minDigits === maxDigits && minDigits === 1) { + return; + } + out.push("{"); + out.push(minDigits.toString()); + if (maxDigits !== minDigits) { + out.push(","); + if (maxDigits !== Number.MAX_SAFE_INTEGER) { + out.push(maxDigits.toString()); + } + } + out.push("}"); + } + + function uniformRange(fromStr, toStr) { + let i = 0; + while (i < fromStr.length && fromStr[i] === toStr[i]) { + i++; + } + if (i > 0) { + out.push("\""); + out.push(fromStr.slice(0, i)); + out.push("\""); + } + if (i < fromStr.length) { + if (i > 0) { + out.push(" "); + } + const subLen = fromStr.length - i - 1; + if (subLen > 0) { + const fromSub = fromStr.slice(i + 1); + const toSub = toStr.slice(i + 1); + const subZeros = "0".repeat(subLen); + const subNines = "9".repeat(subLen); + + let toReached = false; + out.push("("); + if (fromSub === subZeros) { + digitRange(fromStr[i], String.fromCharCode(toStr.charCodeAt(i) - 1)); + out.push(" "); + moreDigits(subLen, subLen); + } else { + out.push("["); + out.push(fromStr[i]); + out.push("] "); + out.push("("); + uniformRange(fromSub, subNines); + out.push(")"); + if (fromStr.charCodeAt(i) < toStr.charCodeAt(i) - 1) { + out.push(" | "); + if (toSub === subNines) { + digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), toStr[i]); + toReached = true; + } else { + digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), String.fromCharCode(toStr.charCodeAt(i) - 1)); + } + out.push(" "); + moreDigits(subLen, subLen); + } + } + if (!toReached) { + out.push(" | "); + 
digitRange(toStr[i], toStr[i]); + out.push(" "); + uniformRange(subZeros, toSub); + } + out.push(")"); + } else { + out.push("["); + out.push(fromStr[i]); + out.push("-"); + out.push(toStr[i]); + out.push("]"); + } + } + } + + if (hasMin && hasMax) { + if (minValue < 0 && maxValue < 0) { + out.push("\"-\" ("); + _generateMinMaxInt(-maxValue, -minValue, out, decimalsLeft, true); + out.push(")"); + return; + } + + if (minValue < 0) { + out.push("\"-\" ("); + _generateMinMaxInt(0, -minValue, out, decimalsLeft, true); + out.push(") | "); + minValue = 0; + } + + let minS = minValue.toString(); + const maxS = maxValue.toString(); + const minDigits = minS.length; + const maxDigits = maxS.length; + + for (let digits = minDigits; digits < maxDigits; digits++) { + uniformRange(minS, "9".repeat(digits)); + minS = "1" + "0".repeat(digits); + out.push(" | "); + } + uniformRange(minS, maxS); + return; + } + + const lessDecimals = Math.max(decimalsLeft - 1, 1); + + if (hasMin) { + if (minValue < 0) { + out.push("\"-\" ("); + _generateMinMaxInt(null, -minValue, out, decimalsLeft, false); + out.push(") | [0] | [1-9] "); + moreDigits(0, decimalsLeft - 1); + } else if (minValue === 0) { + if (topLevel) { + out.push("[0] | [1-9] "); + moreDigits(0, lessDecimals); + } else { + moreDigits(1, decimalsLeft); + } + } else if (minValue <= 9) { + const c = minValue.toString(); + const range_start = topLevel ? '1' : '0'; + if (c > range_start) { + digitRange(range_start, String.fromCharCode(c.charCodeAt(0) - 1)); + out.push(" "); + moreDigits(1, lessDecimals); + out.push(" | "); + } + digitRange(c, "9"); + out.push(" "); + moreDigits(0, lessDecimals); + } else { + const minS = minValue.toString(); + const length = minS.length; + const c = minS[0]; + + if (c > "1") { + digitRange(topLevel ? 
"1" : "0", String.fromCharCode(c.charCodeAt(0) - 1)); + out.push(" "); + moreDigits(length, lessDecimals); + out.push(" | "); + } + digitRange(c, c); + out.push(" ("); + _generateMinMaxInt(parseInt(minS.slice(1)), null, out, lessDecimals, false); + out.push(")"); + if (c < "9") { + out.push(" | "); + digitRange(String.fromCharCode(c.charCodeAt(0) + 1), "9"); + out.push(" "); + moreDigits(length - 1, lessDecimals); + } + } + return; + } + + if (hasMax) { + if (maxValue >= 0) { + if (topLevel) { + out.push("\"-\" [1-9] "); + moreDigits(0, lessDecimals); + out.push(" | "); + } + _generateMinMaxInt(0, maxValue, out, decimalsLeft, true); + } else { + out.push("\"-\" ("); + _generateMinMaxInt(-maxValue, null, out, decimalsLeft, false); + out.push(")"); + } + return; + } + + throw new Error("At least one of minValue or maxValue must be set"); +} + class BuiltinRule { constructor(content, deps) { this.content = content; @@ -64,7 +259,7 @@ const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g; const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' }; const NON_LITERAL_SET = new Set('|.()[]{}*+?'); -const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('[]()|{}*+?'); +const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('^$.[]()|{}*+?'); export class SchemaConverter { constructor(options) { @@ -337,6 +532,64 @@ export class SchemaConverter { return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space") } + _notStrings(strings) { + class TrieNode { + constructor() { + this.children = {}; + this.isEndOfString = false; + } + + insert(str) { + let node = this; + for (const c of str) { + node = node.children[c] = node.children[c] || new TrieNode(); + } + node.isEndOfString = true; + } + } + + const trie = new TrieNode(); + for (const s of strings) { + trie.insert(s); + } + + const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']); + const out = ['["] ( ']; + + const visit = (node) => { + const 
rejects = []; + let first = true; + for (const c of Object.keys(node.children).sort()) { + const child = node.children[c]; + rejects.push(c); + if (first) { + first = false; + } else { + out.push(' | '); + } + out.push(`[${c}]`); + if (Object.keys(child.children).length > 0) { + out.push(' ('); + visit(child); + out.push(')'); + } else if (child.isEndOfString) { + out.push(` ${charRuleName}+`); + } + } + if (Object.keys(node.children).length > 0) { + if (!first) { + out.push(' | '); + } + out.push(`[^"${rejects.join('')}] ${charRuleName}*`); + } + }; + + visit(trie); + + out.push(` )${trie.isEndOfString ? '' : '?'} ["] space`); + return out.join(''); + } + _resolveRef(ref) { let refName = ref.split('/').pop(); if (!(refName in this._rules) && !this._refsBeingResolved.has(ref)) { @@ -363,11 +616,11 @@ export class SchemaConverter { } else if (schema.oneOf || schema.anyOf) { return this._addRule(ruleName, this._generateUnionRule(name, schema.oneOf || schema.anyOf)); } else if (Array.isArray(schemaType)) { - return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({ type: t })))); + return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({...schema, type: t})))); } else if ('const' in schema) { - return this._addRule(ruleName, this._generateConstantRule(schema.const)); + return this._addRule(ruleName, this._generateConstantRule(schema.const) + ' space'); } else if ('enum' in schema) { - const rule = schema.enum.map(v => this._generateConstantRule(v)).join(' | '); + const rule = '(' + schema.enum.map(v => this._generateConstantRule(v)).join(' | ') + ') space'; return this._addRule(ruleName, rule); } else if ((schemaType === undefined || schemaType === 'object') && ('properties' in schema || @@ -404,7 +657,7 @@ export class SchemaConverter { } } - return this._addRule(ruleName, this._buildObjectRule(properties, required, name, /* additionalProperties= */ false)); + return this._addRule(ruleName, 
this._buildObjectRule(properties, required, name, null)); } else if ((schemaType === undefined || schemaType === 'array') && ('items' in schema || 'prefixItems' in schema)) { const items = schema.items ?? schema.prefixItems; if (Array.isArray(items)) { @@ -435,6 +688,24 @@ export class SchemaConverter { const minLen = schema.minLength || 0; const maxLen = schema.maxLength; return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space'); + } else if (schemaType === 'integer' && ('minimum' in schema || 'exclusiveMinimum' in schema || 'maximum' in schema || 'exclusiveMaximum' in schema)) { + let minValue = null; + let maxValue = null; + if ('minimum' in schema) { + minValue = schema.minimum; + } else if ('exclusiveMinimum' in schema) { + minValue = schema.exclusiveMinimum + 1; + } + if ('maximum' in schema) { + maxValue = schema.maximum; + } else if ('exclusiveMaximum' in schema) { + maxValue = schema.exclusiveMaximum - 1; + } + + const out = ["("]; + _generateMinMaxInt(minValue, maxValue, out); + out.push(") space"); + return this._addRule(ruleName, out.join('')); } else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) { return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object'])); } else { @@ -480,12 +751,19 @@ export class SchemaConverter { const requiredProps = sortedProps.filter(k => required.has(k)); const optionalProps = sortedProps.filter(k => !required.has(k)); - if (typeof additionalProperties === 'object' || additionalProperties === true) { + if (additionalProperties) { const subName = `${name ?? ''}${name ? '-' : ''}additional`; - const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`); + const valueRule = + additionalProperties != null && typeof additionalProperties === 'object' ? 
this.visit(additionalProperties, `${subName}-value`) + : this._addPrimitive('value', PRIMITIVE_RULES['value']); + + const key_rule = + sortedProps.length === 0 ? this._addPrimitive('string', PRIMITIVE_RULES['string']) + : this._addRule(`${subName}-k`, this._notStrings(sortedProps)); + propKvRuleNames['*'] = this._addRule( `${subName}-kv`, - `${this._addPrimitive('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`); + `${key_rule} ":" space ${valueRule}`); optionalProps.push('*'); } @@ -502,15 +780,11 @@ export class SchemaConverter { const [k, ...rest] = ks; const kvRuleName = propKvRuleNames[k]; let res; - if (k === '*') { - res = this._addRule( - `${name ?? ''}${name ? '-' : ''}additional-kvs`, - `${kvRuleName} ( "," space ` + kvRuleName + ` )*` - ) - } else if (firstIsOptional) { - res = `( "," space ${kvRuleName} )?`; + const commaRef = `( "," space ${kvRuleName} )`; + if (firstIsOptional) { + res = commaRef + (k === '*' ? '*' : '?'); } else { - res = kvRuleName; + res = kvRuleName + (k === '*' ? ' ' + commaRef + '*' : ''); } if (rest.length > 0) { res += ' ' + this._addRule( diff --git a/examples/server/public_simplechat/readme.md b/examples/server/public_simplechat/readme.md index 2dc17782..21410199 100644 --- a/examples/server/public_simplechat/readme.md +++ b/examples/server/public_simplechat/readme.md @@ -3,6 +3,13 @@ by Humans for All. +## quickstart + +To run from the build dir + +bin/llama-server -m path/model.gguf --path ../examples/server/public_simplechat + +Continue reading for the details. ## overview @@ -14,6 +21,8 @@ own system prompts. This allows seeing the generated text / ai-model response in oneshot at the end, after it is fully generated, or potentially as it is being generated, in a streamed manner from the server/ai-model. + + Auto saves the chat session locally as and when the chat is progressing and inturn at a later time when you open SimpleChat, option is provided to restore the old chat session, if a matching one exists. 
@@ -170,17 +179,23 @@ It is attached to the document object. Some of these can also be updated using t The histogram/freq based trimming logic is currently tuned for english language wrt its is-it-a-alpabetic|numeral-char regex match logic. - chatRequestOptions - maintains the list of options/fields to send along with chat request, + apiRequestOptions - maintains the list of options/fields to send along with api request, irrespective of whether /chat/completions or /completions endpoint. If you want to add additional options/fields to send to the server/ai-model, and or modify the existing options value or remove them, for now you can update this global var using browser's development-tools/console. - For string and numeric fields in chatRequestOptions, including even those added by a user - at runtime by directly modifying gMe.chatRequestOptions, setting ui entries will be auto + For string, numeric and boolean fields in apiRequestOptions, including even those added by a + user at runtime by directly modifying gMe.apiRequestOptions, setting ui entries will be auto created. + cache_prompt option supported by example/server is allowed to be controlled by user, so that + any caching supported wrt system-prompt and chat history, if usable can get used. When chat + history sliding window is enabled, cache_prompt logic may or may not kick in at the backend + wrt same, based on aspects related to model, positional encoding, attention mechanism etal. + However system prompt should ideally get the benefit of caching. + headers - maintains the list of http headers sent when request is made to the server. By default Content-Type is set to application/json. Additionally Authorization entry is provided, which can be set if needed using the settings ui. @@ -197,10 +212,10 @@ It is attached to the document object. Some of these can also be updated using t >0 : Send the latest chat history from the latest system prompt, limited to specified cnt. 
-By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the -implications of loading of the ai-model's context window by chat history, wrt chat response to -some extent in a simple crude way. You may also want to control the context size enabled when -the server loads ai-model, on the server end. +By using gMe's iRecentUserMsgCnt and apiRequestOptions.max_tokens/n_predict one can try to control +the implications of loading of the ai-model's context window by chat history, wrt chat response to +some extent in a simple crude way. You may also want to control the context size enabled when the +server loads ai-model, on the server end. Sometimes the browser may be stuborn with caching of the file, so your updates to html/css/js @@ -237,12 +252,12 @@ also be started with a model context size of 1k or more, to be on safe side. internal n_predict, for now add the same here on the client side, maybe later add max_tokens to /completions endpoint handling code on server side. -NOTE: One may want to experiment with frequency/presence penalty fields in chatRequestOptions -wrt the set of fields sent to server along with the user query. To check how the model behaves +NOTE: One may want to experiment with frequency/presence penalty fields in apiRequestOptions +wrt the set of fields sent to server along with the user query, to check how the model behaves wrt repeatations in general in the generated text response. A end-user can change these behaviour by editing gMe from browser's devel-tool/console or by -using the providing settings ui. +using the provided settings ui (for settings exposed through the ui). ### OpenAi / Equivalent API WebService @@ -253,7 +268,7 @@ for a minimal chatting experimentation by setting the below. 
* the baseUrl in settings ui * https://api.openai.com/v1 or similar -* Wrt request body - gMe.chatRequestOptions +* Wrt request body - gMe.apiRequestOptions * model (settings ui) * any additional fields if required in future diff --git a/examples/server/public_simplechat/simplechat.js b/examples/server/public_simplechat/simplechat.js index 25afb256..8e0df3b6 100644 --- a/examples/server/public_simplechat/simplechat.js +++ b/examples/server/public_simplechat/simplechat.js @@ -222,8 +222,8 @@ class SimpleChat { * @param {Object} obj */ request_jsonstr_extend(obj) { - for(let k in gMe.chatRequestOptions) { - obj[k] = gMe.chatRequestOptions[k]; + for(let k in gMe.apiRequestOptions) { + obj[k] = gMe.apiRequestOptions[k]; } if (gMe.bStream) { obj["stream"] = true; @@ -740,11 +740,12 @@ class Me { "Authorization": "", // Authorization: Bearer OPENAI_API_KEY } // Add needed fields wrt json object to be sent wrt LLM web services completions endpoint. - this.chatRequestOptions = { + this.apiRequestOptions = { "model": "gpt-3.5-turbo", "temperature": 0.7, "max_tokens": 1024, "n_predict": 1024, + "cache_prompt": false, //"frequency_penalty": 1.2, //"presence_penalty": 1.2, }; @@ -800,51 +801,55 @@ class Me { ui.el_create_append_p(`bStream:${this.bStream}`, elDiv); - ui.el_create_append_p(`bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`, elDiv); - - ui.el_create_append_p(`bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`, elDiv); - ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv); + ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv); + ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv); - ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv); + ui.el_create_append_p(`bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`, elDiv); + + ui.el_create_append_p(`bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`, elDiv); } - 
ui.el_create_append_p(`chatRequestOptions:${JSON.stringify(this.chatRequestOptions, null, " - ")}`, elDiv); + ui.el_create_append_p(`apiRequestOptions:${JSON.stringify(this.apiRequestOptions, null, " - ")}`, elDiv); ui.el_create_append_p(`headers:${JSON.stringify(this.headers, null, " - ")}`, elDiv); } /** - * Auto create ui input elements for fields in ChatRequestOptions + * Auto create ui input elements for fields in apiRequestOptions * Currently supports text and number field types. * @param {HTMLDivElement} elDiv */ - show_settings_chatrequestoptions(elDiv) { + show_settings_apirequestoptions(elDiv) { let typeDict = { "string": "text", "number": "number", }; let fs = document.createElement("fieldset"); let legend = document.createElement("legend"); - legend.innerText = "ChatRequestOptions"; + legend.innerText = "ApiRequestOptions"; fs.appendChild(legend); elDiv.appendChild(fs); - for(const k in this.chatRequestOptions) { - let val = this.chatRequestOptions[k]; + for(const k in this.apiRequestOptions) { + let val = this.apiRequestOptions[k]; let type = typeof(val); - if (!((type == "string") || (type == "number"))) { - continue; + if (((type == "string") || (type == "number"))) { + let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.apiRequestOptions[k], (val)=>{ + if (type == "number") { + val = Number(val); + } + this.apiRequestOptions[k] = val; + }); + fs.appendChild(inp.div); + } else if (type == "boolean") { + let bbtn = ui.el_creatediv_boolbutton(`Set{k}`, k, {true: "true", false: "false"}, val, (userVal)=>{ + this.apiRequestOptions[k] = userVal; + }); + fs.appendChild(bbtn.div); } - let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.chatRequestOptions[k], (val)=>{ - if (type == "number") { - val = Number(val); - } - this.chatRequestOptions[k] = val; - }); - fs.appendChild(inp.div); } } @@ -870,32 +875,32 @@ class Me { }); elDiv.appendChild(bb.div); - bb = ui.el_creatediv_boolbutton("SetCompletionFreshChatAlways", 
"CompletionFreshChatAlways", {true: "[+] yes fresh", false: "[-] no, with history"}, this.bCompletionFreshChatAlways, (val)=>{ - this.bCompletionFreshChatAlways = val; + bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{ + this.bTrimGarbage = val; }); elDiv.appendChild(bb.div); - bb = ui.el_creatediv_boolbutton("SetCompletionInsertStandardRolePrefix", "CompletionInsertStandardRolePrefix", {true: "[+] yes insert", false: "[-] dont insert"}, this.bCompletionInsertStandardRolePrefix, (val)=>{ - this.bCompletionInsertStandardRolePrefix = val; - }); - elDiv.appendChild(bb.div); + this.show_settings_apirequestoptions(elDiv); - bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{ - this.bTrimGarbage = val; + let sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{ + this.apiEP = ApiEP.Type[val]; }); - elDiv.appendChild(bb.div); + elDiv.appendChild(sel.div); - let sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{ + sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{ this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val]; }); elDiv.appendChild(sel.div); - sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{ - this.apiEP = ApiEP.Type[val]; + bb = ui.el_creatediv_boolbutton("SetCompletionFreshChatAlways", "CompletionFreshChatAlways", {true: "[+] yes fresh", false: "[-] no, with history"}, this.bCompletionFreshChatAlways, (val)=>{ + this.bCompletionFreshChatAlways = val; }); - elDiv.appendChild(sel.div); + elDiv.appendChild(bb.div); - this.show_settings_chatrequestoptions(elDiv); + bb = ui.el_creatediv_boolbutton("SetCompletionInsertStandardRolePrefix", 
"CompletionInsertStandardRolePrefix", {true: "[+] yes insert", false: "[-] dont insert"}, this.bCompletionInsertStandardRolePrefix, (val)=>{ + this.bCompletionInsertStandardRolePrefix = val; + }); + elDiv.appendChild(bb.div); } diff --git a/examples/server/public_simplechat/simplechat_screens.webp b/examples/server/public_simplechat/simplechat_screens.webp Binary files differ new file mode 100644 index 00000000..ccea4439 --- /dev/null +++ b/examples/server/public_simplechat/simplechat_screens.webp diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f9a86961..7813a295 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -737,6 +737,8 @@ struct server_context { slot.ga_n = ga_n; slot.ga_w = ga_w; + slot.sparams = params.sparams; + slot.reset(); slots.push_back(slot); @@ -884,7 +886,8 @@ struct server_context { bool launch_slot_with_task(server_slot & slot, const server_task & task) { slot_params default_params; - llama_sampling_params default_sparams; + // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) + llama_sampling_params default_sparams = params.sparams; auto & data = task.data; if (data.count("__oaicompat") != 0) { @@ -1179,7 +1182,7 @@ struct server_context { bool process_token(completion_token_output & result, server_slot & slot) { // remember which tokens were sampled - used for repetition penalties during sampling - const std::string token_str = llama_token_to_piece(ctx, result.tok, false); + const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special); slot.sampled = result.tok; // search stop word and delete it @@ -2002,6 +2005,11 @@ struct server_context { int32_t n_batch = llama_n_batch(ctx); int32_t n_ubatch = llama_n_ubatch(ctx); + // track if this is an embedding or non-embedding batch + // if we've added sampled tokens above, we are in non-embedding mode + // -1: none, 0: non-embedding, 1: embedding + int32_t 
batch_type = batch.n_tokens > 0 ? 0 : -1; + // next, batch any pending prompts without exceeding n_batch if (params.cont_batching || batch.n_tokens == 0) { for (auto & slot : slots) { @@ -2020,6 +2028,7 @@ struct server_context { slot.t_start_generation = 0; if (slot.infill) { + const bool add_bos = llama_should_add_bos_token(model); bool suff_rm_leading_spc = true; if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { params.input_suffix.erase(0, 1); @@ -2035,16 +2044,21 @@ struct server_context { } prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model)); - prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS - prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model)); - prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end()); + suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model)); + + auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens; + auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens; + if (add_bos) { + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + } + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); const llama_token middle_token = llama_token_middle(model); if (middle_token >= 0) { - prefix_tokens.push_back(middle_token); + embd_inp.push_back(middle_token); } - prompt_tokens = prefix_tokens; + prompt_tokens = embd_inp; } else { prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt } @@ -2166,6 +2180,14 @@ struct server_context { } } + // check that we are in the right batch_type, if not defer the slot + bool slot_type = slot.embedding ? 
1 : 0; + if (batch_type == -1) { + batch_type = slot_type; + } else if (batch_type != slot_type) { + continue; + } + // keep only the common part int p0 = (int) system_tokens.size() + slot.n_past; if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) { @@ -2267,6 +2289,9 @@ struct server_context { {"n_tokens", batch.n_tokens}, }); + // make sure we're in the right embedding mode + llama_set_embeddings(ctx, batch_type == 1); + // process the created batch of tokens for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); @@ -2599,24 +2624,16 @@ int main(int argc, char ** argv) { // if a custom chat template is not supplied, we will use the one that comes with the model (if any) if (params.chat_template.empty()) { if (!ctx_server.validate_model_chat_template()) { - LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {}); + LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. 
This may cause the model to output suboptimal responses", {}); params.chat_template = "chatml"; } } // print sample chat example to make it clear which template is used { - json chat; - chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}}); - chat.push_back({{"role", "user"}, {"content", "Hello"}}); - chat.push_back({{"role", "assistant"}, {"content", "Hi there"}}); - chat.push_back({{"role", "user"}, {"content", "How are you?"}}); - - const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat); - LOG_INFO("chat template", { - {"chat_example", chat_example}, - {"built_in", params.chat_template.empty()}, + {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)}, + {"built_in", params.chat_template.empty()}, }); } @@ -2969,17 +2986,31 @@ int main(int argc, char ** argv) { }; const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) { + std::string template_key = "tokenizer.chat_template", curr_tmpl; + int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0); + if (tlen > 0) { + std::vector<char> curr_tmpl_buf(tlen + 1, 0); + if (llama_model_meta_val_str(ctx_server.model, template_key.c_str(), curr_tmpl_buf.data(), curr_tmpl_buf.size()) == tlen) { + curr_tmpl = std::string(curr_tmpl_buf.data(), tlen); + } + } res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = { { "system_prompt", ctx_server.system_prompt.c_str() }, { "default_generation_settings", ctx_server.default_generation_settings_for_props }, - { "total_slots", ctx_server.params.n_parallel } + { "total_slots", ctx_server.params.n_parallel }, + { "chat_template", curr_tmpl.c_str() } }; res.set_content(data.dump(), "application/json; charset=utf-8"); }; const auto handle_completions = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) { + if (ctx_server.params.embedding) { + res_error(res, 
format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); + return; + } + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = json::parse(req.body); @@ -3075,6 +3106,11 @@ int main(int argc, char ** argv) { }; const auto handle_chat_completions = [&ctx_server, &params, &res_error](const httplib::Request & req, httplib::Response & res) { + if (ctx_server.params.embedding) { + res_error(res, format_error_response("This server does not support chat completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); + return; + } + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); @@ -3147,6 +3183,11 @@ int main(int argc, char ** argv) { }; const auto handle_infill = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) { + if (ctx_server.params.embedding) { + res_error(res, format_error_response("This server does not support infill. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); + return; + } + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = json::parse(req.body); @@ -3233,13 +3274,8 @@ int main(int argc, char ** argv) { return res.set_content(data.dump(), "application/json; charset=utf-8"); }; - const auto handle_embeddings = [&params, &ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) { + const auto handle_embeddings = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - if (!params.embedding) { - res.status = 501; - res.set_content("This server does not support embeddings. 
Start it with `--embeddings`", "text/plain; charset=utf-8"); - return; - } const json body = json::parse(req.body); bool is_openai = false; diff --git a/examples/server/tests/features/passkey.feature b/examples/server/tests/features/passkey.feature index 1bde7aab..6a5a84e6 100644 --- a/examples/server/tests/features/passkey.feature +++ b/examples/server/tests/features/passkey.feature @@ -52,4 +52,3 @@ Feature: Passkey / Self-extend with context shift #| TheBloke/Llama-2-7B-GGUF | llama-2-7b.Q2_K.gguf | 4096 | 3 | 16384 | 512 | 4 | 512 | 500 | 300 | 1234 | 5 | 1234 | #| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768 | 2 | 16384 | 512 | 4 | 512 | 500 | 100 | 0987 | 5 | 0 # 987 | - diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index d21c0913..b5597145 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -82,7 +82,7 @@ Feature: llama.cpp server Examples: Prompts | response_format | n_predicted | re_content | - | {"type": "json_object", "schema": {"const": "42"}} | 5 | "42" | + | {"type": "json_object", "schema": {"const": "42"}} | 6 | "42" | | {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10 | \[ -300 \] | | {"type": "json_object"} | 10 | \{ " Jacky. 
| diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 7b5dabb0..df0814cc 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -1,5 +1,4 @@ import asyncio -import collections import json import os import re @@ -8,19 +7,23 @@ import subprocess import sys import threading import time +from collections.abc import Sequence from contextlib import closing from re import RegexFlag +from typing import Any, Literal, cast import aiohttp import numpy as np import openai -from behave import step +from openai.types.chat import ChatCompletionChunk +from behave import step # pyright: ignore[reportAttributeAccessIssue] from behave.api.async_step import async_run_until_complete from prometheus_client import parser +# pyright: reportRedeclaration=false @step("a server listening on {server_fqdn}:{server_port}") -def step_server_config(context, server_fqdn, server_port): +def step_server_config(context, server_fqdn: str, server_port: str): context.server_fqdn = server_fqdn context.server_port = int(server_port) context.n_threads = None @@ -74,34 +77,34 @@ def step_server_config(context, server_fqdn, server_port): @step('a model file {hf_file} from HF repo {hf_repo}') -def step_download_hf_model(context, hf_file, hf_repo): +def step_download_hf_model(context, hf_file: str, hf_repo: str): context.model_hf_repo = hf_repo context.model_hf_file = hf_file context.model_file = os.path.basename(hf_file) @step('a model file {model_file}') -def step_model_file(context, model_file): +def step_model_file(context, model_file: str): context.model_file = model_file @step('a model url {model_url}') -def step_model_url(context, model_url): +def step_model_url(context, model_url: str): context.model_url = model_url @step('a model alias {model_alias}') -def step_model_alias(context, model_alias): +def step_model_alias(context, model_alias: str): context.model_alias = model_alias 
@step('{seed:d} as server seed') -def step_seed(context, seed): +def step_seed(context, seed: int): context.server_seed = seed @step('{ngl:d} GPU offloaded layers') -def step_n_gpu_layer(context, ngl): +def step_n_gpu_layer(context, ngl: int): if 'N_GPU_LAYERS' in os.environ: new_ngl = int(os.environ['N_GPU_LAYERS']) if context.debug: @@ -111,37 +114,37 @@ def step_n_gpu_layer(context, ngl): @step('{n_threads:d} threads') -def step_n_threads(context, n_threads): +def step_n_threads(context, n_threads: int): context.n_thread = n_threads @step('{draft:d} as draft') -def step_draft(context, draft): +def step_draft(context, draft: int): context.draft = draft @step('{n_ctx:d} KV cache size') -def step_n_ctx(context, n_ctx): +def step_n_ctx(context, n_ctx: int): context.n_ctx = n_ctx @step('{n_slots:d} slots') -def step_n_slots(context, n_slots): +def step_n_slots(context, n_slots: int): context.n_slots = n_slots @step('{n_predict:d} server max tokens to predict') -def step_server_n_predict(context, n_predict): +def step_server_n_predict(context, n_predict: int): context.n_server_predict = n_predict @step('{slot_save_path} as slot save path') -def step_slot_save_path(context, slot_save_path): +def step_slot_save_path(context, slot_save_path: str): context.slot_save_path = slot_save_path @step('using slot id {id_slot:d}') -def step_id_slot(context, id_slot): +def step_id_slot(context, id_slot: int): context.id_slot = id_slot @@ -191,7 +194,7 @@ def step_start_server(context): @step("the server is {expecting_status}") @async_run_until_complete -async def step_wait_for_the_server_to_be_started(context, expecting_status): +async def step_wait_for_the_server_to_be_started(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str): match expecting_status: case 'healthy': await wait_for_health_status(context, context.base_url, 200, 'ok', @@ -221,7 +224,7 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status): @step('all slots are 
{expected_slot_status_string}') @async_run_until_complete -async def step_all_slots_status(context, expected_slot_status_string): +async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str): match expected_slot_status_string: case 'idle': expected_slot_status = 0 @@ -237,7 +240,7 @@ async def step_all_slots_status(context, expected_slot_status_string): @step('a completion request with {api_error} api error') @async_run_until_complete -async def step_request_completion(context, api_error): +async def step_request_completion(context, api_error: Literal['raised'] | str): expect_api_error = api_error == 'raised' seeds = await completions_seed(context, num_seeds=1) completion = await request_completion(context.prompts.pop(), @@ -777,8 +780,8 @@ def step_assert_metric_value(context, metric_name, metric_value): def step_available_models(context): # openai client always expects an api_key openai.api_key = context.user_api_key if context.user_api_key is not None else 'nope' - openai.api_base = f'{context.base_url}/v1' - context.models = openai.Model.list().data + openai.base_url = f'{context.base_url}/v1/' + context.models = openai.models.list().data @step('{n_model:d} models are supported') @@ -789,7 +792,7 @@ def step_supported_models(context, n_model): @step('model {i_model:d} is {param} {preposition} {param_value}') -def step_supported_models(context, i_model, param, preposition, param_value): +def step_supported_models(context, i_model: int, param: Literal['identified', 'trained'] | str, preposition: str, param_value: str): assert i_model < len(context.models) model = context.models[i_model] @@ -798,7 +801,7 @@ def step_supported_models(context, i_model, param, preposition, param_value): case 'identified': value = model.id case 'trained': - value = str(model.meta.n_ctx_train) + value = str(model.meta["n_ctx_train"]) case _: assert False, "param {param} not supported" assert param_value == value, f"model param {param} {value} != 
{param_value}" @@ -810,6 +813,7 @@ async def concurrent_requests(context, f_completion, *args, **kwargs): print(f"starting {context.n_prompts} concurrent completion requests...") assert context.n_prompts > 0 seeds = await completions_seed(context) + assert seeds is not None for prompt_no in range(context.n_prompts): shifted_args = [context.prompts.pop(), seeds[prompt_no], *args] context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs))) @@ -861,7 +865,7 @@ async def request_completion(prompt, id_slot=None, expect_api_error=None, user_api_key=None, - temperature=None): + temperature=None) -> int | dict[str, Any]: if debug: print(f"Sending completion request: {prompt}") origin = "my.super.domain" @@ -899,8 +903,8 @@ async def request_completion(prompt, async def oai_chat_completions(user_prompt, seed, system_prompt, - base_url, - base_path, + base_url: str, + base_path: str, async_client, debug=False, temperature=None, @@ -909,7 +913,7 @@ async def oai_chat_completions(user_prompt, enable_streaming=None, response_format=None, user_api_key=None, - expect_api_error=None): + expect_api_error=None) -> int | dict[str, Any]: if debug: print(f"Sending OAI Chat completions request: {user_prompt}") # openai client always expects an api key @@ -989,32 +993,35 @@ async def oai_chat_completions(user_prompt, else: try: openai.api_key = user_api_key - openai.api_base = f'{base_url}{base_path}' - chat_completion = openai.Completion.create( + openai.base_url = f'{base_url}{base_path.removesuffix("chat")}' + assert model is not None + chat_completion = openai.chat.completions.create( messages=payload['messages'], model=model, max_tokens=n_predict, stream=enable_streaming, - response_format=payload.get('response_format'), + response_format=payload.get('response_format') or openai.NOT_GIVEN, seed=seed, temperature=payload['temperature'] ) - except openai.error.AuthenticationError as e: + except openai.AuthenticationError as e: if expect_api_error is not 
None and expect_api_error: return 401 else: assert False, f'error raised: {e}' if enable_streaming: + chat_completion = cast(openai.Stream[ChatCompletionChunk], chat_completion) for chunk in chat_completion: assert len(chunk.choices) == 1 delta = chunk.choices[0].delta - if 'content' in delta: - completion_response['content'] += delta['content'] + if delta.content is not None: + completion_response['content'] += delta.content completion_response['timings']['predicted_n'] += 1 completion_response['truncated'] = chunk.choices[0].finish_reason != 'stop' else: assert len(chat_completion.choices) == 1 + assert chat_completion.usage is not None completion_response = { 'content': chat_completion.choices[0].message.content, 'timings': { @@ -1028,7 +1035,7 @@ async def oai_chat_completions(user_prompt, return completion_response -async def request_embedding(content, seed, base_url=None): +async def request_embedding(content, seed, base_url=None) -> list[list[float]]: async with aiohttp.ClientSession() as session: async with session.post(f'{base_url}/embedding', json={ @@ -1041,7 +1048,7 @@ async def request_embedding(content, seed, base_url=None): async def request_oai_embeddings(input, seed, base_url=None, user_api_key=None, - model=None, async_client=False): + model=None, async_client=False) -> list[list[float]]: # openai client always expects an api_key user_api_key = user_api_key if user_api_key is not None else 'nope' if async_client: @@ -1063,7 +1070,7 @@ async def request_oai_embeddings(input, seed, response_json = await response.json() assert response_json['model'] == model, f"invalid model received: {response_json['model']}" assert response_json['object'] == 'list' - if isinstance(input, collections.abc.Sequence): + if isinstance(input, Sequence): embeddings = [] for an_oai_embeddings in response_json['data']: embeddings.append(an_oai_embeddings['embedding']) @@ -1072,19 +1079,14 @@ async def request_oai_embeddings(input, seed, return embeddings else: 
openai.api_key = user_api_key - openai.api_base = f'{base_url}/v1' - oai_embeddings = openai.Embedding.create( + openai.base_url = f'{base_url}/v1/' + assert model is not None + oai_embeddings = openai.embeddings.create( model=model, input=input, ) - if isinstance(input, collections.abc.Sequence): - embeddings = [] - for an_oai_embeddings in oai_embeddings.data: - embeddings.append(an_oai_embeddings.embedding) - else: - embeddings = [oai_embeddings.data.embedding] - return embeddings + return [e.embedding for e in oai_embeddings.data] def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None): @@ -1122,7 +1124,7 @@ def assert_all_predictions_equal(completion_responses): if i == j: continue content_j = response_j['content'] - assert content_i == content_j, "contents not equal" + assert content_i == content_j, "contents not equal" def assert_all_predictions_different(completion_responses): @@ -1136,7 +1138,7 @@ def assert_all_predictions_different(completion_responses): if i == j: continue content_j = response_j['content'] - assert content_i != content_j, "contents not different" + assert content_i != content_j, "contents not different" def assert_all_token_probabilities_equal(completion_responses): @@ -1153,7 +1155,7 @@ def assert_all_token_probabilities_equal(completion_responses): if i == j: continue probs_j = response_j['completion_probabilities'][pos]['probs'] - assert probs_i == probs_j, "contents not equal" + assert probs_i == probs_j, "contents not equal" async def gather_tasks_results(context): @@ -1343,7 +1345,7 @@ def start_server_background(context): } context.server_process = subprocess.Popen( [str(arg) for arg in [context.server_path, *server_args]], - **pkwargs) + **pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue] def server_log(in_stream, out_stream): for line in iter(in_stream.readline, b''): diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt index 
2e4f42ad..2c741ea1 100644 --- a/examples/server/tests/requirements.txt +++ b/examples/server/tests/requirements.txt @@ -1,6 +1,6 @@ aiohttp~=3.9.3 behave~=1.2.6 huggingface_hub~=0.20.3 -numpy~=1.24.4 -openai~=0.25.0 +numpy~=1.26.4 +openai~=1.30.3 prometheus-client~=0.20.0 diff --git a/examples/server/themes/buttons-top/index.html b/examples/server/themes/buttons-top/index.html index 6af30d30..8334bcde 100644 --- a/examples/server/themes/buttons-top/index.html +++ b/examples/server/themes/buttons-top/index.html @@ -1054,4 +1054,3 @@ </body> </html> - diff --git a/examples/server/themes/wild/index.html b/examples/server/themes/wild/index.html index 772e716c..8361c577 100644 --- a/examples/server/themes/wild/index.html +++ b/examples/server/themes/wild/index.html @@ -1058,4 +1058,3 @@ </body> </html> - diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 63fde9c9..db6b3b74 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -118,36 +118,35 @@ static inline void server_log(const char * level, const char * function, int lin // Format given chat. If tmpl is empty, we take the template from model metadata inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) { - size_t alloc_size = 0; - // vector holding all allocated string to be passed to llama_chat_apply_template - std::vector<std::string> str(messages.size() * 2); - std::vector<llama_chat_message> chat(messages.size()); + std::vector<llama_chat_msg> chat; for (size_t i = 0; i < messages.size(); ++i) { const auto & curr_msg = messages[i]; - str[i*2 + 0] = json_value(curr_msg, "role", std::string("")); - str[i*2 + 1] = json_value(curr_msg, "content", std::string("")); - alloc_size += str[i*2 + 1].length(); - chat[i].role = str[i*2 + 0].c_str(); - chat[i].content = str[i*2 + 1].c_str(); - } - - const char * ptr_tmpl = tmpl.empty() ? 
nullptr : tmpl.c_str(); - std::vector<char> buf(alloc_size * 2); - // run the first time to get the total output length - int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size()); + std::string role = json_value(curr_msg, "role", std::string("")); + + std::string content; + if (curr_msg.contains("content")) { + if (curr_msg["content"].is_string()) { + content = curr_msg["content"].get<std::string>(); + } else if (curr_msg["content"].is_array()) { + for (const auto & part : curr_msg["content"]) { + if (part.contains("text")) { + content += "\n" + part["text"].get<std::string>(); + } + } + } else { + throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggerganov/llama.cpp/issues/8367)"); + } + } else { + throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)"); + } - // if it turns out that our buffer is too small, we resize it - if ((size_t) res > buf.size()) { - buf.resize(res); - res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size()); + chat.push_back({role, content}); } - const std::string formatted_chat(buf.data(), res); - + auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true); LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}}); - return formatted_chat; } |