author    Georgi Gerganov <ggerganov@gmail.com>    2023-11-01 11:29:07 +0200
committer GitHub <noreply@github.com>              2023-11-01 11:29:07 +0200
commit    f0e209324a7f663225791897877bf610f1af152d (patch)
tree      7badf7944a081030de65e2f38e6db05fb3b240dc
parent    ca190bca8e844d171020d6147687e71472d71734 (diff)
scripts : add server-llm.sh (#3868)
* scripts : add deploy-server.sh
* scripts : rename to server-llm.sh
* scripts : working curl pipe
-rw-r--r--  scripts/server-llm.sh  391
1 file changed, 391 insertions(+), 0 deletions(-)
diff --git a/scripts/server-llm.sh b/scripts/server-llm.sh
new file mode 100644
index 00000000..7bf0929b
--- /dev/null
+++ b/scripts/server-llm.sh
@@ -0,0 +1,391 @@
+#!/bin/bash
+#
+# Helper script for deploying llama.cpp server with a single Bash command
+#
+# - Works on Linux and macOS
+# - Supports: CPU, CUDA, Metal, OpenCL
+# - Can run all GGUF models from HuggingFace
+# - Can serve requests in parallel
+# - Always builds latest llama.cpp from GitHub
+#
+# Limitations
+#
+# - Chat templates are poorly supported (base models recommended)
+# - Might be unstable!
+#
+# Usage:
+# ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]
+#
+# --port: port number, default is 8888
+# --repo: path to a repo containing GGUF model files
+# --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input
+# --backend: cpu, cuda, metal, opencl, depends on the OS
+# --gpu-id: gpu id, default is 0
+# --n-parallel: number of parallel requests, default is 8
+# --n-kv: KV cache size, default is 4096
+# --verbose: verbose output
+#
+# Example:
+#
+# bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
+#
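+# For reference, an illustrative invocation of a local copy of this script with
+# explicit options (the repo URL and values below are placeholders, adjust as needed):
+#
+#   ./server-llm.sh --port 8888 --repo https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF --wtype q4_0 --backend cpu --n-parallel 4 --n-kv 2048
+#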
+
+set -e
+
+# required utils: curl, git, make
+if ! command -v curl &> /dev/null; then
+ printf "[-] curl not found\n"
+ exit 1
+fi
+if ! command -v git &> /dev/null; then
+ printf "[-] git not found\n"
+ exit 1
+fi
+if ! command -v make &> /dev/null; then
+ printf "[-] make not found\n"
+ exit 1
+fi
+
+# parse arguments
+port=8888
+repo=""
+wtype=""
+backend="cpu"
+
+# if macOS, use metal backend by default
+if [[ "$OSTYPE" == "darwin"* ]]; then
+ backend="metal"
+elif command -v nvcc &> /dev/null; then
+ backend="cuda"
+fi
+
+gpu_id=0
+n_parallel=8
+n_kv=4096
+verbose=0
+
+function print_usage {
+ printf "Usage:\n"
+ printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n"
+ printf " --port: port number, default is 8888\n"
+ printf " --repo: path to a repo containing GGUF model files\n"
+ printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
+ printf " --backend: cpu, cuda, metal, opencl, depends on the OS\n"
+ printf " --gpu-id: gpu id, default is 0\n"
+ printf " --n-parallel: number of parallel requests, default is 8\n"
+ printf " --n-kv: KV cache size, default is 4096\n"
+ printf " --verbose: verbose output\n\n"
+ printf "Example:\n\n"
+ printf ' bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
+}
+
+while [[ $# -gt 0 ]]; do
+ key="$1"
+ case $key in
+ --port)
+ port="$2"
+ shift
+ shift
+ ;;
+ --repo)
+ repo="$2"
+ shift
+ shift
+ ;;
+ --wtype)
+ wtype="$2"
+ shift
+ shift
+ ;;
+ --backend)
+ backend="$2"
+ shift
+ shift
+ ;;
+ --gpu-id)
+ gpu_id="$2"
+ shift
+ shift
+ ;;
+ --n-parallel)
+ n_parallel="$2"
+ shift
+ shift
+ ;;
+ --n-kv)
+ n_kv="$2"
+ shift
+ shift
+ ;;
+ --verbose)
+ verbose=1
+ shift
+ ;;
+ --help)
+ print_usage
+ exit 0
+ ;;
+ *)
+ echo "Unknown argument: $key"
+ print_usage
+ exit 1
+ ;;
+ esac
+done
+
+# available weights types
+wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")
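+# note: the numeric index shown in the model file listing below maps directly into this array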
+
+wfiles=()
+for wt in "${wtypes[@]}"; do
+ wfiles+=("")
+done
+
+# sample repos
+repos=(
+ "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
+ "https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
+ "https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
+ "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
+ "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
+ "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
+ "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
+ "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
+ "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
+ "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
+)
+
+printf "\n"
+printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
+printf " Based on the options that follow, the script might download a model file\n"
+printf " from the internet, which can be a few GBs in size. The script will also\n"
+printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n"
+printf "\n"
+printf " Upon success, an HTTP server will be started and it will serve the selected\n"
+printf " model using llama.cpp for demonstration purposes.\n"
+printf "\n"
+printf " Please note:\n"
+printf "\n"
+printf " - All new data will be stored in the current folder\n"
+printf " - The server will be listening on all network interfaces\n"
+printf " - The server will run with default settings which are not always optimal\n"
+printf " - Do not judge the quality of a model based on the results from this script\n"
+printf " - Do not use this script to benchmark llama.cpp\n"
+printf " - Do not use this script in production\n"
+printf " - This script is only for demonstration purposes\n"
+printf "\n"
+printf " If you don't know what you are doing, please press Ctrl-C to abort now\n"
+printf "\n"
+printf " Press Enter to continue ...\n\n"
+
+read
+
+if [[ -z "$repo" ]]; then
+ printf "[+] No repo provided from the command line\n"
+    printf "    Please select a number from the list below or enter a URL:\n\n"
+
+ is=0
+ for r in "${repos[@]}"; do
+ printf " %2d) %s\n" $is "$r"
+ is=$((is+1))
+ done
+
+    # ask for a repo until a valid sample-repo index or a URL is provided
+ while [[ -z "$repo" ]]; do
+ printf "\n Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
+ read -p "[+] Select repo: " repo
+
+ # check if the input is a number
+ if [[ "$repo" =~ ^[0-9]+$ ]]; then
+ if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
+ repo="${repos[$repo]}"
+ else
+ printf "[-] Invalid repo index: %s\n" "$repo"
+ repo=""
+ fi
+ elif [[ "$repo" =~ ^https?:// ]]; then
+ repo="$repo"
+ else
+ printf "[-] Invalid repo URL: %s\n" "$repo"
+ repo=""
+ fi
+ done
+fi
+
+# remove the "/tree/main" suffix from the repo URL, if present
+repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')
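+# e.g. (illustrative): https://huggingface.co/TheBloke/Llama-2-7B-GGUF/tree/main
+#                   -> https://huggingface.co/TheBloke/Llama-2-7B-GGUF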
+
+printf "[+] Checking for GGUF model files in %s\n" "$repo"
+
+# find GGUF files in the source
+# TODO: better logic
+model_tree="${repo%/}/tree/main"
+model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
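+# the grep/sed above extract filenames from HTML fragments that look roughly like this
+# (illustrative, the HuggingFace page markup may change and break this):
+#   <span class="truncate group-hover:underline">llama-2-7b.Q4_0.gguf</span></a>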
+
+# list all files in the provided git repo
+printf "[+] Model files:\n\n"
+for file in $model_files; do
+    # determine iw by matching the filename against the known weight types
+ iw=-1
+ is=0
+ for wt in "${wtypes[@]}"; do
+ # uppercase
+ ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')
+ if [[ "$ufile" =~ "$wt" ]]; then
+ iw=$is
+ break
+ fi
+ is=$((is+1))
+ done
+
+ if [[ $iw -eq -1 ]]; then
+ continue
+ fi
+
+ wfiles[$iw]="$file"
+
+ have=" "
+ if [[ -f "$file" ]]; then
+ have="*"
+ fi
+
+ printf " %2d) %s %s\n" $iw "$have" "$file"
+done
+
+# ask for weights type until provided and available
+# (accepts the numeric index from the listing above or a type name such as q4_0)
+while true; do
+    if [[ -z "$wtype" ]]; then
+        printf "\n"
+        read -p "[+] Select weight type: " wtype
+    fi
+    uwtype=$(echo "$wtype" | tr '[:lower:]' '[:upper:]')
+    is=0; wfile=""
+    for wt in "${wtypes[@]}"; do
+        if [[ "$wtype" == "$is" || "$uwtype" == "$wt" ]]; then wfile="${wfiles[$is]}"; wtype="$wt"; break; fi
+        is=$((is+1))
+    done
+    if [[ -z "$wfile" ]]; then
+        printf "[-] Invalid or unavailable weight type: %s\n" "$wtype"
+        wtype=""
+    else
+        break
+    fi
+done
+
+printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"
+
+url="${repo%/}/resolve/main/$wfile"
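+# e.g. (illustrative): https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_0.gguf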
+
+# check file used to determine whether the model has already been downloaded
+chk="$wfile.chk"
+
+# check if we should download the file
+# - if $wfile does not exist
+# - if $wfile exists but $chk does not exist
+# - if $wfile exists and $chk exists but $wfile is newer than $chk
+# TODO: better logic using git lfs info
+
+do_download=0
+
+if [[ ! -f "$wfile" ]]; then
+ do_download=1
+elif [[ ! -f "$chk" ]]; then
+ do_download=1
+elif [[ "$wfile" -nt "$chk" ]]; then
+ do_download=1
+fi
+
+if [[ $do_download -eq 1 ]]; then
+ printf "[+] Downloading weights from %s\n" "$url"
+
+ # download the weights file
+ curl -o "$wfile" -# -L "$url"
+
+ # create a check file if successful
+ if [[ $? -eq 0 ]]; then
+ printf "[+] Creating check file %s\n" "$chk"
+ touch "$chk"
+ fi
+else
+ printf "[+] Using cached weights %s\n" "$wfile"
+fi
+
+# get latest llama.cpp and build
+
+printf "[+] Downloading latest llama.cpp\n"
+
+llama_cpp_dir="__llama_cpp_port_${port}__"
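+# one build directory per port, e.g. __llama_cpp_port_8888__ with the default port,
+# so multiple server instances can coexist in the same folder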
+
+if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
+ # if the dir exists and there isn't a file "__ggml_script__" in it, abort
+ printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
+ printf "[-] Please remove it and try again\n"
+ exit 1
+elif [[ -d "$llama_cpp_dir" ]]; then
+ printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
+ printf "[+] Using cached llama.cpp\n"
+
+ cd "$llama_cpp_dir"
+ git reset --hard
+ git fetch
+ git checkout origin/master
+
+ cd ..
+else
+ printf "[+] Cloning llama.cpp\n"
+
+ git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
+fi
+
+# mark that the directory was created by this script
+touch "$llama_cpp_dir/__ggml_script__"
+
+if [[ $verbose -eq 1 ]]; then
+ set -x
+fi
+
+# build
+cd "$llama_cpp_dir"
+
+make clean
+
+log="--silent"
+if [[ $verbose -eq 1 ]]; then
+ log=""
+fi
+
+if [[ "$backend" == "cuda" ]]; then
+ printf "[+] Building with CUDA backend\n"
+ LLAMA_CUBLAS=1 make -j server $log
+elif [[ "$backend" == "cpu" ]]; then
+ printf "[+] Building with CPU backend\n"
+ make -j server $log
+elif [[ "$backend" == "metal" ]]; then
+ printf "[+] Building with Metal backend\n"
+ make -j server $log
+elif [[ "$backend" == "opencl" ]]; then
+ printf "[+] Building with OpenCL backend\n"
+ LLAMA_CLBLAST=1 make -j server $log
+else
+ printf "[-] Unknown backend: %s\n" "$backend"
+ exit 1
+fi
+
+# run the server
+
+printf "[+] Running server\n"
+
+args=""
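+# -ngl sets the number of model layers to offload to the GPU (999 offloads all layers, 0 keeps everything on the CPU)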
+if [[ "$backend" == "cuda" ]]; then
+ export CUDA_VISIBLE_DEVICES=$gpu_id
+ args="-ngl 999"
+elif [[ "$backend" == "cpu" ]]; then
+ args="-ngl 0"
+elif [[ "$backend" == "metal" ]]; then
+ args="-ngl 999"
+elif [[ "$backend" == "opencl" ]]; then
+ args="-ngl 999"
+else
+ printf "[-] Unknown backend: %s\n" "$backend"
+ exit 1
+fi
+
+if [[ $verbose -eq 1 ]]; then
+ args="$args --verbose"
+fi
+
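+# once the server is up, it can be queried over HTTP, for example (illustrative,
+# replace 8888 with the chosen --port):
+#
+#   curl http://127.0.0.1:8888/completion -d '{"prompt": "Hello, my name is", "n_predict": 32}'
+#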
+./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args
+
+exit 0