Diffstat (limited to 'examples')
-rw-r--r--  examples/CMakeLists.txt      |  3
-rw-r--r--  examples/rpc/CMakeLists.txt  |  2
-rw-r--r--  examples/rpc/README.md       | 74
-rw-r--r--  examples/rpc/rpc-server.cpp  | 70
4 files changed, 149 insertions(+), 0 deletions(-)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index f421769c..b40ee4cc 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -49,4 +49,7 @@ else()
         add_subdirectory(server)
     endif()
     add_subdirectory(export-lora)
+    if (LLAMA_RPC)
+        add_subdirectory(rpc)
+    endif()
 endif()
diff --git a/examples/rpc/CMakeLists.txt b/examples/rpc/CMakeLists.txt
new file mode 100644
index 00000000..ae48fb98
--- /dev/null
+++ b/examples/rpc/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(rpc-server rpc-server.cpp)
+target_link_libraries(rpc-server PRIVATE ggml llama)
diff --git a/examples/rpc/README.md b/examples/rpc/README.md
new file mode 100644
index 00000000..325d0abc
--- /dev/null
+++ b/examples/rpc/README.md
@@ -0,0 +1,74 @@
+## Overview
+
+The `rpc-server` binary allows running a `ggml` backend on a remote host.
+The RPC backend communicates with one or more instances of `rpc-server` and offloads computations to them.
+This can be used for distributed LLM inference with `llama.cpp` in the following way:
+
+```mermaid
+flowchart TD
+    rpcb---|TCP|srva
+    rpcb---|TCP|srvb
+    rpcb-.-|TCP|srvn
+    subgraph hostn[Host N]
+    srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
+    end
+    subgraph hostb[Host B]
+    srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
+    end
+    subgraph hosta[Host A]
+    srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
+    end
+    subgraph host[Main Host]
+    ggml[llama.cpp]---rpcb[RPC backend]
+    end
+    style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
+```
+
+Each host can run a different backend, e.g. one with CUDA and another with Metal.
+You can also run multiple `rpc-server` instances on the same host, each with a different backend.
+
+## Usage
+
+On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options.
+For example, to build the CUDA backend with RPC support:
+
+```bash
+mkdir build-rpc-cuda
+cd build-rpc-cuda
+cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
+cmake --build . --config Release
+```
+
+Then, start the `rpc-server` with the backend:
+
+```bash
+$ bin/rpc-server 0.0.0.0 50052
+create_backend: using CUDA backend
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   no
+ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
+ggml_cuda_init: found 1 CUDA devices:
+  Device 0: NVIDIA T1200 Laptop GPU, compute capability 7.5, VMM: yes
+Starting RPC server on 0.0.0.0:50052
+```
+
+When using the CUDA backend, you can select the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
+```bash
+$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server 0.0.0.0 50052
+```
+This way you can run multiple `rpc-server` instances on the same host, each one using a different CUDA device.
+
+
+On the main host, build `llama.cpp` with only the `-DLLAMA_RPC=ON` option:
+
+```bash
+mkdir build-rpc
+cd build-rpc
+cmake .. -DLLAMA_RPC=ON
+cmake --build . --config Release
+```
+
+Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
+
+```bash
+$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
+```
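The README above covers the CLI flow end to end. For readers who want to drive the RPC backend from their own code instead of `bin/main`, the sketch below builds a trivial graph and computes it on a remote `rpc-server`. It assumes `ggml-rpc.h` also declares a client-side constructor `ggml_backend_rpc_init(const char * endpoint)`; that declaration is not visible in this diff, so treat the name and signature as an assumption.

```cpp
// Minimal client-side sketch: compute z = x * y on a remote rpc-server.
// ASSUMPTION: ggml-rpc.h declares ggml_backend_t ggml_backend_rpc_init(const char *).
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-rpc.h"
#include <stdio.h>

int main() {
    // connect to a server started as: bin/rpc-server 0.0.0.0 50052
    ggml_backend_t backend = ggml_backend_rpc_init("192.168.88.10:50052");
    if (!backend) {
        fprintf(stderr, "failed to connect to rpc-server\n");
        return 1;
    }

    // no_alloc: tensor metadata lives locally, tensor data in the backend buffer
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);
    struct ggml_tensor  * x   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor  * y   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor  * z   = ggml_mul(ctx, x, y); // element-wise multiply

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, z);

    // allocate tensor storage on the remote backend and upload the inputs
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
    const float xd[4] = {1, 2, 3, 4};
    const float yd[4] = {10, 20, 30, 40};
    ggml_backend_tensor_set(x, xd, 0, sizeof(xd));
    ggml_backend_tensor_set(y, yd, 0, sizeof(yd));

    // the graph is executed on the remote host
    ggml_backend_graph_compute(backend, gf);

    float zd[4];
    ggml_backend_tensor_get(z, zd, 0, sizeof(zd));
    printf("%g %g %g %g\n", zd[0], zd[1], zd[2], zd[3]); // expect: 10 40 90 160

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    ggml_backend_free(backend);
    return 0;
}
```

Because the context is created with `no_alloc = true`, only the `ggml_backend_tensor_set`/`ggml_backend_tensor_get` calls move data over TCP; the compute itself runs on the server's backend.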
diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp
new file mode 100644
index 00000000..496af849
--- /dev/null
+++ b/examples/rpc/rpc-server.cpp
@@ -0,0 +1,70 @@
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include "ggml-rpc.h"
+#include <string>
+#include <stdio.h>
+
+static ggml_backend_t create_backend() {
+    ggml_backend_t backend = NULL;
+#ifdef GGML_USE_CUDA
+    fprintf(stderr, "%s: using CUDA backend\n", __func__);
+    backend = ggml_backend_cuda_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+    }
+#elif defined(GGML_USE_METAL)
+    fprintf(stderr, "%s: using Metal backend\n", __func__);
+    backend = ggml_backend_metal_init();
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+    }
+#endif
+
+    // if no GPU backend is available, fall back to the CPU backend
+    if (!backend) {
+        fprintf(stderr, "%s: using CPU backend\n", __func__);
+        backend = ggml_backend_cpu_init();
+    }
+    return backend;
+}
+
+static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
+#ifdef GGML_USE_CUDA
+    ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
+#else
+    // TODO: implement for other backends
+    *free_mem = 1;
+    *total_mem = 1;
+#endif
+}
+
+int main(int argc, char * argv[]) {
+    if (argc < 3) {
+        fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
+        return 1;
+    }
+    const char * host = argv[1];
+    int port = std::stoi(argv[2]);
+    if (port <= 0 || port > 65535) {
+        fprintf(stderr, "Invalid port number: %d\n", port);
+        return 1;
+    }
+    ggml_backend_t backend = create_backend();
+    if (!backend) {
+        fprintf(stderr, "Failed to create backend\n");
+        return 1;
+    }
+    printf("Starting RPC server on %s:%d\n", host, port);
+    size_t free_mem, total_mem;
+    get_backend_memory(&free_mem, &total_mem);
+    std::string endpoint = std::string(host) + ":" + std::to_string(port);
+    start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
+    ggml_backend_free(backend);
+    return 0;
+}
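One rough edge in the server code: on non-CUDA builds, `get_backend_memory` reports the placeholder values `1`/`1`. As an illustration of how that TODO could be filled in for the CPU fallback, here is a Linux-only sketch using `sysconf`; it is not part of this commit, and other platforms (e.g. Metal on macOS) would need their own queries.

```cpp
#include <unistd.h> // sysconf

// Linux-only sketch: report physical RAM for the CPU backend instead of
// the placeholder value 1. The server passes these numbers to
// start_rpc_server(), which advertises them to connecting clients.
static void get_cpu_memory(size_t * free_mem, size_t * total_mem) {
    long total_pages = sysconf(_SC_PHYS_PAGES);
    long avail_pages = sysconf(_SC_AVPHYS_PAGES);
    long page_size   = sysconf(_SC_PAGE_SIZE);
    if (total_pages > 0 && page_size > 0) {
        *total_mem = (size_t) total_pages * (size_t) page_size;
        *free_mem  = (avail_pages > 0) ? (size_t) avail_pages * (size_t) page_size
                                       : *total_mem;
    } else {
        // fall back to the same placeholders the example uses
        *total_mem = 1;
        *free_mem  = 1;
    }
}
```

Also note that `std::stoi` throws on non-numeric input, so a hardened server would wrap the port parsing in a `try`/`catch` or use `strtol`.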