diff options
author | Radoslav Gerganov <rgerganov@gmail.com> | 2024-05-14 14:27:19 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-14 14:27:19 +0300 |
commit | 5e31828d3e35c76ecfee665bc23771a4bec1d130 (patch) | |
tree | 7f5f2edc7c3fc3e7655904316897e32202edd5d6 /common | |
parent | 541600201e6480f54ae09e58d16b154d4b4b331d (diff) |
ggml : add RPC backend (#6829)
* ggml : add RPC backend
The RPC backend proxies all operations to a remote server which runs a
regular backend (CPU, CUDA, Metal, etc).
* set TCP_NODELAY
* add CI workflows
* Address review comments
* fix warning
* implement llama_max_devices() for RPC
* Address review comments
* Address review comments
* wrap sockfd into a struct
* implement get_alignment and get_max_size
* add get_device_memory
* fix warning
* win32 support
* add README
* readme : trim trailing whitespace
* Address review comments
* win32 fix
* Address review comments
* fix compile warnings on macos
Diffstat (limited to 'common')
-rw-r--r-- | common/common.cpp | 10 | ||||
-rw-r--r-- | common/common.h | 1 |
2 files changed, 11 insertions, 0 deletions
diff --git a/common/common.cpp b/common/common.cpp index ba1ecf0e..96130ad5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1060,6 +1060,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa #endif // GGML_USE_CUDA_SYCL_VULKAN return true; } + if (arg == "--rpc") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.rpc_servers = argv[i]; + return true; + } if (arg == "--no-mmap") { params.use_mmap = false; return true; @@ -1557,6 +1565,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); } + printf(" --rpc SERVERS comma separated list of RPC servers\n"); printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false"); printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false"); printf(" -gan N, --grp-attn-n N\n"); @@ -1830,6 +1839,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & if (params.n_gpu_layers != -1) { mparams.n_gpu_layers = params.n_gpu_layers; } + mparams.rpc_servers = params.rpc_servers.c_str(); mparams.main_gpu = params.main_gpu; mparams.split_mode = params.split_mode; mparams.tensor_split = params.tensor_split; diff --git a/common/common.h b/common/common.h index d80344f2..566490e2 100644 --- a/common/common.h +++ b/common/common.h @@ -82,6 +82,7 @@ struct gpt_params { float yarn_beta_slow = 1.0f; // YaRN high correction dim int32_t yarn_orig_ctx = 0; // YaRN original context length float defrag_thold = -1.0f; // KV cache defragmentation threshold + std::string rpc_servers = ""; // comma separated list of RPC servers ggml_backend_sched_eval_callback cb_eval = nullptr; void * cb_eval_user_data = nullptr; |