diff options
Diffstat (limited to 'examples/server')
-rw-r--r-- | examples/server/server.cpp | 40 |
1 files changed, 35 insertions, 5 deletions
diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1d30a15a..c1ab8f9d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2005,12 +2005,15 @@ static void server_print_usage(const char *argv0, const gpt_params &params, #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD printf(" -ngl N, --n-gpu-layers N\n"); printf(" number of layers to store in VRAM\n"); + printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); + printf(" how to split the model across multiple GPUs, one of:\n"); + printf(" - none: use one GPU only\n"); + printf(" - layer (default): split layers and KV across GPUs\n"); + printf(" - row: split rows across GPUs\n"); printf(" -ts SPLIT --tensor-split SPLIT\n"); - printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); - printf(" -nommq, --no-mul-mat-q\n"); - printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n"); - printf(" Not recommended since this is both slower and uses more VRAM.\n"); + printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); + printf(" or for intermediate results and KV (with split-mode = row)\n"); #endif printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); @@ -2254,6 +2257,33 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, {{"n_gpu_layers", params.n_gpu_layers}}); #endif } + else if (arg == "--split-mode" || arg == "-sm") + { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string arg_next = argv[i]; + if (arg_next == "none") + { + params.split_mode = LLAMA_SPLIT_NONE; + } + else if (arg_next == "layer") + { + params.split_mode = LLAMA_SPLIT_LAYER; + } + else if (arg_next == "row") + { + params.split_mode = LLAMA_SPLIT_ROW; + } + else { + invalid_param = true; + break; + } +#ifndef GGML_USE_CUBLAS + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n"); +#endif // GGML_USE_CUBLAS + } else if (arg == "--tensor-split" || arg == "-ts") { if (++i >= argc) |