| author | Georgi Gerganov <ggerganov@gmail.com> | 2024-01-31 17:30:17 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-01-31 17:30:17 +0200 |
| commit | 5cb04dbc16d1da38c8fdcc0111b40e67d00dd1c3 (patch) | |
| tree | 3ef8dc640d5c08466309c09a8ac2963bb760af06 /examples/server/server.cpp | |
| parent | efb7bdbbd061d087c788598b97992c653f992ddd (diff) | |
llama : remove LLAMA_MAX_DEVICES and LLAMA_SUPPORTS_GPU_OFFLOAD (#5240)
* llama : remove LLAMA_MAX_DEVICES from llama.h
ggml-ci
* Update llama.cpp
Co-authored-by: slaren <slarengh@gmail.com>
* server : remove LLAMA_MAX_DEVICES
ggml-ci
* llama : remove LLAMA_SUPPORTS_GPU_OFFLOAD
ggml-ci
* train : remove LLAMA_SUPPORTS_GPU_OFFLOAD
* readme : add deprecation notice
* readme : change deprecation notice to "remove" and fix url
* llama : remove gpu includes from llama.h
ggml-ci
---------
Co-authored-by: slaren <slarengh@gmail.com>
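
A minimal sketch (not part of this commit) of what the change means for downstream callers: the compile-time macros are replaced by the runtime queries that the diff below switches the server to. The capability functions are the ones that appear in the diff; the program around them is illustrative only.

    #include "llama.h"

    #include <cstdio>

    int main() {
        // Runtime capability queries that replace the removed compile-time macros:
        //   LLAMA_SUPPORTS_GPU_OFFLOAD -> llama_supports_gpu_offload()
        //   LLAMA_MAX_DEVICES          -> llama_max_devices()
        if (llama_supports_gpu_offload()) {
            printf("GPU offload available, up to %zu device(s)\n", llama_max_devices());
        } else {
            printf("built without GPU offload support\n");
        }

        printf("mmap  supported: %s\n", llama_supports_mmap()  ? "yes" : "no");
        printf("mlock supported: %s\n", llama_supports_mlock() ? "yes" : "no");
        return 0;
    }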
Diffstat (limited to 'examples/server/server.cpp')
-rw-r--r-- | examples/server/server.cpp | 44 |
1 file changed, 22 insertions, 22 deletions
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 21bdce8e..ea77125e 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1789,28 +1789,28 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
     printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
-    if (llama_mlock_supported())
+    if (llama_supports_mlock())
     {
         printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
-    if (llama_mmap_supported())
+    if (llama_supports_mmap())
     {
         printf("  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
     printf("  --numa                attempt optimizations that help on some NUMA systems\n");
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    printf("  -ngl N, --n-gpu-layers N\n");
-    printf("                        number of layers to store in VRAM\n");
-    printf("  -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
-    printf("                        how to split the model across multiple GPUs, one of:\n");
-    printf("                          - none: use one GPU only\n");
-    printf("                          - layer (default): split layers and KV across GPUs\n");
-    printf("                          - row: split rows across GPUs\n");
-    printf("  -ts SPLIT --tensor-split SPLIT\n");
-    printf("                        fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
-    printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
-    printf("                        or for intermediate results and KV (with split-mode = row)\n");
-#endif
+    if (llama_supports_gpu_offload()) {
+        printf("  -ngl N, --n-gpu-layers N\n");
+        printf("                        number of layers to store in VRAM\n");
+        printf("  -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
+        printf("                        how to split the model across multiple GPUs, one of:\n");
+        printf("                          - none: use one GPU only\n");
+        printf("                          - layer (default): split layers and KV across GPUs\n");
+        printf("                          - row: split rows across GPUs\n");
+        printf("  -ts SPLIT --tensor-split SPLIT\n");
+        printf("                        fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
+        printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
+        printf("                        or for intermediate results and KV (with split-mode = row)\n");
+    }
     printf("  -m FNAME, --model FNAME\n");
     printf("                        model path (default: %s)\n", params.model.c_str());
     printf("  -a ALIAS, --alias ALIAS\n");
@@ -2066,13 +2066,13 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                 invalid_param = true;
                 break;
             }
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-            params.n_gpu_layers = std::stoi(argv[i]);
-#else
-            LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
-                        "See main README.md for information on enabling GPU BLAS support",
-                        {{"n_gpu_layers", params.n_gpu_layers}});
-#endif
+            if (llama_supports_gpu_offload()) {
+                params.n_gpu_layers = std::stoi(argv[i]);
+            } else {
+                LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
+                            "See main README.md for information on enabling GPU BLAS support",
+                            {{"n_gpu_layers", params.n_gpu_layers}});
+            }
         }
         else if (arg == "--split-mode" || arg == "-sm")
         {
@@ -2115,9 +2115,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                 const std::regex regex{R"([,/]+)"};
                 std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
                 std::vector<std::string> split_arg{it, {}};
-                GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+                GGML_ASSERT(split_arg.size() <= llama_max_devices());

-                for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device)
+                for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device)
                 {
                     if (i_device < split_arg.size())
                     {