Diffstat (limited to 'examples')
-rw-r--r--   examples/batched-bench/batched-bench.cpp |  2
-rw-r--r--   examples/llama-bench/llama-bench.cpp     | 16
-rw-r--r--   examples/server/server.cpp               | 44
3 files changed, 31 insertions, 31 deletions
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 7924db26..b52d6845 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -88,7 +88,7 @@ int main(int argc, char ** argv) {
llama_model_params model_params = llama_model_default_params();
- const std::vector<float> t_split (LLAMA_MAX_DEVICES, 0.0f);
+ const std::vector<float> t_split(llama_max_devices(), 0.0f);
model_params.n_gpu_layers = n_gpu_layers;
model_params.tensor_split = t_split.data();
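The batched-bench change above sizes the tensor-split buffer at runtime instead of with the removed LLAMA_MAX_DEVICES macro. A minimal standalone sketch of that pattern, assuming only the llama.h calls that appear in the hunk (the n_gpu_layers value is illustrative, and no model is actually loaded here):

    #include "llama.h"

    #include <cstdio>
    #include <vector>

    int main() {
        // llama_max_devices() is queried at runtime, replacing the old
        // compile-time LLAMA_MAX_DEVICES constant.
        const std::vector<float> t_split(llama_max_devices(), 0.0f);

        llama_model_params model_params = llama_model_default_params();
        model_params.n_gpu_layers = 99;             // illustrative value
        model_params.tensor_split = t_split.data(); // must stay alive until the model is loaded

        printf("max devices: %zu\n", llama_max_devices());
        return 0;
    }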
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 542cc7bb..c5a6f744 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -160,7 +160,7 @@ struct cmd_params {
std::vector<int> main_gpu;
std::vector<bool> no_kv_offload;
std::vector<bool> mul_mat_q;
- std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
+ std::vector<std::vector<float>> tensor_split;
int reps;
bool verbose;
output_formats output_format;
@@ -179,7 +179,7 @@ static const cmd_params cmd_params_defaults = {
/* main_gpu */ {0},
/* no_kv_offload */ {false},
/* mul_mat_q */ {true},
- /* tensor_split */ {{}},
+ /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
/* reps */ 5,
/* verbose */ false,
/* output_format */ MARKDOWN
@@ -380,10 +380,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
const std::regex regex{R"([;/]+)"};
std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
std::vector<std::string> split_arg{it, {}};
- GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+ GGML_ASSERT(split_arg.size() <= llama_max_devices());
- std::array<float, LLAMA_MAX_DEVICES> tensor_split;
- for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+ std::vector<float> tensor_split(llama_max_devices());
+ for (size_t i = 0; i < llama_max_devices(); ++i) {
if (i < split_arg.size()) {
tensor_split[i] = std::stof(split_arg[i]);
} else {
@@ -459,7 +459,7 @@ struct cmd_params_instance {
int main_gpu;
bool no_kv_offload;
bool mul_mat_q;
- std::array<float, LLAMA_MAX_DEVICES> tensor_split;
+ std::vector<float> tensor_split;
llama_model_params to_llama_mparams() const {
llama_model_params mparams = llama_model_default_params();
@@ -582,7 +582,7 @@ struct test {
int main_gpu;
bool no_kv_offload;
bool mul_mat_q;
- std::array<float, LLAMA_MAX_DEVICES> tensor_split;
+ std::vector<float> tensor_split;
int n_prompt;
int n_gen;
std::string test_time;
@@ -704,7 +704,7 @@ struct test {
std::vector<std::string> get_values() const {
std::string tensor_split_str;
int max_nonzero = 0;
- for (int i = 0; i < LLAMA_MAX_DEVICES; i++) {
+ for (size_t i = 0; i < llama_max_devices(); i++) {
if (tensor_split[i] > 0) {
max_nonzero = i;
}
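The llama-bench hunks replace the fixed-size std::array with a std::vector sized by llama_max_devices(). A hedged sketch of the -ts parsing step in isolation; the helper name parse_tensor_split is illustrative and not part of llama-bench:

    #include "llama.h"

    #include <regex>
    #include <string>
    #include <vector>

    // Split a "3;1" style argument on ';' or '/' and pad with zeros up to
    // llama_max_devices() entries, mirroring the pattern in the hunk above.
    // e.g. parse_tensor_split("3;1") -> {3, 1, 0, ...}
    static std::vector<float> parse_tensor_split(const std::string & ts) {
        const std::regex regex{R"([;/]+)"};
        std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
        std::vector<std::string> split_arg{it, {}};

        std::vector<float> tensor_split(llama_max_devices(), 0.0f);
        for (size_t i = 0; i < split_arg.size() && i < tensor_split.size(); ++i) {
            tensor_split[i] = std::stof(split_arg[i]);
        }
        return tensor_split;
    }

The bounds check in the loop stands in for the GGML_ASSERT used in the example code, just to keep the sketch self-contained.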
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 21bdce8e..ea77125e 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1789,28 +1789,28 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
- if (llama_mlock_supported())
+ if (llama_supports_mlock())
{
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
- if (llama_mmap_supported())
+ if (llama_supports_mmap())
{
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
}
printf(" --numa attempt optimizations that help on some NUMA systems\n");
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
- printf(" -ngl N, --n-gpu-layers N\n");
- printf(" number of layers to store in VRAM\n");
- printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
- printf(" how to split the model across multiple GPUs, one of:\n");
- printf(" - none: use one GPU only\n");
- printf(" - layer (default): split layers and KV across GPUs\n");
- printf(" - row: split rows across GPUs\n");
- printf(" -ts SPLIT --tensor-split SPLIT\n");
- printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
- printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
- printf(" or for intermediate results and KV (with split-mode = row)\n");
-#endif
+ if (llama_supports_gpu_offload()) {
+ printf(" -ngl N, --n-gpu-layers N\n");
+ printf(" number of layers to store in VRAM\n");
+ printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
+ printf(" how to split the model across multiple GPUs, one of:\n");
+ printf(" - none: use one GPU only\n");
+ printf(" - layer (default): split layers and KV across GPUs\n");
+ printf(" - row: split rows across GPUs\n");
+ printf(" -ts SPLIT --tensor-split SPLIT\n");
+ printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
+ printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
+ printf(" or for intermediate results and KV (with split-mode = row)\n");
+ }
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
printf(" -a ALIAS, --alias ALIAS\n");
@@ -2066,13 +2066,13 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
- params.n_gpu_layers = std::stoi(argv[i]);
-#else
- LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
+ if (llama_supports_gpu_offload()) {
+ params.n_gpu_layers = std::stoi(argv[i]);
+ } else {
+ LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
"See main README.md for information on enabling GPU BLAS support",
{{"n_gpu_layers", params.n_gpu_layers}});
-#endif
+ }
}
else if (arg == "--split-mode" || arg == "-sm")
{
@@ -2115,9 +2115,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
const std::regex regex{R"([,/]+)"};
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
std::vector<std::string> split_arg{it, {}};
- GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+ GGML_ASSERT(split_arg.size() <= llama_max_devices());
- for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device)
+ for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device)
{
if (i_device < split_arg.size())
{
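The server.cpp hunks swap the compile-time #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD blocks (and the old llama_mlock_supported / llama_mmap_supported helpers) for runtime queries. A minimal sketch of those checks on their own, assuming only the three llama.h functions used in the diff:

    #include "llama.h"

    #include <cstdio>

    int main() {
        // Each capability is now reported by the linked library at runtime,
        // so a single binary can print the right help text for its build.
        if (llama_supports_mlock()) {
            printf("--mlock is available\n");
        }
        if (llama_supports_mmap()) {
            printf("--no-mmap is available\n");
        }
        if (llama_supports_gpu_offload()) {
            printf("-ngl / --n-gpu-layers is available\n");
        } else {
            printf("built without GPU offload support; -ngl would be ignored\n");
        }
        return 0;
    }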