Diffstat (limited to 'examples')
-rw-r--r--  examples/batched-bench/batched-bench.cpp |   3
-rw-r--r--  examples/llama-bench/llama-bench.cpp     | 146
-rw-r--r--  examples/server/server.cpp               |  40
3 files changed, 116 insertions(+), 73 deletions(-)
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 57596ed9..7924db26 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -88,7 +88,10 @@ int main(int argc, char ** argv) {
llama_model_params model_params = llama_model_default_params();
+ const std::vector<float> t_split (LLAMA_MAX_DEVICES, 0.0f);
+
model_params.n_gpu_layers = n_gpu_layers;
+ model_params.tensor_split = t_split.data();
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
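For context, a minimal sketch of the pattern batched-bench now follows (model path and layer count are hypothetical stand-ins): the tensor_split buffer must outlive the load call, and leaving every proportion at 0.0f presumably lets llama.cpp fall back to its default split across devices.

#include <vector>
#include "llama.h"

int main() {
    // all zeros: llama.cpp decides the split across devices on its own
    const std::vector<float> t_split(LLAMA_MAX_DEVICES, 0.0f);

    llama_model_params model_params = llama_model_default_params();
    model_params.n_gpu_layers = 99;              // hypothetical layer count
    model_params.tensor_split = t_split.data();  // must stay valid while the model loads

    llama_model * model = llama_load_model_from_file("model.gguf", model_params); // hypothetical path
    if (model) {
        llama_free_model(model);
    }
    return 0;
}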
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 7f7186cd..97325b5b 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -128,6 +128,25 @@ static std::string get_gpu_info() {
// command line params
enum output_formats {CSV, JSON, MARKDOWN, SQL};
+static const char * output_format_str(output_formats format) {
+ switch (format) {
+ case CSV: return "csv";
+ case JSON: return "json";
+ case MARKDOWN: return "md";
+ case SQL: return "sql";
+ default: GGML_ASSERT(!"invalid output format");
+ }
+}
+
+static const char * split_mode_str(llama_split_mode mode) {
+ switch (mode) {
+ case LLAMA_SPLIT_NONE: return "none";
+ case LLAMA_SPLIT_LAYER: return "layer";
+ case LLAMA_SPLIT_ROW: return "row";
+ default: GGML_ASSERT(!"invalid split mode");
+ }
+}
+
struct cmd_params {
std::vector<std::string> model;
std::vector<int> n_prompt;
@@ -137,6 +156,7 @@ struct cmd_params {
std::vector<ggml_type> type_v;
std::vector<int> n_threads;
std::vector<int> n_gpu_layers;
+ std::vector<llama_split_mode> split_mode;
std::vector<int> main_gpu;
std::vector<bool> no_kv_offload;
std::vector<bool> mul_mat_q;
@@ -155,6 +175,7 @@ static const cmd_params cmd_params_defaults = {
/* type_v */ {GGML_TYPE_F16},
/* n_threads */ {get_num_physical_cores()},
/* n_gpu_layers */ {99},
+ /* split_mode */ {LLAMA_SPLIT_LAYER},
/* main_gpu */ {0},
/* no_kv_offload */ {false},
/* mul_mat_q */ {true},
@@ -169,21 +190,22 @@ static void print_usage(int /* argc */, char ** argv) {
printf("\n");
printf("options:\n");
printf(" -h, --help\n");
- printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
- printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
- printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
- printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
- printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
- printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
- printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
- printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
- printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
- printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
- printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
- printf(" -ts, --tensor_split <ts0/ts1/..> \n");
- printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
- printf(" -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
- printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
+ printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
+ printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
+ printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+ printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
+ printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+ printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+ printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
+ printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+ printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
+ printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
+ printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
+ printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
+ printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
+ printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
+ printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
+ printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf("\n");
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
}
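The new -sm usage line formats its default through transform_to_str, which this diff calls but never shows; a sketch of what that helper presumably looks like (the real definition lives earlier in llama-bench.cpp):

#include <algorithm>
#include <iterator>
#include <string>
#include <vector>

// Presumed shape of transform_to_str: map each value through the given
// stringifier so the result can be joined with "," for the usage text.
template <typename T, typename F>
static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
    std::vector<std::string> str_values;
    std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
    return str_values;
}

With the defaults above, join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",") renders as "layer".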
@@ -306,6 +328,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = split<int>(argv[i], split_delim);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
+ } else if (arg == "-sm" || arg == "--split-mode") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ auto p = split<std::string>(argv[i], split_delim);
+ std::vector<llama_split_mode> modes;
+ for (const auto & m : p) {
+ llama_split_mode mode;
+ if (m == "none") {
+ mode = LLAMA_SPLIT_NONE;
+ } else if (m == "layer") {
+ mode = LLAMA_SPLIT_LAYER;
+ } else if (m == "row") {
+ mode = LLAMA_SPLIT_ROW;
+ } else {
+ invalid_param = true;
+ break;
+ }
+ modes.push_back(mode);
+ }
+ params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
} else if (arg == "-mg" || arg == "--main-gpu") {
if (++i >= argc) {
invalid_param = true;
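As with the other list-valued flags, the -sm handler splits its argument on split_delim, so e.g. -sm none,layer,row benchmarks all three modes in one invocation. The split<T> helper it uses is also outside this diff; a sketch under that assumption:

#include <sstream>
#include <string>
#include <vector>

// Presumed shape of split<T>: tokenize on the delimiter and stream-convert
// each token to T; for T = std::string this is a plain delimiter split.
template <typename T>
static std::vector<T> split(const std::string & str, char delim) {
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        T value;
        std::istringstream token_stream(token);
        token_stream >> value;
        values.push_back(value);
    }
    return values;
}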
@@ -392,6 +436,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
+ if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
@@ -410,6 +455,7 @@ struct cmd_params_instance {
ggml_type type_v;
int n_threads;
int n_gpu_layers;
+ llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
bool mul_mat_q;
@@ -419,6 +465,7 @@ struct cmd_params_instance {
llama_model_params mparams = llama_model_default_params();
mparams.n_gpu_layers = n_gpu_layers;
+ mparams.split_mode = split_mode;
mparams.main_gpu = main_gpu;
mparams.tensor_split = tensor_split.data();
@@ -428,6 +475,7 @@ struct cmd_params_instance {
bool equal_mparams(const cmd_params_instance & other) const {
return model == other.model &&
n_gpu_layers == other.n_gpu_layers &&
+ split_mode == other.split_mode &&
main_gpu == other.main_gpu &&
tensor_split == other.tensor_split;
}
@@ -446,45 +494,13 @@ struct cmd_params_instance {
}
};
-static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_params & params, int n_gen, int n_prompt) {
- std::vector<cmd_params_instance> instances;
-
- for (const auto & m : params.model)
- for (const auto & nl : params.n_gpu_layers)
- for (const auto & mg : params.main_gpu)
- for (const auto & ts : params.tensor_split)
- for (const auto & nb : params.n_batch)
- for (const auto & tk : params.type_k)
- for (const auto & tv : params.type_v)
- for (const auto & mmq : params.mul_mat_q)
- for (const auto & nkvo : params.no_kv_offload)
- for (const auto & nt : params.n_threads) {
- cmd_params_instance instance = {
- /* .model = */ m,
- /* .n_prompt = */ n_prompt,
- /* .n_gen = */ n_gen,
- /* .n_batch = */ nb,
- /* .type_k = */ tk,
- /* .type_v = */ tv,
- /* .n_threads = */ nt,
- /* .n_gpu_layers = */ nl,
- /* .main_gpu = */ mg,
- /* .no_kv_offload= */ nkvo,
- /* .mul_mat_q = */ mmq,
- /* .tensor_split = */ ts,
- };
- instances.push_back(instance);
- }
- return instances;
-}
-
static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
std::vector<cmd_params_instance> instances;
-#if 1
// this ordering minimizes the number of times that each model needs to be reloaded
for (const auto & m : params.model)
for (const auto & nl : params.n_gpu_layers)
+ for (const auto & sm : params.split_mode)
for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split)
for (const auto & nb : params.n_batch)
@@ -506,6 +522,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
+ /* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .mul_mat_q = */ mmq,
@@ -527,6 +544,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
+ /* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .mul_mat_q = */ mmq,
@@ -535,24 +553,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
instances.push_back(instance);
}
}
-#else
- // this ordering separates the prompt and generation tests
- for (const auto & n_prompt : params.n_prompt) {
- if (n_prompt == 0) {
- continue;
- }
- auto instances_prompt = get_cmd_params_instances_int(params, 0, n_prompt);
- instances.insert(instances.end(), instances_prompt.begin(), instances_prompt.end());
- }
-
- for (const auto & n_gen : params.n_gen) {
- if (n_gen == 0) {
- continue;
- }
- auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0);
- instances.insert(instances.end(), instances_gen.begin(), instances_gen.end());
- }
-#endif
return instances;
}
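Adding split_mode both to the outer loop nest and to equal_mparams() matters because of how the instances are consumed; a hedged sketch of that consumption loop (it lives in main(), outside this diff):

// Because model, n_gpu_layers, split_mode, main_gpu and tensor_split vary in
// the outermost loops, instances that share model params are adjacent, and
// equal_mparams() lets one loaded model serve all of them without reloading.
llama_model * lmodel = nullptr;
const cmd_params_instance * prev_inst = nullptr;
for (const auto & inst : get_cmd_params_instances(params)) {
    if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
        if (lmodel) {
            llama_free_model(lmodel); // drop the previous model before reloading
        }
        lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
        prev_inst = &inst;
    }
    // ... build a llama_context from inst and run the timed prompt/gen test ...
}
llama_free_model(lmodel);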
@@ -576,6 +576,7 @@ struct test {
ggml_type type_k;
ggml_type type_v;
int n_gpu_layers;
+ llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
bool mul_mat_q;
@@ -597,6 +598,7 @@ struct test {
type_k = inst.type_k;
type_v = inst.type_v;
n_gpu_layers = inst.n_gpu_layers;
+ split_mode = inst.split_mode;
main_gpu = inst.main_gpu;
no_kv_offload = inst.no_kv_offload;
mul_mat_q = inst.mul_mat_q;
@@ -660,7 +662,8 @@ struct test {
"cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_threads", "type_k", "type_v",
- "n_gpu_layers", "main_gpu", "no_kv_offload",
+ "n_gpu_layers", "split_mode",
+ "main_gpu", "no_kv_offload",
"mul_mat_q", "tensor_split",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
@@ -711,7 +714,8 @@ struct test {
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
- std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(no_kv_offload),
+ std::to_string(n_gpu_layers), split_mode_str(split_mode),
+ std::to_string(main_gpu), std::to_string(no_kv_offload),
std::to_string(mul_mat_q), tensor_split_str,
std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()),
@@ -867,6 +871,9 @@ struct markdown_printer : public printer {
if (field == "n_gpu_layers") {
return "ngl";
}
+ if (field == "split_mode") {
+ return "sm";
+ }
if (field == "n_threads") {
return "threads";
}
@@ -907,6 +914,9 @@ struct markdown_printer : public printer {
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
fields.push_back("main_gpu");
}
+ if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
+ fields.push_back("split_mode");
+ }
if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
fields.push_back("mul_mat_q");
}
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 1d30a15a..c1ab8f9d 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2005,12 +2005,15 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
printf(" -ngl N, --n-gpu-layers N\n");
printf(" number of layers to store in VRAM\n");
+ printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
+ printf(" how to split the model across multiple GPUs, one of:\n");
+ printf(" - none: use one GPU only\n");
+ printf(" - layer (default): split layers and KV across GPUs\n");
+ printf(" - row: split rows across GPUs\n");
printf(" -ts SPLIT --tensor-split SPLIT\n");
- printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
- printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
- printf(" -nommq, --no-mul-mat-q\n");
- printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
- printf(" Not recommended since this is both slower and uses more VRAM.\n");
+ printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
+ printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
+ printf(" or for intermediate results and KV (with split-mode = row)\n");
#endif
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
@@ -2254,6 +2257,33 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
{{"n_gpu_layers", params.n_gpu_layers}});
#endif
}
+ else if (arg == "--split-mode" || arg == "-sm")
+ {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ std::string arg_next = argv[i];
+ if (arg_next == "none")
+ {
+ params.split_mode = LLAMA_SPLIT_NONE;
+ }
+ else if (arg_next == "layer")
+ {
+ params.split_mode = LLAMA_SPLIT_LAYER;
+ }
+ else if (arg_next == "row")
+ {
+ params.split_mode = LLAMA_SPLIT_ROW;
+ }
+ else {
+ invalid_param = true;
+ break;
+ }
+#ifndef GGML_USE_CUBLAS
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUBLAS
+ }
else if (arg == "--tensor-split" || arg == "-ts")
{
if (++i >= argc)