summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author    FK <sozforex@gmail.com> 2023-09-13 08:50:46 +0200
committer GitHub <noreply@github.com> 2023-09-13 08:50:46 +0200
commit    84e723653ca99d51a74b454984acf2c077468561 (patch)
tree      62ddb7a849eb2ecf10dc831bf4ea960320e4dd5f
parent    b52b29ab9d601bb298050bcd2261169bc917ba2c (diff)
speculative: add --n-gpu-layers-draft option (#3063)
-rw-r--r--common/common.cpp13
-rw-r--r--common/common.h1
-rw-r--r--examples/speculative/speculative.cpp1
3 files changed, 15 insertions, 0 deletions
diff --git a/common/common.cpp b/common/common.cpp
index 6e5d5b4d..afc9b8a5 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -375,6 +375,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
+ } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+ params.n_gpu_layers_draft = std::stoi(argv[i]);
+#else
+ fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+ fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
} else if (arg == "--main-gpu" || arg == "-mg") {
if (++i >= argc) {
invalid_param = true;
@@ -664,6 +675,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
printf(" -ngl N, --n-gpu-layers N\n");
printf(" number of layers to store in VRAM\n");
+ printf(" -ngld N, --n-gpu-layers-draft N\n");
+ printf(" number of layers to store in VRAM for the draft model\n");
printf(" -ts SPLIT --tensor-split SPLIT\n");
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
diff --git a/common/common.h b/common/common.h
index 012bf5e1..238635ae 100644
--- a/common/common.h
+++ b/common/common.h
@@ -38,6 +38,7 @@ struct gpt_params {
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+ int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 822d7b52..2cd153f9 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -42,6 +42,7 @@ int main(int argc, char ** argv) {
// load the draft model
params.model = params.model_draft;
+ params.n_gpu_layers = params.n_gpu_layers_draft;
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
// tokenize the prompt