summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author    FK <sozforex@gmail.com> 2023-09-13 08:50:46 +0200
committer GitHub <noreply@github.com> 2023-09-13 08:50:46 +0200
commit    84e723653ca99d51a74b454984acf2c077468561 (patch)
tree      62ddb7a849eb2ecf10dc831bf4ea960320e4dd5f
parent    b52b29ab9d601bb298050bcd2261169bc917ba2c (diff)
speculative: add --n-gpu-layers-draft option (#3063)
-rw-r--r--common/common.cpp13
-rw-r--r--common/common.h1
-rw-r--r--examples/speculative/speculative.cpp1
3 files changed, 15 insertions, 0 deletions
diff --git a/common/common.cpp b/common/common.cpp
index 6e5d5b4d..afc9b8a5 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -375,6 +375,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
+ } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+ params.n_gpu_layers_draft = std::stoi(argv[i]);
+#else
+ fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+ fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
} else if (arg == "--main-gpu" || arg == "-mg") {
if (++i >= argc) {
invalid_param = true;
@@ -664,6 +675,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
printf(" -ngl N, --n-gpu-layers N\n");
printf(" number of layers to store in VRAM\n");
+ printf(" -ngld N, --n-gpu-layers-draft N\n");
+ printf(" number of layers to store in VRAM for the draft model\n");
printf(" -ts SPLIT --tensor-split SPLIT\n");
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
diff --git a/common/common.h b/common/common.h
index 012bf5e1..238635ae 100644
--- a/common/common.h
+++ b/common/common.h
@@ -38,6 +38,7 @@ struct gpt_params {
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+ int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 822d7b52..2cd153f9 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -42,6 +42,7 @@ int main(int argc, char ** argv) {
// load the draft model
params.model = params.model_draft;
+ params.n_gpu_layers = params.n_gpu_layers_draft;
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
// tokenize the prompt