summaryrefslogtreecommitdiff
path: root/common/common.cpp
diff options
context:
space:
mode:
author	FK <sozforex@gmail.com>	2023-09-13 08:50:46 +0200
committer	GitHub <noreply@github.com>	2023-09-13 08:50:46 +0200
commit	84e723653ca99d51a74b454984acf2c077468561 (patch)
tree	62ddb7a849eb2ecf10dc831bf4ea960320e4dd5f /common/common.cpp
parent	b52b29ab9d601bb298050bcd2261169bc917ba2c (diff)
speculative: add --n-gpu-layers-draft option (#3063)
Diffstat (limited to 'common/common.cpp')
-rw-r--r--common/common.cpp13
1 file changed, 13 insertions(+), 0 deletions(-)
diff --git a/common/common.cpp b/common/common.cpp
index 6e5d5b4d..afc9b8a5 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -375,6 +375,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
+ } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+ params.n_gpu_layers_draft = std::stoi(argv[i]);
+#else
+ fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+ fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
} else if (arg == "--main-gpu" || arg == "-mg") {
if (++i >= argc) {
invalid_param = true;
@@ -664,6 +675,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
printf(" -ngl N, --n-gpu-layers N\n");
printf(" number of layers to store in VRAM\n");
+ printf(" -ngld N, --n-gpu-layers-draft N\n");
+ printf(" number of layers to store in VRAM for the draft model\n");
printf(" -ts SPLIT --tensor-split SPLIT\n");
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");