| author | Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> | 2023-05-28 11:48:57 -0600 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-05-28 11:48:57 -0600 |
| commit | 1b78ed20818b72306edc7208b9bfb69a1a0d3297 (patch) | |
| tree | 8c72e202378906da0cf7559755356e16f22d27c0 /examples/common.cpp | |
| parent | 337aea11390221bc925e4acb1f603f1649af2735 (diff) | |
Only show -ngl option when relevant + other doc/arg handling updates (#1625)
1. Add a `LLAMA_SUPPORTS_GPU_OFFLOAD` define to `llama.h` (defined when compiled with CLBlast or cuBLAS; see the sketch after this list)
2. Update the argument handling in the common example code to only show the `-ngl`, `--n-gpu-layers` option when GPU offload is possible.
3. Add an entry for the `-ngl`, `--n-gpu-layers` option to the `main` and `server` examples documentation
4. Update `main` and `server` examples documentation to use the new style dash separator argument format
5. Update the `server` example to use dash separators for its arguments and add `-ngl` to `--help` (only shown when compiled with appropriate support). It will still support `--memory_f32` and `--ctx_size` for compatibility (a compatibility-alias sketch appears after the diff below).
6. Add a warning discouraging use of `--memory-f32` to the `--help` text and documentation of the `main` and `server` examples. Rationale: https://github.com/ggerganov/llama.cpp/discussions/1593#discussioncomment-6004356
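
As a rough sketch of item 1: the `llama.h` hunk is not part of the diff shown on this page, so the exact backend macros used in the guard are an assumption here, not a quote of the actual change.

```cpp
// Sketch only: the real llama.h hunk is not shown in this page's diff.
// Assumption: GGML_USE_CUBLAS / GGML_USE_CLBLAST are the macros the build
// defines when compiled with cuBLAS or CLBlast support.
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif
```

The point of the define is that example code (such as `examples/common.cpp` below) can gate GPU-offload options on it with a plain `#ifdef`, so `-ngl` only appears in `--help` when offloading can actually work.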
Diffstat (limited to 'examples/common.cpp')
-rw-r--r-- | examples/common.cpp | 10 |
1 file changed, 9 insertions(+), 1 deletion(-)
```diff
diff --git a/examples/common.cpp b/examples/common.cpp
index 478dbafc..32247cef 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -289,7 +289,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
             params.n_gpu_layers = std::stoi(argv[i]);
+#else
+            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
         } else if (arg == "--mtest") {
@@ -416,7 +421,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
     fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
-    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value\n");
+    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
     fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
     fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
@@ -427,8 +433,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     if (llama_mmap_supported()) {
         fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, "                        number of layers to store in VRAM\n");
+#endif
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
```
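
The backward-compatibility part of item 5 (keeping `--memory_f32` and `--ctx_size` working alongside the new dash-separated names) lives in the `server` example, which is not part of this file's diff. A minimal, hypothetical sketch of how such an alias check could look; the helper name and call sites below are illustrative assumptions, not the actual server code:

```cpp
#include <string>

// Hypothetical helper: accept both the new dash-separated spelling and the
// legacy underscore spelling of an option (names here are illustrative).
static bool arg_matches(const std::string & arg,
                        const char * dashed, const char * legacy) {
    return arg == dashed || arg == legacy;
}

// Example use inside an argument-parsing loop:
//   if (arg_matches(arg, "--ctx-size",   "--ctx_size"))   { /* set n_ctx */ }
//   if (arg_matches(arg, "--memory-f32", "--memory_f32")) { /* set memory_f16 = false */ }
```

Only the dash-separated forms are advertised in `--help`; the underscore forms are accepted silently so existing scripts keep working.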