Diffstat (limited to 'examples/server/server.cpp')
-rw-r--r-- | examples/server/server.cpp | 15 |
1 files changed, 12 insertions, 3 deletions
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 9eacc929..9aa7db25 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -385,7 +385,9 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help            show this help message and exit\n");
     fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
-    fprintf(stderr, "  --memory_f32          use f32 instead of f16 for memory key+value\n");
+    fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
     fprintf(stderr, "  --embedding           enable embedding mode\n");
     fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
     if (llama_mlock_supported())
@@ -396,8 +398,10 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
     {
         fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, "                        number of layers to store in VRAM\n");
+#endif
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
     fprintf(stderr, "  -a ALIAS, --alias ALIAS\n");
@@ -473,7 +477,7 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
            server_print_usage(argc, argv, default_params);
            exit(0);
        }
-        else if (arg == "-c" || arg == "--ctx_size")
+        else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size")
        {
            if (++i >= argc)
            {
@@ -482,7 +486,7 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
            }
            params.n_ctx = std::stoi(argv[i]);
        }
-        else if (arg == "--memory_f32")
+        else if (arg == "--memory-f32" || arg == "--memory_f32")
        {
            params.memory_f16 = false;
        }
@@ -493,7 +497,12 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
                invalid_param = true;
                break;
            }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
            params.n_gpu_layers = std::stoi(argv[i]);
+#else
+            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
        }
        else
        {
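
The change follows a common CLI-compatibility pattern: the new dashed spellings (--ctx-size, --memory-f32) are accepted alongside the legacy underscore spellings, and the GPU-layers option is compiled out (with a parse-time warning) when the binary was built without offload support. Below is a minimal standalone sketch of that pattern, assuming a toy main() with illustrative defaults; the flag list and values are not taken from server.cpp itself.

    // Sketch only: mirrors the aliasing and #ifdef guard used in the diff.
    #include <cstdio>
    #include <string>

    int main(int argc, char **argv) {
        int n_ctx        = 512;  // illustrative default, not the server's
        int n_gpu_layers = 0;

        for (int i = 1; i < argc; i++) {
            std::string arg = argv[i];
            // accept both the new dashed spelling and the old underscore spelling
            if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") {
                if (++i >= argc) { fprintf(stderr, "error: missing value for %s\n", arg.c_str()); return 1; }
                n_ctx = std::stoi(argv[i]);
            } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
                if (++i >= argc) { fprintf(stderr, "error: missing value for %s\n", arg.c_str()); return 1; }
    #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
                n_gpu_layers = std::stoi(argv[i]);  // only honoured in GPU-enabled builds
    #else
                // the value is consumed but ignored, mirroring the warning added in the diff
                fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
    #endif
            }
        }

        fprintf(stderr, "n_ctx = %d, n_gpu_layers = %d\n", n_ctx, n_gpu_layers);
        return 0;
    }

Compiling the sketch with -DLLAMA_SUPPORTS_GPU_OFFLOAD exercises the GPU branch; without it, the option is accepted but ignored with a warning, which is the behaviour the diff gives the server.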