Diffstat (limited to 'common')
 common/common.cpp | 6 ++++--
 common/common.h   | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/common/common.cpp b/common/common.cpp
index 31382137..74e1b6fd 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -717,7 +717,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     lparams.n_ctx        = params.n_ctx;
     lparams.n_batch      = params.n_batch;
-    lparams.n_gpu_layers = params.n_gpu_layers;
+    if (params.n_gpu_layers != -1) {
+        lparams.n_gpu_layers = params.n_gpu_layers;
+    }
     lparams.main_gpu     = params.main_gpu;
     lparams.tensor_split = params.tensor_split;
     lparams.low_vram     = params.low_vram;
@@ -1212,7 +1214,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
     fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
     fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
-    fprintf(stream, "n_gpu_layers: %d # default: 0\n", params.n_gpu_layers);
+    fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
     fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
diff --git a/common/common.h b/common/common.h
index 105fb09e..85ac0df9 100644
--- a/common/common.h
+++ b/common/common.h
@@ -34,7 +34,7 @@ struct gpt_params {
     int32_t n_keep       = 0;  // number of tokens to keep from initial prompt
     int32_t n_draft      = 16; // number of tokens to draft during speculative decoding
     int32_t n_chunks     = -1; // max number of chunks to process (-1 = unlimited)
-    int32_t n_gpu_layers = 0;  // number of layers to store in VRAM
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
     int32_t main_gpu     = 0;  // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
     int32_t n_probs      = 0;  // if greater than 0, output the probabilities of top n_probs tokens.
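
What the -1 sentinel buys: with the old default of 0, llama_context_params_from_gpt_params() always overwrote the library's own n_gpu_layers value, so "no flag given" was indistinguishable from an explicit request for 0 offloaded layers. Below is a minimal standalone sketch of the pattern; the struct names and the pretend library default of 20 are invented here for illustration, and only the if-guard mirrors the actual change.

#include <cstdint>
#include <cstdio>

// Stand-ins for the real structs (llama_context_params in llama.h,
// gpt_params in common.h); the field name matches, the default of 20 is made up.
struct context_params {
    int32_t n_gpu_layers = 20; // pretend this is the library's built-in default
};

struct cli_params {
    int32_t n_gpu_layers = -1; // -1 means "not set on the command line"
};

// Same logic as the patched conversion function: copy the CLI value only
// when the user actually supplied one, otherwise keep the library default.
static context_params params_from_cli(const cli_params & params) {
    context_params lparams;
    if (params.n_gpu_layers != -1) {
        lparams.n_gpu_layers = params.n_gpu_layers;
    }
    return lparams;
}

int main() {
    cli_params unset;             // no --n-gpu-layers flag given
    cli_params forced;
    forced.n_gpu_layers = 0;      // user explicitly asked for CPU-only

    printf("library default kept: %d\n", params_from_cli(unset).n_gpu_layers);  // 20
    printf("explicit override:    %d\n", params_from_cli(forced).n_gpu_layers); // 0
    return 0;
}

This "sentinel means unset" idiom is the usual alternative to an optional wrapper when, as here, the params struct has to remain a plain C-compatible value type.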