summary | refs | log | tree | commit | diff
path: root/common
diff options
context:
space:
mode:
author    Georgi Gerganov <ggerganov@gmail.com>  2023-09-04 22:26:24 +0300
committer GitHub <noreply@github.com>            2023-09-04 22:26:24 +0300
commit    e36ecdccc8754783f93ad3ac8a09e540101f2ca0 (patch)
tree      160ce80ac89ad8d938c5a58bcb5aae4cdb020636 /common
parent    bd33e5ab92e7f214205792fc1cd9ca28e810f897 (diff)
build : on Mac OS enable Metal by default (#2901)
* build : on Mac OS enable Metal by default
* make : try to fix build on Linux
* make : move targets back to the top
* make : fix target clean
* llama : enable GPU inference by default with Metal
* llama : fix vocab_only logic when GPU is enabled
* common : better `n_gpu_layers` assignment
* readme : update Metal instructions
* make : fix merge conflict remnants
* gitignore : metal
Diffstat (limited to 'common')
-rw-r--r--  common/common.cpp  6
-rw-r--r--  common/common.h    2
2 files changed, 5 insertions, 3 deletions
diff --git a/common/common.cpp b/common/common.cpp
index 31382137..74e1b6fd 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -717,7 +717,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
lparams.n_ctx = params.n_ctx;
lparams.n_batch = params.n_batch;
- lparams.n_gpu_layers = params.n_gpu_layers;
+ if (params.n_gpu_layers != -1) {
+ lparams.n_gpu_layers = params.n_gpu_layers;
+ }
lparams.main_gpu = params.main_gpu;
lparams.tensor_split = params.tensor_split;
lparams.low_vram = params.low_vram;
@@ -1212,7 +1214,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
- fprintf(stream, "n_gpu_layers: %d # default: 0\n", params.n_gpu_layers);
+ fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
diff --git a/common/common.h b/common/common.h
index 105fb09e..85ac0df9 100644
--- a/common/common.h
+++ b/common/common.h
@@ -34,7 +34,7 @@ struct gpt_params {
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
- int32_t n_gpu_layers = 0; // number of layers to store in VRAM
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.