author     Georgi Gerganov <ggerganov@gmail.com>  2023-09-04 22:26:24 +0300
committer  GitHub <noreply@github.com>            2023-09-04 22:26:24 +0300
commit     e36ecdccc8754783f93ad3ac8a09e540101f2ca0 (patch)
tree       160ce80ac89ad8d938c5a58bcb5aae4cdb020636 /llama.cpp
parent     bd33e5ab92e7f214205792fc1cd9ca28e810f897 (diff)
build : on Mac OS enable Metal by default (#2901)
* build : on Mac OS enable Metal by default
* make : try to fix build on Linux
* make : move targets back to the top
* make : fix target clean
* llama : enable GPU inference by default with Metal
* llama : fix vocab_only logic when GPU is enabled
* common : better `n_gpu_layers` assignment
* readme : update Metal instructions
* make : fix merge conflict remnants
* gitignore : metal
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  54
1 file changed, 29 insertions, 25 deletions
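
With this change, a Metal build (GGML_USE_METAL) starts from `n_gpu_layers = 1`, so any caller that begins with `llama_context_default_params()` gets Metal-accelerated inference without passing extra options. Below is a minimal sketch of how that looks against the C API of this era; the model path is a placeholder, and setting `n_gpu_layers = 0` is how a caller opts back out of GPU offload:

    #include <stdio.h>

    #include "llama.h"

    int main(void) {
        llama_backend_init(false);  // no NUMA optimizations

        // On a GGML_USE_METAL build, n_gpu_layers now defaults to 1,
        // so Metal offload is enabled without any extra configuration.
        struct llama_context_params params = llama_context_default_params();
        // params.n_gpu_layers = 0;  // uncomment to force CPU-only inference

        // placeholder model path
        struct llama_model * model = llama_load_model_from_file("models/7B/ggml-model-q4_0.gguf", params);
        if (model == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        struct llama_context * ctx = llama_new_context_with_model(model, params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to create context\n");
            llama_free_model(model);
            return 1;
        }

        printf("n_gpu_layers = %d\n", (int) params.n_gpu_layers);

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }
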
diff --git a/llama.cpp b/llama.cpp
index c97c1462..b9485df0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5340,7 +5340,7 @@ struct llama_context_params llama_context_default_params() {
         /*.seed             =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx            =*/ 512,
         /*.n_batch          =*/ 512,
-        /*.gpu_layers       =*/ 0,
+        /*.n_gpu_layers     =*/ 0,
         /*.main_gpu         =*/ 0,
         /*.tensor_split     =*/ nullptr,
         /*.rope_freq_base   =*/ 10000.0f,
@@ -5357,6 +5357,10 @@ struct llama_context_params llama_context_default_params() {
         /*.embedding        =*/ false,
     };
 
+#ifdef GGML_USE_METAL
+    result.n_gpu_layers = 1;
+#endif
+
     return result;
 }
@@ -5549,43 +5553,43 @@ struct llama_context * llama_new_context_with_model(
             }
 #endif
         }
-    }
 
 #ifdef GGML_USE_METAL
-    if (params.n_gpu_layers > 0) {
-        // this allocates all Metal resources and memory buffers
+        if (params.n_gpu_layers > 0) {
+            // this allocates all Metal resources and memory buffers
 
-        void * data_ptr  = NULL;
-        size_t data_size = 0;
+            void * data_ptr  = NULL;
+            size_t data_size = 0;
 
-        if (params.use_mmap) {
-            data_ptr  = ctx->model.mapping->addr;
-            data_size = ctx->model.mapping->size;
-        } else {
-            data_ptr  = ggml_get_mem_buffer(ctx->model.ctx);
-            data_size = ggml_get_mem_size  (ctx->model.ctx);
-        }
+            if (params.use_mmap) {
+                data_ptr  = ctx->model.mapping->addr;
+                data_size = ctx->model.mapping->size;
+            } else {
+                data_ptr  = ggml_get_mem_buffer(ctx->model.ctx);
+                data_size = ggml_get_mem_size  (ctx->model.ctx);
+            }
 
-        const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+            const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+            LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
 
 #define LLAMA_METAL_CHECK_BUF(result)                            \
-    if (!(result)) {                                             \
-        LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
-        llama_free(ctx);                                         \
-        return NULL;                                             \
-    }
+        if (!(result)) {                                             \
+            LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
+            llama_free(ctx);                                         \
+            return NULL;                                             \
+        }
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
-    }
+        }
 #endif
+    }
 
 #ifdef GGML_USE_MPI
     ctx->ctx_mpi = ggml_mpi_init();
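
For reference, the Metal block re-indented above follows a single pattern: each host memory region (model weights, eval scratch, KV cache, allocator buffer) is handed to `ggml_metal_add_buffer`, and context creation is aborted as soon as any registration fails. A standalone sketch of that pattern is below; the helper name and its arguments are hypothetical, while the `ggml_metal_add_buffer` calls mirror the usage in the diff (the last argument is the maximum tensor size passed for the weights, 0 where no hint is needed):

    #include <stdbool.h>
    #include <stddef.h>

    #include "ggml-metal.h"  // declares struct ggml_metal_context and ggml_metal_add_buffer()

    // Hypothetical helper mirroring LLAMA_METAL_CHECK_BUF: register a set of host
    // regions with the Metal context and report failure instead of continuing.
    static bool register_metal_buffers(struct ggml_metal_context * mctx,
                                       void * weights, size_t weights_size, size_t max_tensor_size,
                                       void * kv_data,  size_t kv_size) {
        // model weights, with the largest tensor size as a hint (as in the "data" call above)
        if (!ggml_metal_add_buffer(mctx, "data", weights, weights_size, max_tensor_size)) {
            return false;
        }
        // KV cache, no size hint needed (as in the "kv" call above)
        if (!ggml_metal_add_buffer(mctx, "kv", kv_data, kv_size, 0)) {
            return false;
        }
        return true;
    }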