From 9d0693bce38013364b1042568d9083353bfff48f Mon Sep 17 00:00:00 2001 From: kiltyj Date: Mon, 5 Jun 2023 13:24:04 -0700 Subject: metal : use shared buffers between CPU and GPU (#1696) * Use MTLDevice.newBufferWithBytesNoCopy to share buffers between CPU and GPU * Page-align buffers used by Metal * Remove trailing whitespace * Only import unistd.h for Metal builds * metal : remove unnecessary copies --------- Co-authored-by: Georgi Gerganov --- llama.cpp | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'llama.cpp') diff --git a/llama.cpp b/llama.cpp index e2511e53..d0e7151f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -53,7 +53,6 @@ enum e_model { MODEL_65B, }; - static const size_t MB = 1024*1024; // computed for n_ctx == 2048 @@ -1281,12 +1280,6 @@ static bool llama_eval_internal( ggml_set_name(embd, "embd"); memcpy(embd->data, tokens, N*ggml_element_size(embd)); -#ifdef GGML_USE_METAL - if (lctx.ctx_metal && N == 1) { - ggml_metal_set_tensor(lctx.ctx_metal, embd); - } -#endif - struct ggml_tensor * cur; struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); @@ -1484,12 +1477,6 @@ static bool llama_eval_internal( } ggml_graph_compute(ctx0, &gf); - - if (lctx.ctx_metal) { - // We need to sync the CPU KV cache with the GPU KV cache - ggml_metal_set_tensor(lctx.ctx_metal, kv_self.k); - ggml_metal_set_tensor(lctx.ctx_metal, kv_self.v); - } } #else ggml_graph_compute(ctx0, &gf); -- cgit v1.2.3