From 127c6ee6493a3084995d754d987f0240ffdffe6a Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Wed, 19 Mar 2025 19:17:03 +0100
Subject: Honor mmap setting when using tensor overrides (#270)

Co-authored-by: Iwan Kawrakow
---
 src/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/llama.cpp')

diff --git a/src/llama.cpp b/src/llama.cpp
index 76039f8e..03139e41 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8015,7 +8015,7 @@ static bool llm_load_tensors(
         // only the mmap region containing the tensors in the model is mapped to the backend buffer
         // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
         // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
-        if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
+        if (ml.use_mmap && use_mmap_buffer && (buft == llama_default_buffer_type_cpu(true) || buft == ggml_backend_cpu_buffer_type())) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 void * addr = nullptr;
                 size_t first, last;
-- 
cgit v1.2.3