Honor mmap setting when using tensor overrides (#270)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
author: Kawrakow <iwankawrakow@gmail.com> 2025-03-19 19:17:03 +0100
committer: GitHub <noreply@github.com> 2025-03-19 19:17:03 +0100
commit: 127c6ee6493a3084995d754d987f0240ffdffe6a (patch)
tree: 7faeebe448c1fb22837b3551afd963090466944a
parent: 22c84a126f50146a851641ccaa6e8a24f0985d79 (diff)
1 file changed, 1 insertion, 1 deletion
diff --git a/src/llama.cpp b/src/llama.cpp
index 76039f8e..03139e41 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8015,7 +8015,7 @@ static bool llm_load_tensors(
         // only the mmap region containing the tensors in the model is mapped to the backend buffer
         // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
         // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
-        if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
+        if (ml.use_mmap && use_mmap_buffer && (buft == llama_default_buffer_type_cpu(true) || buft == ggml_backend_cpu_buffer_type())) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 void * addr = nullptr;
                 size_t first, last;
author	Kawrakow <iwankawrakow@gmail.com>	2025-03-19 19:17:03 +0100
committer	GitHub <noreply@github.com>	2025-03-19 19:17:03 +0100
commit	127c6ee6493a3084995d754d987f0240ffdffe6a (patch)
tree	7faeebe448c1fb22837b3551afd963090466944a
parent	22c84a126f50146a851641ccaa6e8a24f0985d79 (diff)