24 files changed, 66 insertions, 29 deletions
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index b52d6845..55dfd978 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -82,7 +82,8 @@ int main(int argc, char ** argv) {
 
     // init LLM
 
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);
 
     // initialize the model
 
diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift
index 4d000534..d75c503d 100644
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -17,7 +17,7 @@ let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(argu
 let n_len: Int = 32
 
 // init LLM
-llama_backend_init(false)
+llama_backend_init()
 defer {
     llama_backend_free()
 }
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index b1775e0b..eab63669 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -50,7 +50,8 @@ int main(int argc, char ** argv) {
 
     // init LLM
 
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);
 
     // initialize the model
 
diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp
index 679b382e..866c6d7a 100644
--- a/examples/beam-search/beam-search.cpp
+++ b/examples/beam-search/beam-search.cpp
@@ -119,7 +119,8 @@ int main(int argc, char ** argv)
     // Init LLM :
     //---------------------------------
 
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);
 
     llama_model * model;
     llama_context * ctx;
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index b4688cf5..acff715e 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -74,7 +74,8 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);
 
     llama_model * model;
     llama_context * ctx;
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index bc9f6fa6..f21bc48f 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -568,7 +568,8 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);
 
     llama_model_params mparams = llama_model_params_from_gpt_params(params);
 
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index 72fb133b..92c67b7c 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -202,7 +202,8 @@ int main(int argc, char ** argv) {
     std::mt19937 rng(params.seed);
 
     LOG("%s: llama backend init\n", __func__);
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);
 
     llama_model * model;
     llama_context * ctx;
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index ddb0ba06..11410f8a 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1151,8 +1151,7 @@ int main(int argc, char ** argv) {
     if (!params.verbose) {
         llama_log_set(llama_null_log_callback, NULL);
     }
-    bool numa = false;
-    llama_backend_init(numa);
+    llama_backend_init();
 
     // initialize printer
     std::unique_ptr<printer> p;
diff --git a/examples/llama.android/app/src/main/cpp/llama-android.cpp b/examples/llama.android/app/src/main/cpp/llama-android.cpp
index d5e705dc..2beb1e0d 100644
--- a/examples/llama.android/app/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/app/src/main/cpp/llama-android.cpp
@@ -274,8 +274,8 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jboolean numa) {
-    llama_backend_init(numa);
+Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
+    llama_backend_init();
 }
 
 extern "C"
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
index fc79fd34..58fcf40c 100644
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -51,7 +51,7 @@ actor LlamaContext {
     }
 
     static func create_context(path: String) throws -> LlamaContext {
-        llama_backend_init(false)
+        llama_backend_init()
         var model_params = llama_model_default_params()
 
 #if targetEnvironment(simulator)
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index bef7f7c9..e29da6cb 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -218,7 +218,8 @@ static struct llava_context * llava_init(gpt_params * params) {
 
     auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
 
-    llama_backend_init(params->numa);
+    llama_backend_init();
+    llama_numa_init(params->numa);
 
     llama_model_params model_params = llama_model_params_from_gpt_params(*params);
 
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index e55a15a1..e2551e7a 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -54,7 +54,8 @@ int main(int argc, char ** argv) {
 #endif // LOG_DISABLE_LOGS
 
     // init llama.cpp
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);
 
     llama_model * model = NULL;
     llama_context * ctx = NULL;
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index 18235b8a..b53fae11 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -31,7 +31,8 @@ int main(int argc, char ** argv){
 #endif // LOG_DISABLE_LOGS
 
     // init llama.cpp
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);
 
     llama_model * model = NULL;
     llama_context * ctx = NULL;
diff --git a/examples/main/README.md b/examples/main/README.md
index c7997f66..7f84e426 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -283,7 +283,11 @@ These options help improve the performance and memory usage of the LLaMA models.
 
 ### NUMA support
 
--   `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
+-   `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilitizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
+-   `--numa isolate`: Pin all threads to the NUMA node that the program starts on. This limits the number of cores and amount of memory that can be used, but guarantees all memory access remains local to the NUMA node.
+-   `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allow arbitraty core usage patterns, for example a map that uses all the cores on one NUMA nodes, and just enough cores on a second node to saturate the inter-node memory bus.
+
+ These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
 
 ### Memory Float 32
 
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index e8ab8cba..f5d2f489 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -185,7 +185,8 @@ int main(int argc, char ** argv) {
     }
 
     LOG("%s: llama backend init\n", __func__);
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);
 
     llama_model * model;
     llama_context * ctx;
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index d2e074d9..7d11fcd5 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -122,7 +122,8 @@ int main(int argc, char ** argv) {
 #endif // LOG_DISABLE_LOGS
 
     // init llama.cpp
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);
 
     llama_model * model = NULL;
     llama_context * ctx = NULL;
diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp
index 5c002283..e12a1cdf 100644
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -71,7 +71,8 @@ int main(int argc, char ** argv) {
 
     // init LLM
 
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);
 
     // initialize the model
 
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index b2c131d4..67d2d329 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1809,7 +1809,8 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);
 
     llama_model * model;
     llama_context * ctx;
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 85f403ff..4a5c504e 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -237,7 +237,7 @@ int main(int argc, char ** argv) {
         params.imatrix = &imatrix_data;
     }
 
-    llama_backend_init(false);
+    llama_backend_init();
 
     // parse command line arguments
     const std::string fname_inp = argv[arg_idx];
diff --git a/examples/server/README.md b/examples/server/README.md
index 0f7373ae..8e141d22 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -16,6 +16,13 @@ Command line options:
 - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
 - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
+- `--numa STRATEGY`: Attempt one of the below optimization strategies  that help on some NUMA systems
+- `--numa distribute`: Spread execution evenly over all nodes
+- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
+- `--numa numactl`: Use the CPU map provided by numactl
+if run without this previously, it is recommended to drop the system page cache before using this
+see https://github.com/ggerganov/llama.cpp/issues/1437
+
 - `--numa`: Attempt optimizations that help on some NUMA systems.
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 2decd776..912c750c 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1855,7 +1855,10 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     {
         printf("  --no-mmap                 do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
-    printf("  --numa                    attempt optimizations that help on some NUMA systems\n");
+    printf("  --numa TYPE               attempt optimizations that help on some NUMA systems\n");
+    printf("                              - distribute: spread execution evenly over all nodes\n");
+    printf("                              - isolate: only spawn threads on CPUs on the node that execution started on\n");
+    printf("                              - numactl: use the CPU map provided my numactl\n");
     if (llama_supports_gpu_offload()) {
         printf("  -ngl N, --n-gpu-layers N\n");
         printf("                            number of layers to store in VRAM\n");
@@ -2264,9 +2267,17 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         {
             params.use_mmap = false;
         }
-        else if (arg == "--numa")
-        {
-            params.numa = true;
+        else if (arg == "--numa") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            } else {
+                std::string value(argv[i]);
+                /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
+                else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
+                else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+                else { invalid_param = true; break; }
+            }
         }
         else if (arg == "--embedding")
         {
@@ -2497,7 +2508,8 @@ int main(int argc, char **argv)
         params.model_alias = params.model;
     }
 
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);
 
     LOG_INFO("build info", {{"build", LLAMA_BUILD_NUMBER},
                             {"commit", LLAMA_COMMIT}});
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 9cfde830..39e2d8ea 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -31,7 +31,8 @@ int main(int argc, char ** argv) {
 
     // init LLM
 
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);
 
     // initialize the model
 
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 7b3af01f..3848791d 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -50,7 +50,8 @@ int main(int argc, char ** argv) {
 #endif // LOG_DISABLE_LOGS
 
     // init llama.cpp
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);
 
     llama_model * model_tgt = NULL;
     llama_model * model_dft = NULL;
diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp
index 4ff8e3fa..d95a9247 100644
--- a/examples/tokenize/tokenize.cpp
+++ b/examples/tokenize/tokenize.cpp
@@ -17,7 +17,7 @@ int main(int argc, char ** argv) {
 
     const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";
 
-    llama_backend_init(false);
+    llama_backend_init();
 
     llama_model_params model_params = llama_model_default_params();
     model_params.vocab_only = true;