author    slaren <slarengh@gmail.com>    2024-06-13 03:11:35 +0200
committer GitHub <noreply@github.com>    2024-06-13 03:11:35 +0200
commit    f578b86b2123d0f92afbaa98a031df4d4464e582 (patch)
tree      2a21feec089e5fcaa6f9d34be5468a17c3a5ddc7 /llama.cpp
parent    1c641e6aac5c18b964e7b32d9dbbb4bf5301d0d7 (diff)
move BLAS to a separate backend (#6210)
* move BLAS to a separate backend
* rename GGML_USE_OPENBLAS to GGML_USE_BLAS
* alloc : reuse the same buffer when the same buffer type is used multiple times
* set the number of threads automatically for openblas and blis
* sched : print assignments when the GGML_SCHED_DEBUG env variable is set
* sched : allow ops with weights on an incompatible buffer type

This will cause the weight to be copied to a backend that supports the op, which is very costly. The weight should have been stored in a buffer of a backend that can run the op, but llama.cpp cannot do this automatically at the moment.

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
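For illustration, a minimal sketch of how the new backend gets wired up, using only the calls that appear in this patch (ggml_backend_blas_init, ggml_backend_blas_set_n_threads); the helper name init_blas_backend and the logging are placeholders for this sketch, not part of the change:

    #include <stdio.h>
    #include "ggml-backend.h"
    #ifdef GGML_USE_BLAS
    #  include "ggml-blas.h"
    #endif

    // hypothetical helper mirroring what llama_new_context_with_model and
    // llama_graph_compute do in this patch
    static ggml_backend_t init_blas_backend(int n_threads) {
    #ifdef GGML_USE_BLAS
        ggml_backend_t backend = ggml_backend_blas_init();
        if (backend == NULL) {
            fprintf(stderr, "failed to initialize BLAS backend\n");
            return NULL;
        }
        // forward the compute thread count to the BLAS backend
        // (for OpenBLAS and BLIS the patch also sets this automatically)
        ggml_backend_blas_set_n_threads(backend, n_threads);
        return backend;
    #else
        (void) n_threads;
        return NULL;
    #endif
    }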
Diffstat (limited to 'llama.cpp')
-rw-r--r--    llama.cpp    37
1 file changed, 25 insertions, 12 deletions
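The first hunk in llama_build_graph below replaces the single buffer-type check with a combined check: a backend is pinned for the "norm" tensor only if it supports the layer's buffer type and can either run the op directly or offload it. A commented restatement of that condition, using only identifiers taken from the hunk itself (lctx, cur, il):

    for (auto * backend : lctx.backends) {
        // the layer's weights live in a buffer type this backend can use
        const bool buft_ok = ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft);
        // the backend can run the op itself, or it is worth offloading the op to it
        const bool op_ok   = ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur);
        if (buft_ok && op_ok) {
            ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
            break;
        }
    }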
diff --git a/llama.cpp b/llama.cpp
index 8b675ea9..225ea977 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -21,6 +21,10 @@
# include "ggml-kompute.h"
#endif
+#ifdef GGML_USE_BLAS
+# include "ggml-blas.h"
+#endif
+
#ifdef GGML_USE_METAL
# include "ggml-metal.h"
#endif
@@ -2300,8 +2304,12 @@ struct llama_context {
#ifdef GGML_USE_METAL
ggml_backend_t backend_metal = nullptr;
#endif
+#ifdef GGML_USE_BLAS
+ ggml_backend_t backend_blas = nullptr;
+#endif
ggml_backend_t backend_cpu = nullptr;
+
const llama_model & model;
// key + value cache for the self attention
@@ -11529,7 +11537,8 @@ static struct ggml_cgraph * llama_build_graph(
if (batch.n_tokens < 32 || full_offload) {
if (il != -1 && strcmp(name, "norm") == 0) {
for (auto * backend : lctx.backends) {
- if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
+ if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
+ (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
break;
}
@@ -12026,6 +12035,11 @@ static void llama_graph_compute(
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
}
+#ifdef GGML_USE_BLAS
+ if (lctx.backend_blas != nullptr) {
+ ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
+ }
+#endif
ggml_backend_sched_graph_compute_async(lctx.sched, gf);
@@ -12248,17 +12262,6 @@ static int llama_decode_internal(
}
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
- // for big prompts, if BLAS is enabled, it is better to use only one thread
- // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
- // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
- // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
- // with the BLAS calls. need a better solution
- // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
- // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
- if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
- n_threads = std::min(4, n_threads);
- }
-
ggml_backend_sched_alloc_graph(lctx.sched, gf);
llama_set_inputs(lctx, u_batch);
@@ -16251,6 +16254,16 @@ struct llama_context * llama_new_context_with_model(
ctx->backends.push_back(backend);
}
#endif
+
+#ifdef GGML_USE_BLAS
+ ctx->backend_blas = ggml_backend_blas_init();
+ if (ctx->backend_blas == nullptr) {
+ LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
+ } else {
+ ctx->backends.push_back(ctx->backend_blas);
+ }
+#endif
+
#if defined(GGML_USE_RPC)
if (model->n_gpu_layers > 0) {
for (const auto & endpoint : model->rpc_servers) {