Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  67
1 file changed, 41 insertions, 26 deletions
diff --git a/llama.cpp b/llama.cpp
index e4db288d..b8bef6da 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2040,6 +2040,11 @@ struct llama_model {
             ggml_free(ctx);
         }
         for (ggml_backend_buffer_t buf : bufs) {
+#ifdef GGML_USE_CUBLAS
+            if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
+                ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
+            }
+#endif
             ggml_backend_buffer_free(buf);
         }
     }
@@ -5033,6 +5038,13 @@ static bool llm_load_tensors(
             size_t first, last;
             ml.get_mapping_range(&first, &last, ctx);
             buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
+#ifdef GGML_USE_CUBLAS
+            if (n_layer >= n_gpu_layers) {
+                ggml_backend_cuda_register_host_buffer(
+                    ggml_backend_buffer_get_base(buf),
+                    ggml_backend_buffer_get_size(buf));
+            }
+#endif
         }
 #ifdef GGML_USE_METAL
         else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
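The two GGML_USE_CUBLAS hunks above pair up: when the model is loaded with mmap and its tensors stay in a CPU buffer, and the model is not fully offloaded (n_layer >= n_gpu_layers), the buffer is registered with the CUDA backend as pinned (page-locked) host memory, and it is unregistered again when the buffers are freed in struct llama_model. Pinned memory lets CUDA copy the non-offloaded weights to the device without staging through a bounce buffer, so host-to-device transfers are faster. The helpers below are a minimal sketch, not part of the patch: the function names are hypothetical and the code assumes a GGML_USE_CUBLAS build, but every ggml call in it appears in the hunks above.

// minimal sketch (not part of the patch): pin/unpin a CPU backend buffer for
// faster host-to-device copies; helper names are illustrative only
#include "ggml-backend.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#endif

static void pin_cpu_buffer_for_cuda(ggml_backend_buffer_t buf) {
#ifdef GGML_USE_CUBLAS
    // only CPU buffers can be page-locked; device buffers are left alone
    if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
        ggml_backend_cuda_register_host_buffer(
            ggml_backend_buffer_get_base(buf),
            ggml_backend_buffer_get_size(buf));
    }
#else
    (void) buf;
#endif
}

static void unpin_cpu_buffer_for_cuda(ggml_backend_buffer_t buf) {
#ifdef GGML_USE_CUBLAS
    if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
        ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
    }
#else
    (void) buf;
#endif
}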
@@ -8231,7 +8243,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
             }

             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -8601,12 +8612,15 @@ static struct ggml_cgraph * llama_build_graph(
         }

         // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
-        // to fix this, we assign the norm layer manually to the backend of its layer
-        if (il != -1 && strcmp(name, "norm") == 0) {
-            for (auto * backend : lctx.backends) {
-                if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
-                    ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
-                    break;
+        // FIXME: fix in ggml_backend_sched
+        const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
+        if (batch.n_tokens < 32 || full_offload) {
+            if (il != -1 && strcmp(name, "norm") == 0) {
+                for (auto * backend : lctx.backends) {
+                    if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
+                        ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
+                        break;
+                    }
                 }
             }
         }
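For the hunk above: as the existing comment says, the scheduler may assign a layer's "norm" tensor to the backend of the previous layer, increasing data transfer between backends, so the build callback pins it to its own layer's backend. The change makes that pinning conditional, applying it only for small batches (batch.n_tokens < 32) or when the model is fully offloaded (n_gpu_layers > n_layer); the FIXME marks this as a workaround for something that should eventually be handled inside ggml_backend_sched. The helper below is a hypothetical extraction of the pinning step, a sketch rather than llama.cpp API, using only calls that appear in the hunk.

// sketch: assign a tensor to the first backend that supports a buffer type,
// mirroring what the build callback does for per-layer "norm" tensors
#include <vector>
#include "ggml-backend.h"

// hypothetical helper; in llama.cpp the backend list would be lctx.backends
static void set_tensor_backend_for_buft(
        ggml_backend_sched_t                sched,
        struct ggml_tensor                * tensor,
        ggml_backend_buffer_type_t          buft,
        const std::vector<ggml_backend_t> & backends) {
    for (ggml_backend_t backend : backends) {
        if (ggml_backend_buft_supports_backend(buft, backend)) {
            ggml_backend_sched_set_tensor_backend(sched, tensor, backend);
            break;
        }
    }
}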
@@ -13107,27 +13121,25 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(ctx->backend_metal);
         }
 #elif defined(GGML_USE_CUBLAS)
-        if (model->n_gpu_layers > 0) {
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
             // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
-            if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-                ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
+            ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        } else {
+            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+            for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
+                ggml_backend_t backend = ggml_backend_cuda_init(device);
                 if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
+                    LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
                     llama_free(ctx);
                     return nullptr;
                 }
                 ctx->backends.push_back(backend);
-            } else {
-                // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
-                for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
-                    ggml_backend_t backend = ggml_backend_cuda_init(device);
-                    if (backend == nullptr) {
-                        LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
-                        llama_free(ctx);
-                        return nullptr;
-                    }
-                    ctx->backends.push_back(backend);
-                }
             }
         }
 #elif defined(GGML_USE_VULKAN)
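The hunk above removes the model->n_gpu_layers > 0 guard, so in a CUDA build the backends are created from the split mode alone, even when no layers are offloaded: LLAMA_SPLIT_MODE_NONE and LLAMA_SPLIT_MODE_ROW initialize only the main GPU, while LLAMA_SPLIT_MODE_LAYER initializes one backend per device. The function below is a hypothetical, self-contained restatement of that selection logic; init_cuda_backends and its empty-vector-on-failure convention are illustrative, not llama.cpp API.

// sketch: pick which CUDA devices get a backend for a given split mode
#include <vector>
#include "llama.h"
#include "ggml-backend.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#endif

// hypothetical helper; returns an empty vector if any backend fails to init
static std::vector<ggml_backend_t> init_cuda_backends(llama_split_mode split_mode, int main_gpu) {
    std::vector<ggml_backend_t> backends;
#ifdef GGML_USE_CUBLAS
    const bool main_gpu_only = split_mode == LLAMA_SPLIT_MODE_NONE || split_mode == LLAMA_SPLIT_MODE_ROW;
    const int  n_backends    = main_gpu_only ? 1 : ggml_backend_cuda_get_device_count();
    for (int i = 0; i < n_backends; ++i) {
        const int device = main_gpu_only ? main_gpu : i;
        ggml_backend_t backend = ggml_backend_cuda_init(device);
        if (backend == nullptr) {
            // free any backends that were already created and report failure
            for (ggml_backend_t b : backends) {
                ggml_backend_free(b);
            }
            return {};
        }
        backends.push_back(backend);
    }
#else
    (void) split_mode; (void) main_gpu;
#endif
    return backends;
}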
@@ -13285,14 +13297,17 @@ struct llama_context * llama_new_context_with_model(
                 ggml_backend_t backend = ctx->backends[i];
                 ggml_backend_buffer_type_t buft = backend_buft[i];
                 size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
-                LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
-                        ggml_backend_buft_name(buft),
-                        size / 1024.0 / 1024.0);
+                if (size > 1) {
+                    LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+                            ggml_backend_buft_name(buft),
+                            size / 1024.0 / 1024.0);
+                }
             }

             // note: the number of splits during measure is higher than during inference due to the kv shift
             int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
-            LLAMA_LOG_INFO("%s: graph splits: %d\n", __func__, n_splits);
+            LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, gf->n_nodes);
+            LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
         }
     }