author     Neo Zhang Jianyu <jianyu.zhang@intel.com>  2024-03-02 19:49:30 +0800
committer  GitHub <noreply@github.com>                2024-03-02 19:49:30 +0800
commit     715641391dda1ff9762dc5d99d9a30acce99f2c6 (patch)
tree       e57b359034b61f8d3ea4de372c2c3c0ec885c943 /llama.cpp
parent     9bf297a02bfbd474e51912409a470dd797e2fe13 (diff)
Support multiple GPUs (split mode) on SYCL backend (#5806)
* support multiple cards: split-mode - layer|row
* rm warning
* rebase with master, support two new OPs, close feature for -sm=row, fix for unit test
* update news
* fix merge error
* update according to review comments
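
As a rough usage sketch (not part of the commit), the snippet below shows how a program built against the llama.cpp C API of this period might request the new layer-wise split across SYCL GPUs. The model path and layer count are placeholders, and it assumes llama_model_default_params(), llama_load_model_from_file(), the LLAMA_SPLIT_MODE_LAYER enum value, and the no-argument llama_backend_init() of this era; row split (-sm=row) is closed for SYCL by this change, so layer split is the multi-GPU path.

    // Hedged sketch: distribute offloaded layers across all SYCL devices.
    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 99;                     // placeholder: offload all layers
        mparams.split_mode   = LLAMA_SPLIT_MODE_LAYER; // split layers across the SYCL GPUs
        mparams.main_gpu     = 0;                      // main device for non-split tensors

        llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
        if (model == nullptr) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        // ... create a context with llama_new_context_with_model() and run inference ...

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }

On the command line this corresponds to the split-mode (-sm) option called out in the commit message, with layer as the value.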
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  49
1 file changed, 41 insertions(+), 8 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index b1db5b17..cb6266a4 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -104,6 +104,7 @@
#define LLAMA_MAX_NODES 8192
#define LLAMA_MAX_EXPERTS 8
+
//
// logging
//
@@ -1429,7 +1430,9 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
        buft = ggml_backend_cuda_host_buffer_type();
    }
#elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_host_buffer_type();
+    if (host_buffer) {
+        buft = ggml_backend_sycl_host_buffer_type();
+    }
#elif defined(GGML_USE_CPU_HBM)
buft = ggml_backend_cpu_hbm_buffer_type();
#elif defined(GGML_USE_VULKAN)
@@ -1483,6 +1486,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
    }
#endif
+#ifdef GGML_USE_SYCL
+    if (ggml_backend_sycl_get_device_count() > 1) {
+        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+    }
+#endif
+
    if (buft == nullptr) {
        buft = llama_default_buffer_type_offload(fallback_gpu);
    }
@@ -1494,6 +1503,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
static size_t llama_get_device_count() {
#if defined(GGML_USE_CUBLAS)
    return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
    return ggml_backend_vk_get_device_count();
#else
@@ -1507,6 +1518,11 @@ static size_t llama_get_device_memory(int device) {
    size_t free;
    ggml_backend_cuda_get_device_memory(device, &total, &free);
    return free;
+#elif defined(GGML_USE_SYCL)
+    size_t total;
+    size_t free;
+    ggml_backend_sycl_get_device_memory(device, &total, &free);
+    return free;
#elif defined(GGML_USE_VULKAN)
    size_t total;
    size_t free;
@@ -12075,13 +12091,31 @@ struct llama_context * llama_new_context_with_model(
    }
#elif defined(GGML_USE_SYCL)
    if (model->n_gpu_layers > 0) {
-        ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
-        if (backend == nullptr) {
-            LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
-            llama_free(ctx);
-            return nullptr;
+        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
+            ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, model->main_gpu, main_gpu_index);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        } else {
+            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+            int id_list[GGML_SYCL_MAX_DEVICES];
+            ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
+            for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
+                int device_id = id_list[i];
+                ggml_backend_t backend = ggml_backend_sycl_init(i);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, device_id, i);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
        }
-        ctx->backends.push_back(backend);
    }
#elif defined(GGML_USE_KOMPUTE)
if (model->n_gpu_layers > 0) {
@@ -12161,7 +12195,6 @@ struct llama_context * llama_new_context_with_model(
        ggml_set_name(ctx->inp_cls, "inp_cls");
        ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
-
        LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
                ggml_backend_buffer_name(ctx->buf_input),
                ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);