| author | Neo Zhang Jianyu <jianyu.zhang@intel.com> | 2024-03-02 19:49:30 +0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-03-02 19:49:30 +0800 |
| commit | 715641391dda1ff9762dc5d99d9a30acce99f2c6 (patch) | |
| tree | e57b359034b61f8d3ea4de372c2c3c0ec885c943 /llama.cpp | |
| parent | 9bf297a02bfbd474e51912409a470dd797e2fe13 (diff) | |
Support multiple GPUs (split mode) on SYCL backend (#5806)
* support multiple cards: split-mode - layer|row (usage sketch below)
* remove warning
* rebase with master, support two new OPs, disable the feature for -sm=row, fix for unit test
* update news
* fix merge error
* update according to review comments
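For reference, the split mode this change enables on the SYCL backend is selected through the same public `llama_model_params` fields other backends already use (`split_mode`, `main_gpu`, `n_gpu_layers`). Below is a minimal usage sketch, not part of the commit; `model.gguf` is a placeholder path and the calls assume the `llama.h` API as of this revision:

```cpp
// Minimal sketch (not from the commit): request layer-wise splitting across
// SYCL GPUs through the public llama.h API. "model.gguf" is a placeholder.
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;                     // offload all layers
    mparams.split_mode   = LLAMA_SPLIT_MODE_LAYER; // one backend per SYCL GPU (see diff below)
    mparams.main_gpu     = 0;                      // used by the NONE/ROW split modes

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) {
        llama_backend_free();
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == nullptr) {
        llama_free_model(model);
        llama_backend_free();
        return 1;
    }

    // ... evaluate tokens here ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```

Per the notes above and the comment added in the diff, the row split feature is disabled for SYCL in this change: both LLAMA_SPLIT_MODE_NONE and LLAMA_SPLIT_MODE_ROW use only the main GPU, while LLAMA_SPLIT_MODE_LAYER distributes layers across all detected SYCL devices.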
Diffstat (limited to 'llama.cpp')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | llama.cpp | 49 |

1 file changed, 41 insertions, 8 deletions
@@ -104,6 +104,7 @@
 #define LLAMA_MAX_NODES 8192
 #define LLAMA_MAX_EXPERTS 8
 
+
 //
 // logging
 //
@@ -1429,7 +1430,9 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
         buft = ggml_backend_cuda_host_buffer_type();
     }
 #elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_host_buffer_type();
+    if (host_buffer) {
+        buft = ggml_backend_sycl_host_buffer_type();
+    }
 #elif defined(GGML_USE_CPU_HBM)
     buft = ggml_backend_cpu_hbm_buffer_type();
 #elif defined(GGML_USE_VULKAN)
@@ -1483,6 +1486,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
     }
 #endif
 
+#ifdef GGML_USE_SYCL
+    if (ggml_backend_sycl_get_device_count() > 1) {
+        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+    }
+#endif
+
     if (buft == nullptr) {
         buft = llama_default_buffer_type_offload(fallback_gpu);
     }
@@ -1494,6 +1503,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 static size_t llama_get_device_count() {
 #if defined(GGML_USE_CUBLAS)
     return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
     return ggml_backend_vk_get_device_count();
 #else
@@ -1507,6 +1518,11 @@ static size_t llama_get_device_memory(int device) {
     size_t total;
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &total, &free);
     return free;
+#elif defined(GGML_USE_SYCL)
+    size_t total;
+    size_t free;
+    ggml_backend_sycl_get_device_memory(device, &total, &free);
+    return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
@@ -12075,13 +12091,31 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_SYCL)
         if (model->n_gpu_layers > 0) {
-            ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
-                llama_free(ctx);
-                return nullptr;
+            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+            if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+                int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
+                ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            } else {
+                // LLAMA_SPLIT_LAYER requires a backend for each GPU
+                int id_list[GGML_SYCL_MAX_DEVICES];
+                ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
+                for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
+                    int device_id = id_list[i];
+                    ggml_backend_t backend = ggml_backend_sycl_init(i);
+                    if (backend == nullptr) {
+                        LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
+                        llama_free(ctx);
+                        return nullptr;
+                    }
+                    ctx->backends.push_back(backend);
+                }
             }
-            ctx->backends.push_back(backend);
         }
 #elif defined(GGML_USE_KOMPUTE)
         if (model->n_gpu_layers > 0) {
@@ -12161,7 +12195,6 @@ struct llama_context * llama_new_context_with_model(
         ggml_set_name(ctx->inp_cls, "inp_cls");
 
         ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
-
         LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
             ggml_backend_buffer_name(ctx->buf_input),
             ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
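To make the new LLAMA_SPLIT_LAYER branch easier to read on its own, here is a self-contained sketch of the same enumerate-and-initialize pattern, using only the SYCL backend calls that appear in the hunk above (`ggml_sycl_get_gpu_list`, `ggml_backend_sycl_get_device_count`, `ggml_backend_sycl_init`). The standalone `main()`, the includes, and the cleanup loop are illustrative assumptions, not code from the commit:

```cpp
// Sketch of the per-GPU backend setup performed by the LLAMA_SPLIT_LAYER path
// in llama_new_context_with_model. Assumes a SYCL-enabled build where
// "ggml-sycl.h" declares the functions and GGML_SYCL_MAX_DEVICES used below.
#include <cstdio>
#include <vector>

#include "ggml-backend.h"
#include "ggml-sycl.h"

int main() {
    // map backend indices (0..n-1) to the underlying SYCL device ids
    int id_list[GGML_SYCL_MAX_DEVICES];
    ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);

    std::vector<ggml_backend_t> backends;
    for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
        const int device_id = id_list[i];
        // note: initialization uses the index i, not the raw device id
        ggml_backend_t backend = ggml_backend_sycl_init(i);
        if (backend == nullptr) {
            std::fprintf(stderr, "failed to initialize SYCL%d (index %d) backend\n", device_id, i);
            return 1;
        }
        backends.push_back(backend);
    }

    std::printf("initialized %zu SYCL backend(s)\n", backends.size());

    for (ggml_backend_t backend : backends) {
        ggml_backend_free(backend);
    }
    return 0;
}
```

In the actual change, each successfully initialized backend is pushed into ctx->backends rather than a local vector, and an initialization failure also calls llama_free(ctx) before returning.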