author     Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>  2024-01-28 21:26:23 +0530
committer  GitHub <noreply@github.com>  2024-01-28 17:56:23 +0200
commit     0f648573dde61c510560f68244f70ece7e60d8c1 (patch)
tree       fc8d92d8b588b1a668b48b8a66ff15e5898d0828 /llama.cpp
parent     b764b8f1d079ba44d912801ce6d29bd0d94d51cf (diff)
ggml : add unified SYCL backend for Intel GPUs (#2690)
* first update for migration
* update init_cublas
* add debug function, commit all help code
* step 1
* step 2
* step3 add fp16, slower 31->28
* add GGML_LIST_DEVICE function
* step 5 format device and print
* step6, enhance error check, remove CUDA macro, enhance device id to fix none-zero id issue
* support main device is non-zero
* step7 add debug for code path, rm log
* step 8, rename all macro & func from cuda by sycl
* fix error of select non-zero device, format device list
* ren ggml-sycl.hpp -> ggml-sycl.h
* clear CMAKE to rm unused lib and options
* correct queue: rm dtct:get_queue
* add print tensor function to debug
* fix error: wrong result in 658746bb26702e50f2c59c0e4ada8e9da6010481
* summary dpct definition in one header file to replace folder:dpct
* refactor device log
* mv dpct definition from folder dpct to ggml-sycl.h
* update readme, refactor build script
* fix build with sycl
* set nthread=1 when sycl, increase performance
* add run script, comment debug code
* add ls-sycl-device tool
* add ls-sycl-device, rm unused files
* rm rear space
* dos2unix
* Update README_sycl.md
* fix return type
* remove sycl version from include path
* restore rm code to fix hang issue
* add syc and link for sycl readme
* rm original sycl code before refactor
* fix code err
* add known issue for pvc hang issue
* enable SYCL_F16 support
* align pr4766
* check for sycl blas, better performance
* cleanup 1
* remove extra endif
* add build&run script, clean CMakefile, update guide by review comments
* rename macro to intel hardware
* editor config format
* format fixes
* format fixes
* editor format fix
* Remove unused headers
* skip build sycl tool for other code path
* replace tab by space
* fix blas matmul function
* fix mac build
* restore hip dependency
* fix conflict
* ren as review comments
* mv internal function to .cpp file
* export function print_sycl_devices(), mv class dpct definition to source file
* update CI/action for sycl code, fix CI error of repeat/dup
* fix action ID format issue
* rm unused strategy
* enable llama_f16 in ci
* fix conflict
* fix build break on MacOS, due to CI of MacOS depend on external ggml, instead of internal ggml
* fix ci cases for unsupported data type
* revert unrelated changes in cuda cmake, remove useless nommq, fix typo of GGML_USE_CLBLAS_SYCL
* revert hip cmake changes
* fix indent
* add prefix in func name
* revert no mmq
* rm cpu blas duplicate
* fix no_new_line
* fix src1->type==F16 bug
* pass batch offset for F16 src1
* fix batch error
* fix wrong code
* revert sycl checking in test-sampling
* pass void as arguments of ggml_backend_sycl_print_sycl_devices
* remove extra blank line in test-sampling
* revert setting n_threads in sycl
* implement std::isinf for icpx with fast math
* Update ci/run.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update examples/sycl/run-llama2.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update examples/sycl/run-llama2.sh
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update CMakeLists.txt
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* add copyright and MIT license declare
* update the cmd example
---------
Co-authored-by: jianyuzh <jianyu.zhang@intel.com>
Co-authored-by: luoyu-intel <yu.luo@intel.com>
Co-authored-by: Meng, Hengyu <hengyu.meng@intel.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
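For reference, a minimal sketch of the kind of device-listing utility this commit adds (the ls-sycl-device tool), assuming only the ggml_backend_sycl_print_sycl_devices(void) helper named in the log above:

// list the SYCL devices visible to the backend, as ls-sycl-device does
// (assumes ggml-sycl.h declares ggml_backend_sycl_print_sycl_devices(void))
#include "ggml-sycl.h"

int main() {
    ggml_backend_sycl_print_sycl_devices();
    return 0;
}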
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  | 16
1 file changed, 16 insertions(+), 0 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index 45eeadbe..09d799f7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11,6 +11,8 @@
# include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
# include "ggml-opencl.h"
+#elif defined(GGML_USE_SYCL)
+# include "ggml-sycl.h"
#endif

#ifdef GGML_USE_METAL
@@ -1278,6 +1280,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
    if (host_buffer) {
        buft = ggml_backend_cuda_host_buffer_type();
    }
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_host_buffer_type();
#elif defined(GGML_USE_CPU_HBM)
buft = ggml_backend_cpu_hbm_buffer_type();
#endif
@@ -1297,6 +1301,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
    buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUBLAS)
    buft = ggml_backend_cuda_buffer_type(gpu);
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_buffer_type(gpu);
#elif defined(GGML_USE_CLBLAST)
    buft = ggml_backend_opencl_buffer_type();
#endif
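The buffer type selected here plugs into the generic ggml-backend allocator. A minimal sketch, assuming the ggml_backend_buft_alloc_buffer() API of this revision; the helper name is hypothetical:

// allocate device memory through the SYCL buffer type introduced above
// (sketch only; alloc_sycl_buffer is a hypothetical wrapper, not part of the patch)
#include <cstddef>
#include "ggml-backend.h"
#include "ggml-sycl.h"

static ggml_backend_buffer_t alloc_sycl_buffer(int gpu, size_t n_bytes) {
    // buffer type for the given SYCL device, as in llama_default_buffer_type_offload()
    ggml_backend_buffer_type_t buft = ggml_backend_sycl_buffer_type(gpu);
    // generic ggml-backend allocation; release later with ggml_backend_buffer_free()
    return ggml_backend_buft_alloc_buffer(buft, n_bytes);
}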
@@ -10225,6 +10231,16 @@ struct llama_context * llama_new_context_with_model(
            }
        }
    }
+#elif defined(GGML_USE_SYCL)
+    if (model->n_gpu_layers > 0) {
+        ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+        if (backend == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
+            llama_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.push_back(backend);
+    }
#endif
    ctx->backend_cpu = ggml_backend_cpu_init();
    if (ctx->backend_cpu == nullptr) {
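Outside llama.cpp the backend can be brought up and torn down directly. A minimal sketch mirroring the initialization path added above, assuming ggml_backend_sycl_init(int device) and ggml_backend_free() as in this revision:

// stand-alone SYCL backend init/teardown, mirroring llama_new_context_with_model()
#include <cstdio>
#include "ggml-backend.h"
#include "ggml-sycl.h"

int main() {
    // bring up the SYCL backend on device 0 (llama.cpp uses model->main_gpu here)
    ggml_backend_t backend = ggml_backend_sycl_init(0);
    if (backend == nullptr) {
        fprintf(stderr, "failed to initialize SYCL backend\n");
        return 1;
    }
    // ... build tensors / graphs and compute on the backend here ...
    ggml_backend_free(backend);
    return 0;
}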