diff options
author | Kawrakow <48489457+ikawrakow@users.noreply.github.com> | 2024-07-27 07:55:01 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-07-27 07:55:01 +0200 |
commit | 154e0d75fccf1784fe9ff6fd76a630b66563da3d (patch) | |
tree | 81ce6dbb5b1900c1aa78a879f0593c694cab9d27 /ggml/src/ggml-sycl | |
parent | 0684c3e9c70d49323b4fc517128cbe222cab7f96 (diff) |
Merge mainline llama.cpp (#3)
* Merging mainline - WIP
* Merging mainline - WIP
AVX2 and CUDA appear to work.
CUDA performance seems slightly (~1-2%) lower as it is so often
the case with llama.cpp/ggml after some "improvements" have been made.
* Merging mainline - fix Metal
* Remove check
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src/ggml-sycl')
-rw-r--r-- | ggml/src/ggml-sycl/backend.hpp | 27 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/common.cpp | 53 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/common.hpp | 355 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/concat.cpp | 195 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/concat.hpp | 21 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/convert.cpp | 547 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/convert.hpp | 27 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/dequantize.hpp | 698 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/dmmv.cpp | 1023 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/dmmv.hpp | 27 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/dpct/helper.hpp | 3011 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/mmq.cpp | 3031 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/mmq.hpp | 33 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/mmvq.cpp | 1027 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/mmvq.hpp | 27 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/norm.cpp | 374 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/norm.hpp | 35 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/presets.hpp | 66 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/rope.cpp | 275 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/rope.hpp | 22 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/softmax.cpp | 251 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/softmax.hpp | 24 | ||||
-rw-r--r-- | ggml/src/ggml-sycl/vecdotq.hpp | 1140 |
23 files changed, 12289 insertions, 0 deletions
diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp new file mode 100644 index 00000000..067181de --- /dev/null +++ b/ggml/src/ggml-sycl/backend.hpp @@ -0,0 +1,27 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_BACKEND_HPP +#define GGML_SYCL_BACKEND_HPP + +#include "concat.hpp" +#include "common.hpp" +#include "convert.hpp" +#include "dequantize.hpp" +#include "dmmv.hpp" +#include "mmq.hpp" +#include "mmvq.hpp" +#include "rope.hpp" +#include "norm.hpp" +#include "softmax.hpp" + +#endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp new file mode 100644 index 00000000..e878f4f5 --- /dev/null +++ b/ggml/src/ggml-sycl/common.cpp @@ -0,0 +1,53 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#include "common.hpp" + +int get_current_device_id() { + return dpct::dev_mgr::instance().current_device_id(); +} + +void* ggml_sycl_host_malloc(size_t size) try { + if (getenv("GGML_SYCL_NO_PINNED") != nullptr) { + return nullptr; + } + + void* ptr = nullptr; + // allow to use dpct::get_in_order_queue() for host malloc + dpct::err0 err = CHECK_TRY_ERROR( + ptr = (void*)sycl::malloc_host(size, dpct::get_in_order_queue())); + + if (err != 0) { + // clear the error + fprintf( + stderr, + "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", + size / 1024.0 / 1024.0, + "syclGetErrorString is not supported"); + return nullptr; + } + + return ptr; +} catch (sycl::exception const& exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_sycl_host_free(void* ptr) try { + // allow to use dpct::get_in_order_queue() for host malloc + SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, dpct::get_in_order_queue()))); +} catch (sycl::exception const& exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp new file mode 100644 index 00000000..397bd98d --- /dev/null +++ b/ggml/src/ggml-sycl/common.hpp @@ -0,0 +1,355 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_COMMON_HPP +#define GGML_SYCL_COMMON_HPP + +#include <fstream> +#include <iostream> + +#include "dpct/helper.hpp" +#include "ggml-sycl.h" +#include "presets.hpp" + +#define GGML_COMMON_DECL_SYCL +#define GGML_COMMON_IMPL_SYCL +#include "ggml-common.h" + +void* ggml_sycl_host_malloc(size_t size); +void ggml_sycl_host_free(void* ptr); + +static int g_ggml_sycl_debug = 0; +#define GGML_SYCL_DEBUG(...) \ + do { \ + if (g_ggml_sycl_debug) \ + fprintf(stderr, __VA_ARGS__); \ + } while (0) + +#define CHECK_TRY_ERROR(expr) \ + [&]() { \ + try { \ + expr; \ + return dpct::success; \ + } catch (std::exception const& e) { \ + std::cerr << e.what() << "\nException caught at file:" << __FILE__ \ + << ", line:" << __LINE__ << ", func:" << __func__ \ + << std::endl; \ + return dpct::default_error; \ + } \ + }() + + +#define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP +#define VER_4VEC 610 // todo for hardward optimize. +#define VER_GEN9 700 // todo for hardward optimize. +#define VER_GEN12 1000000 // todo for hardward optimize. +#define VER_GEN13 (VER_GEN12 + 1030) // todo for hardward optimize. + +#define GGML_SYCL_MAX_NODES 8192 // TODO: adapt to hardwares + +// define for XMX in Intel GPU +// TODO: currently, it's not used for XMX really. +#if !defined(GGML_SYCL_FORCE_MMQ) + #define SYCL_USE_XMX +#endif + +// max batch size to use MMQ kernels when tensor cores are available +#define MMQ_MAX_BATCH_SIZE 32 + +#if defined(_MSC_VER) +#pragma warning(disable : 4244 4267) // possible loss of data +#endif + +// dmmv = dequantize_mul_mat_vec +#ifndef GGML_SYCL_DMMV_X +#define GGML_SYCL_DMMV_X 32 +#endif +#ifndef GGML_SYCL_MMV_Y +#define GGML_SYCL_MMV_Y 1 +#endif + +typedef sycl::queue *queue_ptr; + +enum ggml_sycl_backend_gpu_mode { + SYCL_UNSET_GPU_MODE = -1, + SYCL_SINGLE_GPU_MODE = 0, + SYCL_MUL_GPU_MODE +}; + +static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size"); + +static void crash() { + int* ptr = NULL; + *ptr = 0; +} + +[[noreturn]] static void ggml_sycl_error( + const char* stmt, + const char* func, + const char* file, + const int line, + const char* msg) { + fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg); + fprintf(stderr, " in function %s at %s:%d\n", func, file, line); + GGML_ASSERT(!"SYCL error"); +} + +#define SYCL_CHECK(err) \ + do { \ + auto err_ = (err); \ + if (err_ != 0) \ + ggml_sycl_error( \ + #err, \ + __func__, \ + __FILE__, \ + __LINE__, \ + "Meet error in this line code!"); \ + } while (0) + +#if DPCT_COMPAT_RT_VERSION >= 11100 +#define GGML_SYCL_ASSUME(x) __builtin_assume(x) +#else +#define GGML_SYCL_ASSUME(x) +#endif // DPCT_COMPAT_RT_VERSION >= 11100 + +#ifdef GGML_SYCL_F16 +typedef sycl::half dfloat; // dequantize float +typedef sycl::half2 dfloat2; +#else +typedef float dfloat; // dequantize float +typedef sycl::float2 dfloat2; +#endif // GGML_SYCL_F16 + +#define MMVQ_MAX_BATCH_SIZE 8 + +static const int8_t kvalues_iq4nl[16]={-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; + +static int g_all_sycl_device_count = -1; +static bool g_ggml_backend_sycl_buffer_type_initialized = false; + +static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = + SYCL_UNSET_GPU_MODE; + +static void* g_scratch_buffer = nullptr; +static size_t g_scratch_size = 0; // disabled by default +static size_t g_scratch_offset = 0; + +[[noreturn]] static inline void bad_arch(const sycl::stream& stream_ct1) { + stream_ct1 << "ERROR: ggml-sycl was compiled without support for the " + "current GPU architecture.\n"; + // __trap(); + std::exit(1); + + (void)bad_arch; // suppress unused function warning +} + +int get_current_device_id(); + +inline dpct::err0 ggml_sycl_set_device(const int device) try { + + int current_device_id; + SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id())); + + // GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d, + // current_device_id=%d\n", device, current_device); + if (device == current_device_id) { + return 0; + } + + return CHECK_TRY_ERROR(dpct::select_device(device)); +} catch (sycl::exception const& exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + crash(); + std::exit(1); +} + +////////////////////// + +struct ggml_sycl_device_info { + int device_count; + + struct sycl_device_info { + int cc; // compute capability + // int nsm; // number of streaming multiprocessors + // size_t smpb; // max. shared memory per block + bool vmm; // virtual memory support + size_t total_vram; + }; + + sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {}; + + std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {}; + + int max_work_group_sizes[GGML_SYCL_MAX_DEVICES] = {0}; +}; + +const ggml_sycl_device_info & ggml_sycl_info(); + +struct ggml_sycl_pool { + virtual ~ggml_sycl_pool() = default; + + virtual void * alloc(size_t size, size_t * actual_size) = 0; + virtual void free(void * ptr, size_t size) = 0; +}; + +template<typename T> +struct ggml_sycl_pool_alloc { + ggml_sycl_pool * pool = nullptr; + T * ptr = nullptr; + size_t actual_size = 0; + + explicit ggml_sycl_pool_alloc(ggml_sycl_pool & pool) : pool(&pool) { + } + + ggml_sycl_pool_alloc(ggml_sycl_pool & pool, size_t size) : pool(&pool) { + alloc(size); + } + + ~ggml_sycl_pool_alloc() { + if (ptr != nullptr) { + pool->free(ptr, actual_size); + } + } + + // size is in number of elements + T * alloc(size_t size) { + GGML_ASSERT(pool != nullptr); + GGML_ASSERT(ptr == nullptr); + ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size); + return ptr; + } + + T * alloc(ggml_sycl_pool & pool, size_t size) { + this->pool = &pool; + return alloc(size); + } + + T * get() { + return ptr; + } + + ggml_sycl_pool_alloc() = default; + ggml_sycl_pool_alloc(const ggml_sycl_pool_alloc &) = delete; + ggml_sycl_pool_alloc(ggml_sycl_pool_alloc &&) = delete; + ggml_sycl_pool_alloc& operator=(const ggml_sycl_pool_alloc &) = delete; + ggml_sycl_pool_alloc& operator=(ggml_sycl_pool_alloc &&) = delete; +}; + +// backend interface + +struct ggml_tensor_extra_gpu { + void* data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split + // tensors + dpct::event_ptr events[GGML_SYCL_MAX_DEVICES] + [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs +}; + +struct ggml_backend_sycl_context { + int device; + std::string name; + + queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } }; + + explicit ggml_backend_sycl_context(int device) : + device(device), + name(GGML_SYCL_NAME + std::to_string(device)) { + } + + queue_ptr stream(int device, int stream) { + if (qptrs[device][stream] == nullptr) { + qptrs[device][stream] = &(dpct::get_device(device).default_queue()); + } + return qptrs[device][stream]; + } + + queue_ptr stream() { + return stream(device, 0); + } + + // pool + std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES]; + + static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device); + + ggml_sycl_pool & pool(int device) { + if (pools[device] == nullptr) { + pools[device] = new_pool_for_device(stream(device,0), device); + } + return *pools[device]; + } + + ggml_sycl_pool & pool() { + return pool(device); + } +}; + +// common device functions + +static __dpct_inline__ float warp_reduce_sum(float x, + const sycl::nd_item<3>& item_ct1) { +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + /* + DPCT1096:98: The right-most dimension of the work-group used in the SYCL + kernel that calls this function may be less than "32". The function + "dpct::permute_sub_group_by_xor" may return an unexpected result on the + CPU device. Modify the size of the work-group to ensure that the value + of the right-most dimension is a multiple of "32". + */ + x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask); + } + return x; +} + +static __dpct_inline__ sycl::float2 +warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3>& item_ct1) { +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(), + mask); + a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(), + mask); + } + return a; +} + +static __dpct_inline__ float warp_reduce_max(float x, + const sycl::nd_item<3>& item_ct1) { +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + /* + DPCT1096:97: The right-most dimension of the work-group used in the SYCL + kernel that calls this function may be less than "32". The function + "dpct::permute_sub_group_by_xor" may return an unexpected result on the + CPU device. Modify the size of the work-group to ensure that the value + of the right-most dimension is a multiple of "32". + */ + x = sycl::fmax(x, dpct::permute_sub_group_by_xor( + item_ct1.get_sub_group(), x, mask)); + } + return x; +} + +// Helper for vec loading aligned data +template <typename Tp, int n> +inline sycl::vec<Tp, n> vec_aligned_load(const Tp* aligned_ptr) { + return *reinterpret_cast<const sycl::vec<Tp, n>*>(aligned_ptr); +} + +// Helper for accessing pointers with no warnings +template <typename Tp, int dim> +static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) { + return acc.template get_multi_ptr<sycl::access::decorated::no>().get(); +} + +#endif // GGML_SYCL_COMMON_HPP diff --git a/ggml/src/ggml-sycl/concat.cpp b/ggml/src/ggml-sycl/concat.cpp new file mode 100644 index 00000000..632eedb9 --- /dev/null +++ b/ggml/src/ggml-sycl/concat.cpp @@ -0,0 +1,195 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#include "concat.hpp" +#include "common.hpp" + +static void concat_f32_dim0(const float *x, const float *y, float *dst, + const int ne0, const int ne00, + const sycl::nd_item<3> &item_ct1) { + int nidx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (nidx >= ne0) { + return; + } + // operation + int offset_dst = nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + if (nidx < ne00) { // src0 + int offset_src = nidx + item_ct1.get_group(1) * ne00 + + item_ct1.get_group(0) * ne00 * item_ct1.get_group_range(1); + dst[offset_dst] = x[offset_src]; + } else { + int offset_src = + nidx - ne00 + item_ct1.get_group(1) * (ne0 - ne00) + + item_ct1.get_group(0) * (ne0 - ne00) * item_ct1.get_group_range(1); + dst[offset_dst] = y[offset_src]; + } +} + +static void concat_f32_dim1(const float *x, const float *y, float *dst, + const int ne0, const int ne01, + const sycl::nd_item<3> &item_ct1) { + int nidx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (nidx >= ne0) { + return; + } + // operation + int offset_dst = nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + if (item_ct1.get_group(1) < ne01) { // src0 + int offset_src = + nidx + item_ct1.get_group(1) * ne0 + item_ct1.get_group(0) * ne0 * ne01; + dst[offset_dst] = x[offset_src]; + } else { + int offset_src = + nidx + (item_ct1.get_group(1) - ne01) * ne0 + + item_ct1.get_group(0) * ne0 * (item_ct1.get_group_range(1) - ne01); + dst[offset_dst] = y[offset_src]; + } +} + +static void concat_f32_dim2(const float *x, const float *y, float *dst, + const int ne0, const int ne02, + const sycl::nd_item<3> &item_ct1) { + int nidx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (nidx >= ne0) { + return; + } + // operation + int offset_dst = nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + if (item_ct1.get_group(0) < ne02) { // src0 + int offset_src = nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + dst[offset_dst] = x[offset_src]; + } else { + int offset_src = + nidx + item_ct1.get_group(1) * ne0 + + (item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1); + dst[offset_dst] = y[offset_src]; + } +} + +static void concat_f32_sycl(const float *x, const float *y, float *dst, + int ne00, int ne01, int ne02, int ne0, int ne1, + int ne2, int dim, queue_ptr stream) { + int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE; + sycl::range<3> gridDim(ne2, ne1, num_blocks); + switch (dim) { + case 0: + stream->parallel_for( + sycl::nd_range<3>(gridDim * + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1); + }); + break; + case 1: + stream->parallel_for( + sycl::nd_range<3>(gridDim * + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1); + }); + break; + default: + stream->parallel_for( + sycl::nd_range<3>(gridDim * + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1); + }); + break; + } +} + +// non-contiguous kernel (slow) +static void concat_f32_sycl_non_cont( + queue_ptr stream, const char *src0, const char *src1, char *dst, + int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, uint64_t nb00, + uint64_t nb01, uint64_t nb02, uint64_t nb03, int64_t /*ne10*/, + int64_t /*ne11*/, int64_t /*ne12*/, int64_t /*ne13*/, uint64_t nb10, + uint64_t nb11, uint64_t nb12, uint64_t nb13, int64_t ne0, int64_t ne1, + int64_t ne2, int64_t ne3, uint64_t nb0, uint64_t nb1, uint64_t nb2, + uint64_t nb3, int32_t dim) { + sycl::range<3> gridDim(ne3, ne2, ne1); + stream->parallel_for( + sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + int64_t i3 = item_ct1.get_group(0); + int64_t i2 = item_ct1.get_group(1); + int64_t i1 = item_ct1.get_group(2); + + int64_t o[4] = {0, 0, 0, 0}; + o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03)); + + const float *x; + + for (int i0 = item_ct1.get_local_id(2); i0 < ne0; + i0 += item_ct1.get_local_range(2)) { + if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { + x = (const float *)(src0 + (i3)*nb03 + (i2)*nb02 + (i1)*nb01 + + (i0)*nb00); + } else { + x = (const float *)(src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 + + (i1 - o[1]) * nb11 + (i0 - o[0]) * nb10); + } + + float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0); + + *y = *x; + } + }); +} + +void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst) { + queue_ptr stream = ctx.stream(); + + const int32_t dim = ((int32_t *)dst->op_params)[0]; + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { + const float *src0_d = (const float *)src0->data; + const float *src1_d = (const float *)src1->data; + + float *dst_d = (float *)dst->data; + + if (dim != 3) { + for (int i3 = 0; i3 < dst->ne[3]; i3++) { + concat_f32_sycl( + src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4), + dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1], + src0->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], dim, stream); + } + } else { + const size_t size0 = ggml_nbytes(src0); + const size_t size1 = ggml_nbytes(src1); + + SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait())); + SYCL_CHECK(CHECK_TRY_ERROR( + stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait())); + } + } else + concat_f32_sycl_non_cont( + stream, (const char *)src0->data, (const char *)src1->data, + (char *)dst->data, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src1->ne[0], + src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1], + src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], + dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim); +} diff --git a/ggml/src/ggml-sycl/concat.hpp b/ggml/src/ggml-sycl/concat.hpp new file mode 100644 index 00000000..5a04feaa --- /dev/null +++ b/ggml/src/ggml-sycl/concat.hpp @@ -0,0 +1,21 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_CONCAT_HPP +#define GGML_SYCL_CONCAT_HPP + +#include "common.hpp" + +void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst); + +#endif // GGML_SYCL_CONCAT_HPP diff --git a/ggml/src/ggml-sycl/convert.cpp b/ggml/src/ggml-sycl/convert.cpp new file mode 100644 index 00000000..39c28753 --- /dev/null +++ b/ggml/src/ggml-sycl/convert.cpp @@ -0,0 +1,547 @@ +#include "convert.hpp" +#include "dequantize.hpp" +#include "presets.hpp" + +template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t> +static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = 2 * (item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2)); + + if (i >= k) { + return; + } + + const int ib = i/qk; // block index + const int iqs = (i%qk)/qr; // quant index + const int iybs = i - i%qk; // y block start index + const int y_offset = qr == 1 ? 1 : qk/2; + + // dequantize + dfloat2 v; + dequantize_kernel(vx, ib, iqs, v); + + y[iybs + iqs + 0] = v.x(); + y[iybs + iqs + y_offset] = v.y(); +} + +template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t> +static void dequantize_block_sycl(const void *__restrict__ vx, + dst_t *__restrict__ y, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + 2*SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / (2*SYCL_DEQUANTIZE_BLOCK_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>( + sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1); + }); + } +} + +template <typename dst_t> +static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; +#if QK_K == 256 + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 64), + sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q2_K(vx, y, item_ct1); + }); + } +#else + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q2_K(vx, y, item_ct1); + }); + } + +#endif +} + +template <typename dst_t> +static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; +#if QK_K == 256 + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 64), + sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q3_K(vx, y, item_ct1); + }); + } +#else + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q3_K(vx, y, item_ct1); + }); + } +#endif +} + +template <typename dst_t> +static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb32 = k / 32; + const int nb = (k + 255) / 256; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q4_0(vx, y, nb32, item_ct1); + }); + } +} + +template <typename dst_t> +static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb32 = k / 32; + const int nb = (k + 255) / 256; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q4_1(vx, y, nb32, item_ct1); + }); + } +} + + +template <typename dst_t> +static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh); + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1); + }); + }); + } +} + +template <typename dst_t> +static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; +#if QK_K == 256 + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 64), + sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q5_K(vx, y, item_ct1); + }); + } +#else + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q5_K(vx, y, item_ct1); + }); + } + +#endif +} + +template <typename dst_t> +static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; +#if QK_K == 256 + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 64), + sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q6_K(vx, y, item_ct1); + }); + } +#else + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q6_K(vx, y, item_ct1); + }); + } + +#endif +} + +template <typename dst_t> +static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq1_s( + vx, y, item_ct1, iq1s_grid_gpu + ); + }); + }); + } +} + +template <typename dst_t> +static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq1_m( + vx, y, item_ct1, iq1s_grid_gpu + ); + }); + }); + } +} + +template <typename dst_t> +static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq2_xxs( + vx, y, item_ct1, iq2xxs_grid, + ksigns_iq2xs, kmask_iq2xs); + }); + }); + } +} + +template <typename dst_t> +static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq2_xs( + vx, y, item_ct1, iq2xs_grid, + ksigns_iq2xs, kmask_iq2xs); + }); + }); + } +} + +template <typename dst_t> +static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq2_s(vx, y, item_ct1); + }); + }); + } +} + + +template <typename dst_t> +static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq3_xxs( + vx, y, item_ct1, iq3xxs_grid, + ksigns_iq2xs, kmask_iq2xs); + }); + }); + } +} + +template <typename dst_t> +static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq3_s( + vx, y, item_ct1, kmask_iq2xs, iq3s_grid); + }); + }); + } +} + +template <typename dst_t> +static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = (k + QK_K - 1) / QK_K; +#if QK_K == 64 + dequantize_row_iq4_nl_sycl(vx, y, k, stream); +#else + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq4_xs(vx, y, item_ct1); + }); + }); + } +#endif +} + +template <typename dst_t> +static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = (k + QK_K - 1) / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq4_nl(vx, y, item_ct1); + }); + }); + } +} + +template <typename src_t, typename dst_t> +static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + + const src_t * x = (src_t *) vx; + + y[i] = x[i]; +} + +template <typename src_t, typename dst_t> +static void convert_unary_sycl(const void *__restrict__ vx, + dst_t *__restrict__ y, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / SYCL_DEQUANTIZE_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>( + sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + convert_unary<src_t>(vx, y, k, item_ct1); + }); + } +} + +to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>; + case GGML_TYPE_Q4_1: + return dequantize_block_sycl<QK4_1, QR4_1, dequantize_q4_1>; + case GGML_TYPE_Q5_0: + return dequantize_block_sycl<QK5_0, QR5_0, dequantize_q5_0>; + case GGML_TYPE_Q5_1: + return dequantize_block_sycl<QK5_1, QR5_1, dequantize_q5_1>; + case GGML_TYPE_Q8_0: + return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>; + case GGML_TYPE_Q2_K: + return dequantize_row_q2_K_sycl; + case GGML_TYPE_Q3_K: + return dequantize_row_q3_K_sycl; + case GGML_TYPE_Q4_K: + return dequantize_row_q4_K_sycl; + case GGML_TYPE_Q5_K: + return dequantize_row_q5_K_sycl; + case GGML_TYPE_Q6_K: + return dequantize_row_q6_K_sycl; + case GGML_TYPE_IQ1_S: + return dequantize_row_iq1_s_sycl; + case GGML_TYPE_IQ1_M: + return dequantize_row_iq1_m_sycl; + case GGML_TYPE_IQ2_XXS: + return dequantize_row_iq2_xxs_sycl; + case GGML_TYPE_IQ2_XS: + return dequantize_row_iq2_xs_sycl; + case GGML_TYPE_IQ2_S: + return dequantize_row_iq2_s_sycl; + case GGML_TYPE_IQ3_XXS: + return dequantize_row_iq3_xxs_sycl; + case GGML_TYPE_IQ3_S: + return dequantize_row_iq3_s_sycl; + case GGML_TYPE_IQ4_XS: + return dequantize_row_iq4_xs_sycl; + case GGML_TYPE_IQ4_NL: + return dequantize_row_iq4_nl_sycl; + case GGML_TYPE_F32: + return convert_unary_sycl<float>; + default: + return nullptr; + } +} + +to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return dequantize_row_q4_0_sycl; + case GGML_TYPE_Q4_1: + return dequantize_row_q4_1_sycl; + case GGML_TYPE_Q5_0: + return dequantize_block_sycl<QK5_0, QR5_0, dequantize_q5_0>; + case GGML_TYPE_Q5_1: + return dequantize_block_sycl<QK5_1, QR5_1, dequantize_q5_1>; + case GGML_TYPE_Q8_0: + return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>; + case GGML_TYPE_Q2_K: + return dequantize_row_q2_K_sycl; + case GGML_TYPE_Q3_K: + return dequantize_row_q3_K_sycl; + case GGML_TYPE_Q4_K: + return dequantize_row_q4_K_sycl; + case GGML_TYPE_Q5_K: + return dequantize_row_q5_K_sycl; + case GGML_TYPE_Q6_K: + return dequantize_row_q6_K_sycl; + case GGML_TYPE_IQ1_S: + return dequantize_row_iq1_s_sycl; + case GGML_TYPE_IQ1_M: + return dequantize_row_iq1_m_sycl; + case GGML_TYPE_IQ2_XXS: + return dequantize_row_iq2_xxs_sycl; + case GGML_TYPE_IQ2_XS: + return dequantize_row_iq2_xs_sycl; + case GGML_TYPE_IQ2_S: + return dequantize_row_iq2_s_sycl; + case GGML_TYPE_IQ3_XXS: + return dequantize_row_iq3_xxs_sycl; + case GGML_TYPE_IQ3_S: + return dequantize_row_iq3_s_sycl; + case GGML_TYPE_IQ4_XS: + return dequantize_row_iq4_xs_sycl; + case GGML_TYPE_IQ4_NL: + return dequantize_row_iq4_nl_sycl; + case GGML_TYPE_F16: + return convert_unary_sycl<sycl::half>; + default: + return nullptr; + } +} diff --git a/ggml/src/ggml-sycl/convert.hpp b/ggml/src/ggml-sycl/convert.hpp new file mode 100644 index 00000000..b1f10d63 --- /dev/null +++ b/ggml/src/ggml-sycl/convert.hpp @@ -0,0 +1,27 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_CONVERT_HPP +#define GGML_SYCL_CONVERT_HPP + +#include "common.hpp" + +template <typename T> +using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y, + int k, dpct::queue_ptr stream); +typedef to_t_sycl_t<float> to_fp32_sycl_t; +typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t; + +to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type); +to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type); + +#endif // GGML_SYCL_CONVERT_HPP diff --git a/ggml/src/ggml-sycl/dequantize.hpp b/ggml/src/ggml-sycl/dequantize.hpp new file mode 100644 index 00000000..ed8ad098 --- /dev/null +++ b/ggml/src/ggml-sycl/dequantize.hpp @@ -0,0 +1,698 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_DEQUANTIZE_HPP +#define GGML_SYCL_DEQUANTIZE_HPP + +#include "common.hpp" + +typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); + +static __dpct_inline__ void dequantize_q4_0(const void *vx, const int ib, + const int iqs, dfloat2 &v) { + const block_q4_0 * x = (const block_q4_0 *) vx; + + const dfloat d = x[ib].d; + + const int vui = x[ib].qs[iqs]; + + v.x() = vui & 0xF; + v.y() = vui >> 4; + +#ifdef GGML_SYCL_F16 + // v = v - {8.0f, 8.0f}; + // v = v * {d, d}; + v.s0() = (v.s0() - 8.0f) * d; + v.s1() = (v.s1() - 8.0f) * d; + +#else + v.x() = (v.x() - 8.0f) * d; + v.y() = (v.y() - 8.0f) * d; +#endif // GGML_SYCL_F16 +} + +static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib, + const int iqs, dfloat2 &v) { + const block_q4_1 * x = (const block_q4_1 *) vx; + + const dfloat d = x[ib].dm[0]; + const dfloat m = x[ib].dm[1]; + + const int vui = x[ib].qs[iqs]; + + v.x() = vui & 0xF; + v.y() = vui >> 4; + +#ifdef GGML_SYCL_F16 + // v = v * {d, d}; + // v = v + {m, m}; + v.s0() = (v.s0() * d) + m; + v.s1() = (v.s1() * d) + m; + +#else + v.x() = (v.x() * d) + m; + v.y() = (v.y() * d) + m; +#endif // GGML_SYCL_F16 +} + +static __dpct_inline__ void dequantize_q5_0(const void *vx, const int ib, + const int iqs, dfloat2 &v) { + const block_q5_0 * x = (const block_q5_0 *) vx; + + const dfloat d = x[ib].d; + + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y() = ((x[ib].qs[iqs] >> 4) | xh_1); + +#ifdef GGML_SYCL_F16 + // v = v - {16.0f, 16.0f}; + // v = v * {d, d}; + v.s0() = (v.s0() - 16.0f) * d; + v.s1() = (v.s1() - 16.0f) * d; + +#else + v.x() = (v.x() - 16.0f) * d; + v.y() = (v.y() - 16.0f) * d; +#endif // GGML_SYCL_F16 +} + +static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib, + const int iqs, dfloat2 &v) { + const block_q5_1 * x = (const block_q5_1 *) vx; + + const dfloat d = x[ib].dm[0]; + const dfloat m = x[ib].dm[1]; + + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y() = ((x[ib].qs[iqs] >> 4) | xh_1); + +#ifdef GGML_SYCL_F16 + // v = v * {d, d}; + // v = v + {m, m}; + v.s0() = (v.s0() * d) + m; + v.s1() = (v.s1() * d) + m; +#else + v.x() = (v.x() * d) + m; + v.y() = (v.y() * d) + m; +#endif // GGML_SYCL_F16 +} + +static __dpct_inline__ void dequantize_q8_0(const void *vx, const int ib, + const int iqs, dfloat2 &v) { + const block_q8_0 * x = (const block_q8_0 *) vx; + + const dfloat d = x[ib].d; + + v.x() = x[ib].qs[iqs + 0]; + v.y() = x[ib].qs[iqs + 1]; + +#ifdef GGML_SYCL_F16 + // v = v * {d, d}; + v.s0() *= d; + v.s1() *= d; +#else + v.x() *= d; + v.y() *= d; +#endif // GGML_SYCL_F16 +} + +template<typename dst_t> +static void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_group(2); + + // assume 32 threads + const int tid = item_ct1.get_local_id(2); + const int il = tid/8; + const int ir = tid%8; + const int ib = 8*i + ir; + if (ib >= nb32) { + return; + } + + dst_t * y = yy + 256*i + 32*ir + 4*il; + + const block_q4_0 * x = (const block_q4_0 *)vx + ib; + const float d = sycl::vec<sycl::half, 1>(x->d) + .convert<float, sycl::rounding_mode::automatic>()[0]; + const float dm = -8*d; + + const uint8_t * q = x->qs + 4*il; + + for (int l = 0; l < 4; ++l) { + y[l+ 0] = d * (q[l] & 0xF) + dm; + y[l+16] = d * (q[l] >> 4) + dm; + } +} + +template<typename dst_t> +static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_group(2); + + // assume 32 threads + const int tid = item_ct1.get_local_id(2); + const int il = tid/8; + const int ir = tid%8; + const int ib = 8*i + ir; + if (ib >= nb32) { + return; + } + + dst_t * y = yy + 256*i + 32*ir + 4*il; + + const block_q4_1 * x = (const block_q4_1 *)vx + ib; + const sycl::float2 d = + x->dm.convert<float, sycl::rounding_mode::automatic>(); + + const uint8_t * q = x->qs + 4*il; + + for (int l = 0; l < 4; ++l) { + y[l + 0] = d.x() * (q[l] & 0xF) + d.y(); + y[l + 16] = d.x() * (q[l] >> 4) + d.y(); + } +} + + +//================================== k-quants + +template<typename dst_t> +static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_group(2); + const block_q2_K * x = (const block_q2_K *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int n = tid/32; + const int l = tid - 32*n; + const int is = 8*n + l/16; + + const uint8_t q = x[i].qs[32*n + l]; + dst_t * y = yy + i*QK_K + 128*n; + + float dall = x[i].dm[0]; + float dmin = x[i].dm[1]; + y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); + y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); + y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); + y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); +#else + const int is = tid/16; // 0 or 1 + const int il = tid%16; // 0...15 + const uint8_t q = x[i].qs[il] >> (2*is); + dst_t * y = yy + i*QK_K + 16*is + il; + + float dall = x[i].dm[0]; + float dmin = x[i].dm[1]; + y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); + y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); +#endif + +} + +template<typename dst_t> +static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_group(2); + const block_q3_K * x = (const block_q3_K *) vx; + +#if QK_K == 256 + const int r = item_ct1.get_local_id(2) / 4; + const int tid = r/2; + const int is0 = r%2; + const int l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4); + const int n = tid / 4; + const int j = tid - 4*n; + + uint8_t m = 1 << (4*n + j); + int is = 8*n + 2*j + is0; + int shift = 2*j; + + int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) : + is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) : + is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) : + (x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4); + float d_all = x[i].d; + float dl = d_all * (us - 32); + + dst_t * y = yy + i*QK_K + 128*n + 32*j; + const uint8_t * q = x[i].qs + 32*n; + const uint8_t * hm = x[i].hmask; + + for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); +#else + const int tid = item_ct1.get_local_id(2); + const int is = tid/16; // 0 or 1 + const int il = tid%16; // 0...15 + const int im = il/8; // 0...1 + const int in = il%8; // 0...7 + + dst_t * y = yy + i*QK_K + 16*is + il; + + const uint8_t q = x[i].qs[il] >> (2*is); + const uint8_t h = x[i].hmask[in] >> (2*is + im); + const float d = (float)x[i].d; + + if (is == 0) { + y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); + y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); + } else { + y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); + y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); + } +#endif + +} + +#if QK_K == 256 +static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { + if (j < 4) { + d = q[j] & 63; + m = q[j + 4] & 63; + } else { + d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); + m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); + } +} +#endif + +template<typename dst_t> +static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + uint8_t* scales_local, const sycl::nd_item<3> &item_ct1) { + const block_q4_K * x = (const block_q4_K *) vx; + + const int i = item_ct1.get_group(2); + +#if QK_K == 256 + // assume 32 threads + const int tid = item_ct1.get_local_id(2); + const int il = tid/8; + const int ir = tid%8; + const int is = 2*il; + const int n = 4; + + dst_t * y = yy + i*QK_K + 64*il + n*ir; + + const sycl::half2 dm = x[i].dm; + const float dall = dm[0]; + const float dmin = dm[1]; + + if (tid < 12) + scales_local[tid] = x[i].scales[tid]; + item_ct1.barrier(sycl::access::fence_space::local_space); + + uint8_t sc, m; + get_scale_min_k4(is + 0, scales_local, sc, m); + const float d1 = dall * sc; + const float m1 = dmin * m; + get_scale_min_k4(is + 1, scales_local, sc, m); + const float d2 = dall * sc; + const float m2 = dmin * m; + + sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(x[i].qs + 32*il + n*ir); + for (int l = 0; l < n; ++l) { + y[l + 0] = d1 * (q_vec[l] & 0xF) - m1; + y[l +32] = d2 * (q_vec[l] >> 4) - m2; + } +#else + const int tid = item_ct1.get_local_id(2); + const uint8_t * q = x[i].qs; + dst_t * y = yy + i*QK_K; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; + y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); + y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4); +#endif +} + +template<typename dst_t> +static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + const block_q5_K * x = (const block_q5_K *) vx; + + const int i = item_ct1.get_group(2); + +#if QK_K == 256 + // assume 64 threads - this is very slightly better than the one below + const int tid = item_ct1.get_local_id(2); + const int il = tid/16; // il is in 0...3 + const int ir = tid%16; // ir is in 0...15 + const int is = 2*il; // is is in 0...6 + + dst_t * y = yy + i*QK_K + 64*il + 2*ir; + + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; + + const uint8_t * ql = x[i].qs + 32*il + 2*ir; + const uint8_t * qh = x[i].qh + 2*ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, sc, m); + const float d1 = dall * sc; const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, sc, m); + const float d2 = dall * sc; const float m2 = dmin * m; + + uint8_t hm = 1 << (2*il); + y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1; + y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1; + hm <<= 1; + y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2; + y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; +#else + const int tid = item_ct1.get_local_id(2); + const uint8_t q = x[i].qs[tid]; + const int im = tid/8; // 0...3 + const int in = tid%8; // 0...7 + const int is = tid/16; // 0 or 1 + const uint8_t h = x[i].qh[in] >> im; + const float d = x[i].d; + dst_t * y = yy + i*QK_K + tid; + y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16)); + y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16)); +#endif +} + +template<typename dst_t> +static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + const block_q6_K * x = (const block_q6_K *) vx; + + const int i = item_ct1.get_group(2); +#if QK_K == 256 + + // assume 64 threads - this is very slightly better than the one below + const int tid = item_ct1.get_local_id(2); + const int ip = tid/32; // ip is 0 or 1 + const int il = tid - 32*ip; // 0...32 + const int is = 8*ip + il/16; + + dst_t * y = yy + i*QK_K + 128*ip + il; + + const float d = x[i].d; + + const uint8_t * ql = x[i].ql + 64*ip + il; + const uint8_t qh = x[i].qh[32*ip + il]; + const int8_t * sc = x[i].scales + is; + + y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); + y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); + y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); +#else + + // assume 32 threads + const int tid = item_ct1.get_local_id(2); + const int ip = tid/16; // 0 or 1 + const int il = tid - 16*ip; // 0...15 + + dst_t * y = yy + i*QK_K + 16*ip + il; + + const float d = x[i].d; + + const uint8_t ql = x[i].ql[16*ip + il]; + const uint8_t qh = x[i].qh[il] >> (2*ip); + const int8_t * sc = x[i].scales; + + y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32); +#endif +} + +template<typename dst_t> +static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint64_t *iq2xxs_grid_ptr, + const uint8_t *ksigns_iq2xs_ptr, + const uint8_t *kmask_iq2xs_ptr) { + + const int i = item_ct1.get_group(2); + const block_iq2_xxs * x = (const block_iq2_xxs *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint16_t * q2 = x[i].qs + 4*ib; + const uint8_t * aux8 = (const uint8_t *)q2; + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid_ptr + aux8[il]); + const uint32_t aux32 = q2[2] | (q2[3] << 16); + const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f; + const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127]; + for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f); +#else + assert(false); +#endif + +} + +template<typename dst_t> +static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint64_t *iq2xs_grid, + const uint8_t *ksigns_iq2xs, + const uint8_t *kmask_iq2xs) { + + const int i = item_ct1.get_group(2); + const block_iq2_xs * x = (const block_iq2_xs *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint16_t * q2 = x[i].qs + 4*ib; + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511)); + const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; + const uint8_t signs = ksigns_iq2xs[q2[il] >> 9]; + for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); +#else + assert(false); +#endif + +} + +template <typename dst_t> +__dpct_inline__ static void +dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_group(2); + const block_iq2_s * x = (const block_iq2_s *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300))); + const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; + const uint8_t signs = x[i].qs[QK_K/8+4*ib+il]; +#pragma unroll + for (int j = 0; j < 8; ++j) + y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); +#else + assert(false); + +#endif + +} + +template<typename dst_t> +static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint32_t *iq3xxs_grid, + const uint8_t *ksigns_iq2xs, + const uint8_t *kmask_iq2xs) { + + const int i = item_ct1.get_group(2); + const block_iq3_xxs * x = (const block_iq3_xxs *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint8_t * q3 = x[i].qs + 8*ib; + const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib; + const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]); + const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]); + const uint32_t aux32 = gas[0] | (gas[1] << 16); + const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f; + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; + for (int j = 0; j < 4; ++j) { + y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); + } +#else + assert(false); +#endif + +} + +template <typename dst_t> +__dpct_inline__ static void +dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint8_t *kmask_iq2xs, const uint32_t *iq3s_grid) { + + const int i = item_ct1.get_group(2); + const block_iq3_s * x = (const block_iq3_s *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint8_t * qs = x[i].qs + 8*ib; + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256))); + const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)); + const uint8_t signs = x[i].signs[4*ib + il]; +#pragma unroll + for (int j = 0; j < 4; ++j) { + y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); + } +#else + assert(false); +#endif + +} + +template <typename dst_t> +__dpct_inline__ static void +dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint32_t *iq1s_grid_gpu) { + + const int i = item_ct1.get_group(2); + const block_iq1_s * x = (const block_iq1_s *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA; + const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1); + uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; + grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)]; + grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; + grid32[0] &= 0x0f0f0f0f; +#pragma unroll + for (int j = 0; j < 8; ++j) { + y[j] = d * (q[j] + delta); + } +#else + assert(false); +#endif + +} + +template <typename dst_t> +__dpct_inline__ static void +dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint32_t *iq1s_grid_gpu) { + + const int i = item_ct1.get_group(2); + const block_iq1_m * x = (const block_iq1_m *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint16_t * sc = (const uint16_t *)x[i].scales; + iq1m_scale_t scale; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4); + const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1); + const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA; + uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; + grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)]; + grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; + grid32[0] &= 0x0f0f0f0f; +#pragma unroll + for (int j = 0; j < 8; ++j) { + y[j] = d * (q[j] + delta); + } +#else + assert(false); +#endif + +} + +template <typename dst_t> +__dpct_inline__ static void +dequantize_block_iq4_nl(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_group(2); + const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL); + + const int tid = item_ct1.get_local_id(2); + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 4*il; + const uint8_t * q4 = x[ib].qs + 4*il; + const float d = (float)x[ib].d; +#pragma unroll + for (int j = 0; j < 4; ++j) { + y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; + y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; + } + +} + + +template <typename dst_t> +__dpct_inline__ static void +dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_group(2); + const block_iq4_xs * x = (const block_iq4_xs *)vx; + + const int tid = item_ct1.get_local_id(2); + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 4*il; + const uint8_t * q4 = x[i].qs + 16*ib + 4*il; + const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32); +#pragma unroll + for (int j = 0; j < 4; ++j) { + y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; + y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; + } +} + + +#endif // GGML_SYCL_DEQUANTIZE_HPP diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp new file mode 100644 index 00000000..70a94fc1 --- /dev/null +++ b/ggml/src/ggml-sycl/dmmv.cpp @@ -0,0 +1,1023 @@ +#include "convert.hpp" +#include "dmmv.hpp" +#include "dequantize.hpp" +#include "presets.hpp" + + +static void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const sycl::half *x = (const sycl::half *)vx; + + // automatic half -> float type cast if dfloat == float + v.x() = x[ib + iqs + 0]; + v.y() = x[ib + iqs + 1]; +} + +static void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const float * x = (const float *) vx; + + // automatic half -> float type cast if dfloat == float + v.x() = x[ib + iqs + 0]; + v.y() = x[ib + iqs + 1]; +} + +template <int qk, int qr, dequantize_kernel_t dequantize_kernel> +static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows, + const sycl::nd_item<3> &item_ct1) { + // qk = quantized weights per x block + // qr = number of quantized weights per data value in x block + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int tid = item_ct1.get_local_id(2); + + const int iter_stride = 2*GGML_SYCL_DMMV_X; + const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter + const int y_offset = qr == 1 ? 1 : qk/2; + +// partial sum for each thread +#ifdef GGML_SYCL_F16 + sycl::half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics +#else + float tmp = 0.0f; +#endif // GGML_SYCL_F16 + + for (int i = 0; i < ncols; i += iter_stride) { + const int col = i + vals_per_iter*tid; + const int ib = (row*ncols + col)/qk; // x block index + const int iqs = (col%qk)/qr; // x quant index + const int iybs = col - col%qk; // y block start index + +// processing >2 values per i iter is faster for fast GPUs +#pragma unroll + for (int j = 0; j < vals_per_iter; j += 2) { + // process 2 vals per j iter + + // dequantize + // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val + dfloat2 v; + dequantize_kernel(vx, ib, iqs + j/qr, v); + + // matrix multiplication + // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 +#ifdef GGML_SYCL_F16 + dfloat2 t1{y[iybs + iqs + j / qr + 0], + y[iybs + iqs + j / qr + y_offset]}; + + tmp += v * t1; +#else + tmp += v.x() * y[iybs + iqs + j / qr + 0]; + tmp += v.y() * y[iybs + iqs + j / qr + y_offset]; +#endif // GGML_SYCL_F16 + } + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (tid == 0) { +#ifdef GGML_SYCL_F16 + dst[row] = tmp.x() + tmp.y(); +#else + dst[row] = tmp; +#endif // GGML_SYCL_F16 + } +} + +static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols, + nrows, item_ct1); + }); + } +} + +/* +DPCT1110:4: The total declared local variable size in device function +dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register +pressure. Consult with your hardware vendor to find the total register size +available and adjust the code, or use smaller sub-group size to avoid high +register pressure. +*/ +static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q2_K * x = (const block_q2_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 16/K_QUANTS_PER_ITERATION; + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0...15 or 0...7 + + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int s_offset = 8*im; + const int y_offset = 128*im + l0; + + uint32_t aux[4]; + const uint8_t * d = (const uint8_t *)aux; + const uint8_t * m = (const uint8_t *)(aux + 2); + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; + + const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); + aux[0] = a[0] & 0x0f0f0f0f; + aux[1] = a[1] & 0x0f0f0f0f; + aux[2] = (a[0] >> 4) & 0x0f0f0f0f; + aux[3] = (a[1] >> 4) & 0x0f0f0f0f; + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) + + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) + + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) + + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) + + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) + + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) + + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) + +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); + sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] + + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; + + } + tmp += dall * sum1 - dmin * sum2; + + } +#else + const int tid = item_ct1.get_local_id(2) / + (2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7 + const int ix = item_ct1.get_local_id(2) % + (2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3 + const int offset = tid * K_QUANTS_PER_ITERATION; + + uint32_t uaux[2]; + const uint8_t * d = (const uint8_t *)uaux; + + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + offset; + const uint8_t * q = x[i].qs + offset; + const uint32_t * s = (const uint32_t *)x[i].scales; + + uaux[0] = s[0] & 0x0f0f0f0f; + uaux[1] = (s[0] >> 4) & 0x0f0f0f0f; + + const sycl::float2 dall = + x[i].dm.convert<float, sycl::rounding_mode::automatic>(); + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + const uint8_t ql = q[l]; + sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3) + + y[l+16] * d[1] * ((ql >> 2) & 3) + + y[l+32] * d[2] * ((ql >> 4) & 3) + + y[l+48] * d[3] * ((ql >> 6) & 3); + sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7]; + } + tmp += dall.x() * sum1 - dall.y() * sum2; + } + +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +/* +DPCT1110:5: The total declared local variable size in device function +dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register +pressure. Consult with your hardware vendor to find the total register size +available and adjust the code, or use smaller sub-group size to avoid high +register pressure. +*/ +static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q3_K * x = (const block_q3_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + + const uint16_t kmask1 = 0x0303; + const uint16_t kmask2 = 0x0f0f; + + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop + const int step = 16/K_QUANTS_PER_ITERATION; + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0....15 or 0...7 + + const uint8_t m = 1 << (4*im); + + const int l0 = n*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int y_offset = 128*im + l0; + + uint16_t utmp[4]; + const int8_t * s = (const int8_t *)utmp; + + const uint16_t s_shift = 4*im; + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + const uint8_t * h = x[i].hmask + l0; + + const uint16_t * a = (const uint16_t *)x[i].scales; + utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); + utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); + utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); + utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); + + const float d = x[i].d; + + float sum = 0; + for (int l = 0; l < n; ++l) { + sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) + + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) + + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) + + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); + sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) + + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) + + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) + + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + } + tmp += d * sum; + + } +#else + + const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 + const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 + const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14 + const int in = offset/8; // 0 or 1 + const int im = offset%8; // 0...7 + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + offset; + const uint8_t * q = x[i].qs + offset; + const uint8_t * s = x[i].scales; + + const float dall = (float)x[i].d; + + float sum = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + const uint8_t hl = x[i].hmask[im+l] >> in; + const uint8_t ql = q[l]; + sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4)) + + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4)) + + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4)) + + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4)); + } + tmp += sum; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +/* +DPCT1110:6: The total declared local variable size in device function +dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register +pressure. Consult with your hardware vendor to find the total register size +available and adjust the code, or use smaller sub-group size to avoid high +register pressure. +*/ +static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q4_K * x = (const block_q4_K *)vx + ib0; + +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 + + const int il = tid/step; // 0...3 + const int ir = tid - step*il; // 0...7 or 0...3 + const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + +#if K_QUANTS_PER_ITERATION == 2 + uint32_t q32[4]; + const uint8_t * q4 = (const uint8_t *)q32; +#else + uint16_t q16[4]; + const uint8_t * q4 = (const uint8_t *)q16; +#endif + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + +#if K_QUANTS_PER_ITERATION == 2 + const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset); + const uint32_t * q2 = q1 + 16; + + q32[0] = q1[0] & 0x0f0f0f0f; + q32[1] = q1[0] & 0xf0f0f0f0; + q32[2] = q2[0] & 0x0f0f0f0f; + q32[3] = q2[0] & 0xf0f0f0f0; + + sycl::float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 4; ++l) { + s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4]; + s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f + + s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) - + dmin * smin; +#else + const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset); + const uint16_t * q2 = q1 + 32; + + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[0] & 0xf0f0; + q16[2] = q2[0] & 0x0f0f; + q16[3] = q2[0] & 0xf0f0; + + float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 2; ++l) { + s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; + s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; +#endif + + } +#else + const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); + + const int step = tid * K_QUANTS_PER_ITERATION; + + uint16_t aux16[2]; + const uint8_t * s = (const uint8_t *)aux16; + + float tmp = 0; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const float * y = yy + i*QK_K + step; + const uint16_t * a = (const uint16_t *)x[i].scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) + + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2]) + + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3]) + + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]); + } + tmp += sum; + } + +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +/* +DPCT1110:7: The total declared local variable size in device function +dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register +pressure. Consult with your hardware vendor to find the total register size +available and adjust the code, or use smaller sub-group size to avoid high +register pressure. +*/ +static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, + const sycl::nd_item<3> &item_ct1) { + + const int row = item_ct1.get_group(2); + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q5_K * x = (const block_q5_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = item_ct1.get_local_id(2) / 2; // 0...15 + const int ix = item_ct1.get_local_id(2) % 2; + + const int il = tid/4; // 0...3 + const int ir = tid - 4*il;// 0...3 + const int n = 2; + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + const uint8_t hm1 = 1 << (2*im); + const uint8_t hm2 = hm1 << 4; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + uint16_t q16[8]; + const uint8_t * q4 = (const uint8_t *)q16; + + for (int i = ix; i < num_blocks_per_row; i += 2) { + + const uint8_t * ql1 = x[i].qs + q_offset; + const uint8_t * qh = x[i].qh + l0; + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + sycl::float4 sum = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + const uint16_t * q1 = (const uint16_t *)ql1; + const uint16_t * q2 = q1 + 32; + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[8] & 0x0f0f; + q16[2] = (q1[0] >> 4) & 0x0f0f; + q16[3] = (q1[8] >> 4) & 0x0f0f; + q16[4] = q2[0] & 0x0f0f; + q16[5] = q2[8] & 0x0f0f; + q16[6] = (q2[0] >> 4) & 0x0f0f; + q16[7] = (q2[8] >> 4) & 0x0f0f; + for (int l = 0; l < n; ++l) { + sum.x() += + y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) + + y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0)); + sum.y() += + y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) + + y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0)); + sum.z() += + y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) + + y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0)); + sum.w() += + y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) + + y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0)); + smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] + + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; + } + tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] + + sum.w() * sc[5]) - + dmin * smin; + } + +#else + const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); + const int step = tid * K_QUANTS_PER_ITERATION; + const int im = step/8; + const int in = step%8; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const int8_t * s = x[i].scales; + const float * y = yy + i*QK_K + step; + const float d = x[i].d; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + const uint8_t h = x[i].qh[in+j] >> im; + sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16)) + + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16)) + + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16)) + + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16)); + } + tmp += sum; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q6_K * x = (const block_q6_K *)vx + ib0; + +#if QK_K == 256 + + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1 + + const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0...15 or 0...7 + +#if K_QUANTS_PER_ITERATION == 1 + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 + const int is = 0; +#else + const int l0 = 4 * in; // 0, 4, 8, ..., 28 + const int is = in / 4; +#endif + const int ql_offset = 64*im + l0; + const int qh_offset = 32*im + l0; + const int s_offset = 8*im + is; + const int y_offset = 128*im + l0; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * ql = x[i].ql + ql_offset; + const uint8_t * qh = x[i].qh + qh_offset; + const int8_t * s = x[i].scales + s_offset; + + const float d = x[i].d; + +#if K_QUANTS_PER_ITERATION == 1 + float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) + + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) + + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) + + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) + + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) + +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); + tmp += sum; +#else + float sum = 0; + for (int l = 0; l < 4; ++l) { + sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) + + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) + + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) + + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); + } + tmp += sum; +#endif + + } + +#else + + const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7 + const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3 + + const int step = tid * K_QUANTS_PER_ITERATION; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + step; + const uint8_t * ql = x[i].ql + step; + const uint8_t * qh = x[i].qh + step; + const int8_t * s = x[i].scales; + + const float d = x[i+0].d; + + float sum = 0; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32) + + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32) + + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32) + + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32); + } + tmp += sum; + + } + +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + + +static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2 + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1); + }); +} + +static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1); + }); +} + +static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1); + }); +} + +static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1); + }); +} + +static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1); + }); +} + +void ggml_sycl_op_dequantize_mul_mat_vec( + ggml_backend_sycl_context & ctx, + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream) { + + const int64_t ne00 = src0->ne[0]; + const int64_t row_diff = row_high - row_low; + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics +#ifdef GGML_SYCL_F16 + ggml_sycl_pool_alloc<sycl::half> src1_dfloat_a(ctx.pool()); + sycl::half *src1_dfloat = nullptr; // dfloat == half + + bool src1_convert_f16 = + src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || + src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || + src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; + + if (src1_convert_f16) { + src1_dfloat = src1_dfloat_a.alloc(ne00); + const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type); + GGML_ASSERT(to_fp16_sycl != nullptr); + to_fp16_sycl(src1_ddf_i, src1_dfloat, ne00, stream); + } +#else + const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion +#endif // GGML_SYCL_F16 + + switch (src0->type) { + case GGML_TYPE_Q4_0: + dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_1: + dequantize_mul_mat_vec_q4_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_0: + dequantize_mul_mat_vec_q5_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_1: + dequantize_mul_mat_vec_q5_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q8_0: + dequantize_mul_mat_vec_q8_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q2_K: + dequantize_mul_mat_vec_q2_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q3_K: + dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_K: + dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_K: + dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q6_K: + dequantize_mul_mat_vec_q6_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_F16: + convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + default: + printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type); + GGML_ASSERT(false); + break; + } + + (void) src1; + (void) dst; + (void) src1_ddq_i; + (void) src1_ncols; + (void) src1_padded_row_size; +} diff --git a/ggml/src/ggml-sycl/dmmv.hpp b/ggml/src/ggml-sycl/dmmv.hpp new file mode 100644 index 00000000..bd837356 --- /dev/null +++ b/ggml/src/ggml-sycl/dmmv.hpp @@ -0,0 +1,27 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_DMMV_HPP +#define GGML_SYCL_DMMV_HPP + +#include "common.hpp" + + +void ggml_sycl_op_dequantize_mul_mat_vec( + ggml_backend_sycl_context & ctx, + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream); + +#endif // GGML_SYCL_DMMV_HPP diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp new file mode 100644 index 00000000..4aaa76bf --- /dev/null +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -0,0 +1,3011 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_DPCT_HELPER_HPP +#define GGML_SYCL_DPCT_HELPER_HPP + +#include <sycl/sycl.hpp> +#include <sycl/half_type.hpp> +#include <oneapi/mkl.hpp> +#include <map> + +#include "ggml.h" + +#if defined(__linux__) +#include <sys/mman.h> +#elif defined(_WIN64) +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include <windows.h> +#else +#error "Only support Windows and Linux." +#endif + +#if defined(__linux__) +#include <unistd.h> +#include <sys/syscall.h> +#endif +#if defined(_WIN64) +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include <windows.h> +#endif + +#define DPCT_COMPATIBILITY_TEMP (900) + +#if defined(_MSC_VER) +#define __dpct_align__(n) __declspec(align(n)) +#define __dpct_inline__ __forceinline +#else +#define __dpct_align__(n) __attribute__((aligned(n))) +#define __dpct_inline__ __inline__ __attribute__((always_inline)) +#endif + +#if defined(_MSC_VER) +#define __dpct_noinline__ __declspec(noinline) +#else +#define __dpct_noinline__ __attribute__((noinline)) +#endif + +inline std::string get_device_type_name(const sycl::device &Device) { + auto DeviceType = Device.get_info<sycl::info::device::device_type>(); + switch (DeviceType) { + case sycl::info::device_type::cpu: + return "cpu"; + case sycl::info::device_type::gpu: + return "gpu"; + case sycl::info::device_type::host: + return "host"; + case sycl::info::device_type::accelerator: + return "acc"; + default: + return "unknown"; + } +} + +inline std::string get_device_backend_and_type(const sycl::device &device) { + std::stringstream device_type; + sycl::backend backend = device.get_backend(); + device_type << backend << ":" << get_device_type_name(device); + return device_type.str(); +} + +namespace dpct +{ + typedef sycl::queue *queue_ptr; + typedef sycl::event *event_ptr; + typedef char *device_ptr; + typedef uint8_t byte_t; + typedef sycl::buffer<byte_t> buffer_t; + + /// SYCL default exception handler + inline auto exception_handler = [](sycl::exception_list exceptions) + { + for (std::exception_ptr const &e : exceptions) + { + try + { + std::rethrow_exception(e); + } + catch (sycl::exception const &e) + { + std::cerr << "Caught asynchronous SYCL exception:" << std::endl + << e.what() << std::endl + << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + } + } + }; + + enum error_code + { + success = 0, + default_error = 999 + }; + + enum memcpy_direction + { + host_to_host, + host_to_device, + device_to_host, + device_to_device, + automatic + }; + + enum memory_region + { + global = 0, // device global memory + constant, // device constant memory + local, // device local memory + shared, // memory which can be accessed by host and device + }; + + enum class library_data_t : unsigned char + { + real_float = 0, + complex_float, + real_double, + complex_double, + real_half, + complex_half, + real_bfloat16, + complex_bfloat16, + real_int4, + complex_int4, + real_uint4, + complex_uint4, + real_int8, + complex_int8, + real_uint8, + complex_uint8, + real_int16, + complex_int16, + real_uint16, + complex_uint16, + real_int32, + complex_int32, + real_uint32, + complex_uint32, + real_int64, + complex_int64, + real_uint64, + complex_uint64, + real_int8_4, + real_int8_32, + real_uint8_4, + library_data_t_size + }; + + template <typename T> + struct DataType + { + using T2 = T; + }; + template <typename T> + struct DataType<sycl::vec<T, 2>> + { + using T2 = std::complex<T>; + }; + + static void destroy_event(event_ptr event) + { + delete event; + } + + static inline unsigned int get_tid() + { +#if defined(__linux__) + return syscall(SYS_gettid); +#elif defined(_WIN64) + return GetCurrentThreadId(); +#else +#error "Only support Windows and Linux." +#endif + } + + namespace detail + { + static void get_version(const sycl::device &dev, int &major, int &minor) + { + // Version string has the following format: + // a. OpenCL<space><major.minor><space><vendor-specific-information> + // b. <major.minor> + // c. <AmdGcnArchName> e.g gfx1030 + std::string ver; + ver = dev.get_info<sycl::info::device::version>(); + std::string::size_type i = 0; + while (i < ver.size()) { + if (isdigit(ver[i])) + break; + i++; + } + major = std::stoi(&(ver[i])); + while (i < ver.size()) { + if (ver[i] == '.') + break; + i++; + } + if (i < ver.size()) { + // a. and b. + i++; + minor = std::stoi(&(ver[i])); + } else { + // c. + minor = 0; + } + } + + template <typename tag, typename T> + class generic_error_type + { + public: + generic_error_type() = default; + generic_error_type(T value) : value{value} {} + operator T() const { return value; } + + private: + T value; + }; + + } // namespace detail + + /// Pitched 2D/3D memory data. + class pitched_data + { + public: + pitched_data() : pitched_data(nullptr, 0, 0, 0) {} + pitched_data(void *data, size_t pitch, size_t x, size_t y) + : _data(data), _pitch(pitch), _x(x), _y(y) {} + + void *get_data_ptr() { return _data; } + void set_data_ptr(void *data) { _data = data; } + + size_t get_pitch() { return _pitch; } + void set_pitch(size_t pitch) { _pitch = pitch; } + + size_t get_x() { return _x; } + void set_x(size_t x) { _x = x; } + + size_t get_y() { return _y; } + void set_y(size_t y) { _y = y; } + + private: + void *_data; + size_t _pitch, _x, _y; + }; + + class device_info + { + public: + // get interface + const char *get_name() const { return _name; } + char *get_name() { return _name; } + template <typename WorkItemSizesTy = sycl::range<3>, + std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> || + std::is_same_v<WorkItemSizesTy, int *>, + int> = 0> + auto get_max_work_item_sizes() const + { + if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>) + return sycl::range<3>(_max_work_item_sizes_i[0], + _max_work_item_sizes_i[1], + _max_work_item_sizes_i[2]); + else + { + return _max_work_item_sizes_i; + } + } + template <typename WorkItemSizesTy = sycl::range<3>, + std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> || + std::is_same_v<WorkItemSizesTy, int *>, + int> = 0> + auto get_max_work_item_sizes() + { + if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>) + return sycl::range<3>(_max_work_item_sizes_i[0], + _max_work_item_sizes_i[1], + _max_work_item_sizes_i[2]); + else + { + return _max_work_item_sizes_i; + } + } + bool get_host_unified_memory() const { return _host_unified_memory; } + int get_major_version() const { return _major; } + int get_minor_version() const { return _minor; } + int get_integrated() const { return _integrated; } + int get_max_clock_frequency() const { return _frequency; } + int get_max_compute_units() const { return _max_compute_units; } + int get_max_work_group_size() const { return _max_work_group_size; } + int get_max_sub_group_size() const { return _max_sub_group_size; } + int get_max_work_items_per_compute_unit() const + { + return _max_work_items_per_compute_unit; + } + int get_max_register_size_per_work_group() const + { + return _max_register_size_per_work_group; + } + template <typename NDRangeSizeTy = size_t *, + std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> || + std::is_same_v<NDRangeSizeTy, int *>, + int> = 0> + auto get_max_nd_range_size() const + { + if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>) + return _max_nd_range_size; + else + return _max_nd_range_size_i; + } + template <typename NDRangeSizeTy = size_t *, + std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> || + std::is_same_v<NDRangeSizeTy, int *>, + int> = 0> + auto get_max_nd_range_size() + { + if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>) + return _max_nd_range_size; + else + return _max_nd_range_size_i; + } + size_t get_global_mem_size() const { return _global_mem_size; } + size_t get_local_mem_size() const { return _local_mem_size; } + size_t get_max_mem_alloc_size() const { return _max_mem_alloc_size; } + /// Returns the maximum clock rate of device's global memory in kHz. If + /// compiler does not support this API then returns default value 3200000 kHz. + unsigned int get_memory_clock_rate() const { return _memory_clock_rate; } + /// Returns the maximum bus width between device and memory in bits. If + /// compiler does not support this API then returns default value 64 bits. + unsigned int get_memory_bus_width() const { return _memory_bus_width; } + uint32_t get_device_id() const { return _device_id; } + std::array<unsigned char, 16> get_uuid() const { return _uuid; } + /// Returns global memory cache size in bytes. + unsigned int get_global_mem_cache_size() const + { + return _global_mem_cache_size; + } + + // set interface + void set_name(const char *name) + { + size_t length = strlen(name); + if (length < 256) + { + std::memcpy(_name, name, length + 1); + } + else + { + std::memcpy(_name, name, 255); + _name[255] = '\0'; + } + } + void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes) + { + for (int i = 0; i < 3; ++i) + _max_work_item_sizes_i[i] = max_work_item_sizes[i]; + } + [[deprecated]] void + set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes) + { + for (int i = 0; i < 3; ++i) + { + _max_work_item_sizes_i[i] = max_work_item_sizes[i]; + } + } + void set_host_unified_memory(bool host_unified_memory) + { + _host_unified_memory = host_unified_memory; + } + void set_major_version(int major) { _major = major; } + void set_minor_version(int minor) { _minor = minor; } + void set_integrated(int integrated) { _integrated = integrated; } + void set_max_clock_frequency(int frequency) { _frequency = frequency; } + void set_max_compute_units(int max_compute_units) + { + _max_compute_units = max_compute_units; + } + void set_global_mem_size(size_t global_mem_size) + { + _global_mem_size = global_mem_size; + } + void set_local_mem_size(size_t local_mem_size) + { + _local_mem_size = local_mem_size; + } + void set_max_mem_alloc_size(size_t max_mem_alloc_size) + { + _max_mem_alloc_size = max_mem_alloc_size; + } + void set_max_work_group_size(int max_work_group_size) + { + _max_work_group_size = max_work_group_size; + } + void set_max_sub_group_size(int max_sub_group_size) + { + _max_sub_group_size = max_sub_group_size; + } + void + set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit) + { + _max_work_items_per_compute_unit = max_work_items_per_compute_unit; + } + void set_max_nd_range_size(int max_nd_range_size[]) + { + for (int i = 0; i < 3; i++) + { + _max_nd_range_size[i] = max_nd_range_size[i]; + _max_nd_range_size_i[i] = max_nd_range_size[i]; + } + } + void set_memory_clock_rate(unsigned int memory_clock_rate) + { + _memory_clock_rate = memory_clock_rate; + } + void set_memory_bus_width(unsigned int memory_bus_width) + { + _memory_bus_width = memory_bus_width; + } + void + set_max_register_size_per_work_group(int max_register_size_per_work_group) + { + _max_register_size_per_work_group = max_register_size_per_work_group; + } + void set_device_id(uint32_t device_id) + { + _device_id = device_id; + } + void set_uuid(std::array<unsigned char, 16> uuid) + { + _uuid = std::move(uuid); + } + void set_global_mem_cache_size(unsigned int global_mem_cache_size) + { + _global_mem_cache_size = global_mem_cache_size; + } + + private: + char _name[256]; + int _max_work_item_sizes_i[3]; + bool _host_unified_memory = false; + int _major; + int _minor; + int _integrated = 0; + int _frequency; + // Set estimated value 3200000 kHz as default value. + unsigned int _memory_clock_rate = 3200000; + // Set estimated value 64 bits as default value. + unsigned int _memory_bus_width = 64; + unsigned int _global_mem_cache_size; + int _max_compute_units; + int _max_work_group_size; + int _max_sub_group_size; + int _max_work_items_per_compute_unit; + int _max_register_size_per_work_group; + size_t _global_mem_size; + size_t _local_mem_size; + size_t _max_mem_alloc_size; + size_t _max_nd_range_size[3]; + int _max_nd_range_size_i[3]; + uint32_t _device_id; + std::array<unsigned char, 16> _uuid; + }; + + static int get_major_version(const sycl::device &dev) + { + int major, minor; + detail::get_version(dev, major, minor); + return major; + } + + static int get_minor_version(const sycl::device &dev) + { + int major, minor; + detail::get_version(dev, major, minor); + return minor; + } + + static void get_device_info(device_info &out, const sycl::device &dev) + { + device_info prop; + prop.set_name(dev.get_info<sycl::info::device::name>().c_str()); + + int major, minor; + detail::get_version(dev, major, minor); + prop.set_major_version(major); + prop.set_minor_version(minor); + + prop.set_max_work_item_sizes( +#if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION < 20220902) + // oneAPI DPC++ compiler older than 2022/09/02, where max_work_item_sizes + // is an enum class element + dev.get_info<sycl::info::device::max_work_item_sizes>()); +#else + // SYCL 2020-conformant code, max_work_item_sizes is a struct templated by + // an int + dev.get_info<sycl::info::device::max_work_item_sizes<3>>()); +#endif + prop.set_host_unified_memory(dev.has(sycl::aspect::usm_host_allocations)); + + prop.set_max_clock_frequency( + dev.get_info<sycl::info::device::max_clock_frequency>() * 1000); + + prop.set_max_compute_units( + dev.get_info<sycl::info::device::max_compute_units>()); + prop.set_max_work_group_size( + dev.get_info<sycl::info::device::max_work_group_size>()); + prop.set_global_mem_size(dev.get_info<sycl::info::device::global_mem_size>()); + prop.set_local_mem_size(dev.get_info<sycl::info::device::local_mem_size>()); + prop.set_max_mem_alloc_size(dev.get_info<sycl::info::device::max_mem_alloc_size>()); + +#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6) + if (dev.has(sycl::aspect::ext_intel_memory_clock_rate)) + { + unsigned int tmp = + dev.get_info<sycl::ext::intel::info::device::memory_clock_rate>(); + if (tmp != 0) + prop.set_memory_clock_rate(1000 * tmp); + } + if (dev.has(sycl::aspect::ext_intel_memory_bus_width)) + { + prop.set_memory_bus_width( + dev.get_info<sycl::ext::intel::info::device::memory_bus_width>()); + } + if (dev.has(sycl::aspect::ext_intel_device_id)) + { + prop.set_device_id( + dev.get_info<sycl::ext::intel::info::device::device_id>()); + } + if (dev.has(sycl::aspect::ext_intel_device_info_uuid)) + { + prop.set_uuid(dev.get_info<sycl::ext::intel::info::device::uuid>()); + } +#elif defined(_MSC_VER) && !defined(__clang__) +#pragma message("get_device_info: querying memory_clock_rate and \ + memory_bus_width are not supported by the compiler used. \ + Use 3200000 kHz as memory_clock_rate default value. \ + Use 64 bits as memory_bus_width default value.") +#else +#warning "get_device_info: querying memory_clock_rate and \ + memory_bus_width are not supported by the compiler used. \ + Use 3200000 kHz as memory_clock_rate default value. \ + Use 64 bits as memory_bus_width default value." +#endif + + size_t max_sub_group_size = 1; + std::vector<size_t> sub_group_sizes = + dev.get_info<sycl::info::device::sub_group_sizes>(); + + for (const auto &sub_group_size : sub_group_sizes) + { + if (max_sub_group_size < sub_group_size) + max_sub_group_size = sub_group_size; + } + + prop.set_max_sub_group_size(max_sub_group_size); + + prop.set_max_work_items_per_compute_unit( + dev.get_info<sycl::info::device::max_work_group_size>()); + int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; + prop.set_max_nd_range_size(max_nd_range_size); + + // Estimates max register size per work group, feel free to update the value + // according to device properties. + prop.set_max_register_size_per_work_group(65536); + + prop.set_global_mem_cache_size( + dev.get_info<sycl::info::device::global_mem_cache_size>()); + out = prop; + } + + /// dpct device extension + class device_ext : public sycl::device { + typedef std::mutex mutex_type; + + public: + device_ext() : sycl::device() {} + ~device_ext() { + std::lock_guard<mutex_type> lock(m_mutex); + clear_queues(); + } + device_ext(const sycl::device &base) : sycl::device(base) { + std::lock_guard<mutex_type> lock(m_mutex); + init_queues(); + } + + int is_native_atomic_supported() { return 0; } + int get_major_version() const { return dpct::get_major_version(*this); } + + int get_minor_version() const { return dpct::get_minor_version(*this); } + + int get_max_compute_units() const { + return get_device_info().get_max_compute_units(); + } + + /// Return the maximum clock frequency of this device in KHz. + int get_max_clock_frequency() const { + return get_device_info().get_max_clock_frequency(); + } + + int get_integrated() const { return get_device_info().get_integrated(); } + + int get_max_sub_group_size() const { + return get_device_info().get_max_sub_group_size(); + } + + int get_max_register_size_per_work_group() const { + return get_device_info().get_max_register_size_per_work_group(); + } + + int get_max_work_group_size() const { + return get_device_info().get_max_work_group_size(); + } + + int get_mem_base_addr_align() const { + return get_info<sycl::info::device::mem_base_addr_align>(); + } + + size_t get_global_mem_size() const { + return get_device_info().get_global_mem_size(); + } + + size_t get_max_mem_alloc_size() const { + return get_device_info().get_max_mem_alloc_size(); + } + + /// Get the number of bytes of free and total memory on the SYCL device. + /// \param [out] free_memory The number of bytes of free memory on the + /// SYCL device. \param [out] total_memory The number of bytes of total + /// memory on the SYCL device. + void get_memory_info(size_t &free_memory, size_t &total_memory) { + total_memory = get_device_info().get_global_mem_size(); + const char *warning_info = + "get_memory_info: [warning] ext_intel_free_memory is not " + "supported (export/set ZES_ENABLE_SYSMAN=1 to support), " + "use total memory as free memory"; +#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105) + if (!has(sycl::aspect::ext_intel_free_memory)) { + std::cerr << warning_info << std::endl; + free_memory = total_memory; + } else { + free_memory = get_info<sycl::ext::intel::info::device::free_memory>(); + } +#else + std::cerr << warning_info << std::endl; + free_memory = total_memory; +#if defined(_MSC_VER) && !defined(__clang__) +#pragma message("Querying the number of bytes of free memory is not supported") +#else +#warning "Querying the number of bytes of free memory is not supported" +#endif +#endif + } + + void get_device_info(device_info &out) const { + dpct::get_device_info(out, *this); + } + + device_info get_device_info() const { + device_info prop; + dpct::get_device_info(prop, *this); + return prop; + } + + void reset() { + std::lock_guard<mutex_type> lock(m_mutex); + clear_queues(); + init_queues(); + } + + sycl::queue &in_order_queue() { return _q_in_order; } + + sycl::queue &out_of_order_queue() { return _q_out_of_order; } + + sycl::queue &default_queue() { return in_order_queue(); } + + void queues_wait_and_throw() { + std::unique_lock<mutex_type> lock(m_mutex); + lock.unlock(); + for (auto &q : _queues) { + q.wait_and_throw(); + } + // Guard the destruct of current_queues to make sure the ref count is + // safe. + lock.lock(); + } + + sycl::queue create_queue(bool enable_exception_handler = false) { + return create_in_order_queue(enable_exception_handler); + } + + sycl::queue create_queue(sycl::device device, + bool enable_exception_handler = false) { + return create_in_order_queue(device, enable_exception_handler); + } + + sycl::queue create_in_order_queue(bool enable_exception_handler = false) { + std::lock_guard<mutex_type> lock(m_mutex); + return create_queue_impl(enable_exception_handler, + sycl::property::queue::in_order()); + } + + sycl::queue create_in_order_queue(sycl::device device, + bool enable_exception_handler = false) { + std::lock_guard<mutex_type> lock(m_mutex); + return create_queue_impl(device, enable_exception_handler, + sycl::property::queue::in_order()); + } + + sycl::queue create_out_of_order_queue( + bool enable_exception_handler = false) { + std::lock_guard<mutex_type> lock(m_mutex); + return create_queue_impl(enable_exception_handler); + } + + void destroy_queue(sycl::queue queue) { + std::lock_guard<mutex_type> lock(m_mutex); + _queues.erase(std::remove_if(_queues.begin(), _queues.end(), + [=](const sycl::queue &q) -> bool + { + return q == queue; + }), + _queues.end()); + } + void set_saved_queue(sycl::queue q) { + std::lock_guard<mutex_type> lock(m_mutex); + _saved_queue = q; + } + sycl::queue get_saved_queue() const { + std::lock_guard<mutex_type> lock(m_mutex); + return _saved_queue; + } + + private: + void clear_queues() { _queues.clear(); } + + void init_queues() { + _q_in_order = + create_queue_impl(true, sycl::property::queue::in_order()); + _q_out_of_order = create_queue_impl(true); + _saved_queue = default_queue(); + } + + /// Caller should acquire resource \p m_mutex before calling this + /// function. + template <class... Properties> + sycl::queue create_queue_impl(bool enable_exception_handler, + Properties... properties) { + sycl::async_handler eh = {}; + if (enable_exception_handler) { + eh = exception_handler; + } + _queues.push_back(sycl::queue( + *this, eh, + sycl::property_list( +#ifdef DPCT_PROFILING_ENABLED + sycl::property::queue::enable_profiling(), +#endif + properties...))); + + return _queues.back(); + } + + template <class... Properties> + sycl::queue create_queue_impl(sycl::device device, + bool enable_exception_handler, + Properties... properties) { + sycl::async_handler eh = {}; + if (enable_exception_handler) { + eh = exception_handler; + } + _queues.push_back(sycl::queue( + device, eh, + sycl::property_list( +#ifdef DPCT_PROFILING_ENABLED + sycl::property::queue::enable_profiling(), +#endif + properties...))); + + return _queues.back(); + } + + void get_version(int &major, int &minor) const { + detail::get_version(*this, major, minor); + } + sycl::queue _q_in_order, _q_out_of_order; + sycl::queue _saved_queue; + std::vector<sycl::queue> _queues; + mutable mutex_type m_mutex; + }; + + + /// device manager + class dev_mgr + { + public: + device_ext ¤t_device() + { + unsigned int dev_id = current_device_id(); + check_id(dev_id); + return *_devs[dev_id]; + } + device_ext &cpu_device() const + { + std::lock_guard<std::recursive_mutex> lock(m_mutex); + if (_cpu_device == -1) + { + throw std::runtime_error("no valid cpu device"); + } + else + { + return *_devs[_cpu_device]; + } + } + device_ext &get_device(unsigned int id) const + { + std::lock_guard<std::recursive_mutex> lock(m_mutex); + check_id(id); + return *_devs[id]; + } + unsigned int current_device_id() const + { + std::lock_guard<std::recursive_mutex> lock(m_mutex); + auto it = _thread2dev_map.find(get_tid()); + if (it != _thread2dev_map.end()) + return it->second; + return DEFAULT_DEVICE_ID; + } + + /// Select device with a device ID. + /// \param [in] id The id of the device which can + /// be obtained through get_device_id(const sycl::device). + void select_device(unsigned int id) + { + std::lock_guard<std::recursive_mutex> lock(m_mutex); + check_id(id); + _thread2dev_map[get_tid()] = id; + } + unsigned int device_count() { return _devs.size(); } + + unsigned int get_device_id(const sycl::device &dev) + { + unsigned int id = 0; + for (auto &dev_item : _devs) + { + if (*dev_item == dev) + { + return id; + } + id++; + } + return -1; + } + + inline std::string get_preferred_gpu_platform_name() { + std::string result; + + std::string filter = "level-zero"; + char* env = getenv("ONEAPI_DEVICE_SELECTOR"); + if (env) { + if (std::strstr(env, "level_zero")) { + filter = "level-zero"; + } + else if (std::strstr(env, "opencl")) { + filter = "opencl"; + } + else if (std::strstr(env, "cuda")) { + filter = "cuda"; + } + else if (std::strstr(env, "hip")) { + filter = "hip"; + } + else { + throw std::runtime_error("invalid device filter: " + std::string(env)); + } + } + + auto plaform_list = sycl::platform::get_platforms(); + + for (const auto& platform : plaform_list) { + auto devices = platform.get_devices(); + auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) { + return d.is_gpu(); + }); + + if (gpu_dev == devices.end()) { + // cout << "platform [" << platform_name + // << "] does not contain GPU devices, skipping\n"; + continue; + } + + auto platform_name = platform.get_info<sycl::info::platform::name>(); + std::string platform_name_low_case; + platform_name_low_case.resize(platform_name.size()); + + std::transform( + platform_name.begin(), platform_name.end(), platform_name_low_case.begin(), ::tolower); + + if (platform_name_low_case.find(filter) == std::string::npos) { + // cout << "platform [" << platform_name + // << "] does not match with requested " + // << filter << ", skipping\n"; + continue; + } + + result = platform_name; + } + + if (result.empty()) + throw std::runtime_error("can not find preferred GPU platform"); + + return result; + } + + template <class DeviceSelector> + std::enable_if_t< + std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>> + select_device(const DeviceSelector &selector = sycl::gpu_selector_v) + { + sycl::device selected_device = sycl::device(selector); + unsigned int selected_device_id = get_device_id(selected_device); + select_device(selected_device_id); + } + + /// Returns the instance of device manager singleton. + static dev_mgr &instance() + { + static dev_mgr d_m; + return d_m; + } + dev_mgr(const dev_mgr &) = delete; + dev_mgr &operator=(const dev_mgr &) = delete; + dev_mgr(dev_mgr &&) = delete; + dev_mgr &operator=(dev_mgr &&) = delete; + + private: + mutable std::recursive_mutex m_mutex; + static bool compare_dev(sycl::device &device1, sycl::device &device2) + { + sycl::backend backend1 = device1.get_backend(); + sycl::backend backend2 = device2.get_backend(); + // levelzero backends always come first + if(backend1 == sycl::backend::ext_oneapi_level_zero && backend2 != sycl::backend::ext_oneapi_level_zero) return true; + if(backend1 != sycl::backend::ext_oneapi_level_zero && backend2 == sycl::backend::ext_oneapi_level_zero) return false; + dpct::device_info prop1; + dpct::get_device_info(prop1, device1); + dpct::device_info prop2; + dpct::get_device_info(prop2, device2); + return prop1.get_max_compute_units() > prop2.get_max_compute_units(); + } + static int convert_backend_index(std::string & backend) { + if (backend == "ext_oneapi_level_zero:gpu") return 0; + if (backend == "opencl:gpu") return 1; + if (backend == "ext_oneapi_cuda:gpu") return 2; + if (backend == "ext_oneapi_hip:gpu") return 3; + if (backend == "opencl:cpu") return 4; + if (backend == "opencl:acc") return 5; + printf("convert_backend_index: can't handle backend=%s\n", backend.c_str()); + GGML_ASSERT(false); + } + static bool compare_backend(std::string &backend1, std::string &backend2) { + return convert_backend_index(backend1) < convert_backend_index(backend2); + } + dev_mgr() + { + sycl::device default_device = + sycl::device(sycl::default_selector_v); + _devs.push_back(std::make_shared<device_ext>(default_device)); + + std::vector<sycl::device> sycl_all_devs; + // Collect other devices except for the default device. + if (default_device.is_cpu()) + _cpu_device = 0; + + auto Platforms = sycl::platform::get_platforms(); + // Keep track of the number of devices per backend + std::map<sycl::backend, size_t> DeviceNums; + std::map<std::string, std::vector<sycl::device>> backend_devices; + auto preferred_platform_name = get_preferred_gpu_platform_name(); + + while (!Platforms.empty()) { + auto Platform = Platforms.back(); + Platforms.pop_back(); + auto platform_name = Platform.get_info<sycl::info::platform::name>(); + if (platform_name.compare(preferred_platform_name) != 0) { + continue; + } + auto devices = Platform.get_devices(); + std::string backend_type = get_device_backend_and_type(devices[0]); + for (const auto &device : devices) { + backend_devices[backend_type].push_back(device); + } + } + + std::vector<std::string> keys; + for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) { + keys.push_back(it->first); + } + std::sort(keys.begin(), keys.end(), compare_backend); + + for (auto &key : keys) { + std::vector<sycl::device> devs = backend_devices[key]; + std::sort(devs.begin(), devs.end(), compare_dev); + for (const auto &dev : devs) { + sycl_all_devs.push_back(dev); + } + } + + for (auto &dev : sycl_all_devs) + { + if (dev == default_device) + { + continue; + } + _devs.push_back(std::make_shared<device_ext>(dev)); + if (_cpu_device == -1 && dev.is_cpu()) + { + _cpu_device = _devs.size() - 1; + } + } + } + void check_id(unsigned int id) const + { + if (id >= _devs.size()) + { + throw std::runtime_error("invalid device id"); + } + } + std::vector<std::shared_ptr<device_ext>> _devs; + /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current + /// thread id in _thread2dev_map, which means default device should be used + /// for the current thread. + const unsigned int DEFAULT_DEVICE_ID = 0; + /// thread-id to device-id map. + std::map<unsigned int, unsigned int> _thread2dev_map; + int _cpu_device = -1; + }; + + static inline sycl::queue &get_default_queue() + { + return dev_mgr::instance().current_device().default_queue(); + } + + namespace detail + { + enum class pointer_access_attribute + { + host_only = 0, + device_only, + host_device, + end + }; + + static pointer_access_attribute get_pointer_attribute(sycl::queue &q, + const void *ptr) + { + switch (sycl::get_pointer_type(ptr, q.get_context())) + { + case sycl::usm::alloc::unknown: + return pointer_access_attribute::host_only; + case sycl::usm::alloc::device: + return pointer_access_attribute::device_only; + case sycl::usm::alloc::shared: + case sycl::usm::alloc::host: + return pointer_access_attribute::host_device; + } + } + + template <typename ArgT> + inline constexpr std::uint64_t get_type_combination_id(ArgT Val) + { + static_assert((unsigned char)library_data_t::library_data_t_size <= + std::numeric_limits<unsigned char>::max() && + "library_data_t size exceeds limit."); + static_assert(std::is_same_v<ArgT, library_data_t>, "Unsupported ArgT"); + return (std::uint64_t)Val; + } + + template <typename FirstT, typename... RestT> + inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal, + RestT... RestVal) + { + static_assert((std::uint8_t)library_data_t::library_data_t_size <= + std::numeric_limits<unsigned char>::max() && + "library_data_t size exceeds limit."); + static_assert(sizeof...(RestT) <= 8 && "Too many parameters"); + static_assert(std::is_same_v<FirstT, library_data_t>, "Unsupported FirstT"); + return get_type_combination_id(RestVal...) << 8 | ((std::uint64_t)FirstVal); + } + + class mem_mgr + { + mem_mgr() + { + // Reserved address space, no real memory allocation happens here. +#if defined(__linux__) + mapped_address_space = + (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +#elif defined(_WIN64) + mapped_address_space = (byte_t *)VirtualAlloc( + NULL, // NULL specified as the base address parameter + mapped_region_size, // Size of allocation + MEM_RESERVE, // Allocate reserved pages + PAGE_NOACCESS); // Protection = no access +#else +#error "Only support Windows and Linux." +#endif + next_free = mapped_address_space; + } + + public: + using buffer_id_t = int; + + struct allocation + { + buffer_t buffer; + byte_t *alloc_ptr; + size_t size; + }; + + ~mem_mgr() + { +#if defined(__linux__) + munmap(mapped_address_space, mapped_region_size); +#elif defined(_WIN64) + VirtualFree(mapped_address_space, 0, MEM_RELEASE); +#else +#error "Only support Windows and Linux." +#endif + } + + mem_mgr(const mem_mgr &) = delete; + mem_mgr &operator=(const mem_mgr &) = delete; + mem_mgr(mem_mgr &&) = delete; + mem_mgr &operator=(mem_mgr &&) = delete; + + /// Allocate + void *mem_alloc(size_t size) + { + if (!size) + return nullptr; + std::lock_guard<std::mutex> lock(m_mutex); + if (next_free + size > mapped_address_space + mapped_region_size) + { + throw std::runtime_error("dpct_malloc: out of memory for virtual memory pool"); + } + // Allocation + sycl::range<1> r(size); + buffer_t buf(r); + allocation A{buf, next_free, size}; + // Map allocation to device pointer + void *result = next_free; + m_map.emplace(next_free + size, A); + // Update pointer to the next free space. + next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1); + + return result; + } + + /// Deallocate + void mem_free(const void *ptr) + { + if (!ptr) + return; + std::lock_guard<std::mutex> lock(m_mutex); + auto it = get_map_iterator(ptr); + m_map.erase(it); + } + + /// map: device pointer -> allocation(buffer, alloc_ptr, size) + allocation translate_ptr(const void *ptr) + { + std::lock_guard<std::mutex> lock(m_mutex); + auto it = get_map_iterator(ptr); + return it->second; + } + + /// Check if the pointer represents device pointer or not. + bool is_device_ptr(const void *ptr) const + { + std::lock_guard<std::mutex> lock(m_mutex); + return (mapped_address_space <= ptr) && + (ptr < mapped_address_space + mapped_region_size); + } + + /// Returns the instance of memory manager singleton. + static mem_mgr &instance() + { + static mem_mgr m; + return m; + } + + private: + std::map<byte_t *, allocation> m_map; + mutable std::mutex m_mutex; + byte_t *mapped_address_space; + byte_t *next_free; + const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024; + const size_t alignment = 256; + /// This padding may be defined to some positive value to debug + /// out of bound accesses. + const size_t extra_padding = 0; + + std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr) + { + auto it = m_map.upper_bound((byte_t *)ptr); + if (it == m_map.end()) + { + // Not a virtual pointer. + throw std::runtime_error("can not get buffer from non-virtual pointer"); + } + const allocation &alloc = it->second; + if (ptr < alloc.alloc_ptr) + { + // Out of bound. + // This may happen if there's a gap between allocations due to alignment + // or extra padding and pointer points to this gap. + throw std::runtime_error("invalid virtual pointer"); + } + return it; + } + }; + + template <class T, memory_region Memory, size_t Dimension> + class accessor; + template <memory_region Memory, class T = byte_t> + class memory_traits + { + public: + static constexpr sycl::access::target target = + sycl::access::target::device; + static constexpr sycl::access_mode mode = + (Memory == constant) ? sycl::access_mode::read + : sycl::access_mode::read_write; + static constexpr size_t type_size = sizeof(T); + using element_t = + typename std::conditional<Memory == constant, const T, T>::type; + using value_t = typename std::remove_cv<T>::type; + template <size_t Dimension = 1> + using accessor_t = typename std::conditional< + Memory == local, sycl::local_accessor<value_t, Dimension>, + sycl::accessor<T, Dimension, mode, target>>::type; + using pointer_t = T *; + }; + + static inline void *dpct_malloc(size_t size, sycl::queue &q) + { + return sycl::malloc_device(size, q.get_device(), q.get_context()); + } + +#define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F)) + static inline void *dpct_malloc(size_t &pitch, size_t x, size_t y, size_t z, + sycl::queue &q) + { + pitch = PITCH_DEFAULT_ALIGN(x); + return dpct_malloc(pitch * y * z, q); + } + + /** + * @brief Sets \p value to the first \p size elements starting from \p dev_ptr in \p q. + * @tparam valueT The type of the element to be set. + * @param [in] q The queue in which the operation is done. + * @param [in] dev_ptr Pointer to the virtual device memory address. + * @param [in] value The value to be set. + * @param [in] size Number of elements to be set to the value. + * @return An event representing the memset operation. + */ + template <typename valueT> + static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr, + valueT value, size_t size) + { + return q.fill(dev_ptr, value, size); + } + + /** + * @brief Sets \p value to the 3D memory region pointed by \p data in \p q. + * @tparam valueT The type of the element to be set. + * @param [in] q The queue in which the operation is done. + * @param [in] data Pointer to the pitched device memory region. + * @param [in] value The value to be set. + * @param [in] size 3D memory region by number of elements. + * @return An event list representing the memset operations. + */ + template <typename valueT> + static inline std::vector<sycl::event> + dpct_memset(sycl::queue &q, pitched_data data, valueT value, + sycl::range<3> size) + { + std::vector<sycl::event> event_list; + size_t slice = data.get_pitch() * data.get_y(); + unsigned char *data_surface = (unsigned char *)data.get_data_ptr(); + for (size_t z = 0; z < size.get(2); ++z) + { + unsigned char *data_ptr = data_surface; + for (size_t y = 0; y < size.get(1); ++y) + { + event_list.push_back(dpct_memset(q, data_ptr, value, size.get(0))); + data_ptr += data.get_pitch(); + } + data_surface += slice; + } + return event_list; + } + + /** + * @brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p q. + * @tparam valueT The type of the element to be set. + * @param [in] q The queue in which the operation is done. + * @param [in] ptr Pointer to the virtual device memory. + * @param [in] pitch The pitch size by number of elements, including padding. + * @param [in] val The value to be set. + * @param [in] x The width of memory region by number of elements. + * @param [in] y The height of memory region by number of elements. + * @return An event list representing the memset operations. + */ + template <typename valueT> + static inline std::vector<sycl::event> + dpct_memset(sycl::queue &q, void *ptr, size_t pitch, valueT val, size_t x, + size_t y) + { + return dpct_memset(q, pitched_data(ptr, pitch, x, 1), val, + sycl::range<3>(x, y, 1)); + } + + static memcpy_direction deduce_memcpy_direction(sycl::queue &q, void *to_ptr, + const void *from_ptr, + memcpy_direction dir) + { + switch (dir) + { + case memcpy_direction::host_to_host: + case memcpy_direction::host_to_device: + case memcpy_direction::device_to_host: + case memcpy_direction::device_to_device: + return dir; + case memcpy_direction::automatic: + { + // table[to_attribute][from_attribute] + static const memcpy_direction + direction_table[static_cast<unsigned>(pointer_access_attribute::end)] + [static_cast<unsigned>(pointer_access_attribute::end)] = + {{memcpy_direction::host_to_host, + memcpy_direction::device_to_host, + memcpy_direction::host_to_host}, + {memcpy_direction::host_to_device, + memcpy_direction::device_to_device, + memcpy_direction::device_to_device}, + {memcpy_direction::host_to_host, + memcpy_direction::device_to_device, + memcpy_direction::device_to_device}}; + return direction_table[static_cast<unsigned>(get_pointer_attribute( + q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))]; + } + default: + throw std::runtime_error("dpct_memcpy: invalid direction value"); + } + } + + static sycl::event + dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size, + memcpy_direction direction, + const std::vector<sycl::event> &dep_events = {}) + { + if (!size) + return sycl::event{}; + return q.memcpy(to_ptr, from_ptr, size, dep_events); + GGML_UNUSED(direction); + } + + // Get actual copy range and make sure it will not exceed range. + static inline size_t get_copy_range(sycl::range<3> size, size_t slice, + size_t pitch) + { + return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0); + } + + static inline size_t get_offset(sycl::id<3> id, size_t slice, + size_t pitch) + { + return slice * id.get(2) + pitch * id.get(1) + id.get(0); + } + + /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr + /// and \p from_range to another specified by \p to_ptr and \p to_range. + static inline std::vector<sycl::event> + dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, + sycl::range<3> to_range, sycl::range<3> from_range, + sycl::id<3> to_id, sycl::id<3> from_id, + sycl::range<3> size, memcpy_direction direction, + const std::vector<sycl::event> &dep_events = {}) + { + // RAII for host pointer + class host_buffer + { + void *_buf; + size_t _size; + sycl::queue &_q; + const std::vector<sycl::event> &_deps; // free operation depends + + public: + host_buffer(size_t size, sycl::queue &q, + const std::vector<sycl::event> &deps) + : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {} + void *get_ptr() const { return _buf; } + size_t get_size() const { return _size; } + ~host_buffer() + { + if (_buf) + { + _q.submit([&](sycl::handler &cgh) + { + cgh.depends_on(_deps); + cgh.host_task([buf = _buf] { std::free(buf); }); }); + } + } + }; + std::vector<sycl::event> event_list; + + size_t to_slice = to_range.get(1) * to_range.get(0), + from_slice = from_range.get(1) * from_range.get(0); + unsigned char *to_surface = + (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0)); + const unsigned char *from_surface = + (const unsigned char *)from_ptr + + get_offset(from_id, from_slice, from_range.get(0)); + + if (to_slice == from_slice && to_slice == size.get(1) * size.get(0)) + { + return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2), + direction, dep_events)}; + } + direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction); + size_t size_slice = size.get(1) * size.get(0); + switch (direction) + { + case host_to_host: + for (size_t z = 0; z < size.get(2); ++z) + { + unsigned char *to_ptr = to_surface; + const unsigned char *from_ptr = from_surface; + if (to_range.get(0) == from_range.get(0) && + to_range.get(0) == size.get(0)) + { + event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice, + direction, dep_events)); + } + else + { + for (size_t y = 0; y < size.get(1); ++y) + { + event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0), + direction, dep_events)); + to_ptr += to_range.get(0); + from_ptr += from_range.get(0); + } + } + to_surface += to_slice; + from_surface += from_slice; + } + break; + case host_to_device: + { + host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q, + event_list); + std::vector<sycl::event> host_events; + if (to_slice == size_slice) + { + // Copy host data to a temp host buffer with the shape of target. + host_events = + dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range, + sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, + host_to_host, dep_events); + } + else + { + // Copy host data to a temp host buffer with the shape of target. + host_events = dpct_memcpy( + q, buf.get_ptr(), from_surface, to_range, from_range, + sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host, + // If has padding data, not sure whether it is useless. So fill temp + // buffer with it. + std::vector<sycl::event>{ + dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(), + device_to_host, dep_events)}); + } + // Copy from temp host buffer to device with only one submit. + event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(), + buf.get_size(), host_to_device, + host_events)); + break; + } + case device_to_host: + { + host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q, + event_list); + // Copy from host temp buffer to host target with reshaping. + event_list = dpct_memcpy( + q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0), + sycl::id<3>(0, 0, 0), size, host_to_host, + // Copy from device to temp host buffer with only one submit. + std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface, + buf.get_size(), + device_to_host, dep_events)}); + break; + } + case device_to_device: + event_list.push_back(q.submit([&](sycl::handler &cgh){ + cgh.depends_on(dep_events); + cgh.parallel_for<class dpct_memcpy_3d_detail>( + size, + [=](sycl::id<3> id) { + to_surface[get_offset(id, to_slice, to_range.get(0))] = + from_surface[get_offset(id, from_slice, from_range.get(0))]; + }); })); + break; + default: + throw std::runtime_error("dpct_memcpy: invalid direction value"); + } + return event_list; + } + + /// memcpy 2D/3D matrix specified by pitched_data. + static inline std::vector<sycl::event> + dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id, + pitched_data from, sycl::id<3> from_id, sycl::range<3> size, + memcpy_direction direction = automatic) + { + return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(), + sycl::range<3>(to.get_pitch(), to.get_y(), 1), + sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id, + size, direction); + } + + /// memcpy 2D matrix with pitch. + static inline std::vector<sycl::event> + dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, + size_t to_pitch, size_t from_pitch, size_t x, size_t y, + memcpy_direction direction = automatic) + { + return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1), + sycl::range<3>(from_pitch, y, 1), + sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), + sycl::range<3>(x, y, 1), direction); + } + + namespace deprecated + { + + template <typename T, sycl::usm::alloc AllocKind> + class usm_allocator + { + private: + using Alloc = sycl::usm_allocator<T, AllocKind>; + Alloc _impl; + + public: + using value_type = typename std::allocator_traits<Alloc>::value_type; + using pointer = typename std::allocator_traits<Alloc>::pointer; + using const_pointer = typename std::allocator_traits<Alloc>::const_pointer; + using void_pointer = typename std::allocator_traits<Alloc>::void_pointer; + using const_void_pointer = + typename std::allocator_traits<Alloc>::const_void_pointer; + using reference = typename std::allocator_traits<Alloc>::value_type &; + using const_reference = + const typename std::allocator_traits<Alloc>::value_type &; + using difference_type = + typename std::allocator_traits<Alloc>::difference_type; + using size_type = typename std::allocator_traits<Alloc>::size_type; + using propagate_on_container_copy_assignment = typename std::allocator_traits< + Alloc>::propagate_on_container_copy_assignment; + using propagate_on_container_move_assignment = typename std::allocator_traits< + Alloc>::propagate_on_container_move_assignment; + using propagate_on_container_swap = + typename std::allocator_traits<Alloc>::propagate_on_container_swap; + using is_always_equal = + typename std::allocator_traits<Alloc>::is_always_equal; + + template <typename U> + struct rebind + { + typedef usm_allocator<U, AllocKind> other; + }; + + usm_allocator() : _impl(dpct::get_default_queue()) {} + ~usm_allocator() {} + usm_allocator(const usm_allocator &other) : _impl(other._impl) {} + usm_allocator(usm_allocator &&other) : _impl(std::move(other._impl)) {} + pointer address(reference r) { return &r; } + const_pointer address(const_reference r) { return &r; } + pointer allocate(size_type cnt, const_void_pointer hint = nullptr) + { + return std::allocator_traits<Alloc>::allocate(_impl, cnt, hint); + } + void deallocate(pointer p, size_type cnt) + { + std::allocator_traits<Alloc>::deallocate(_impl, p, cnt); + } + size_type max_size() const + { + return std::allocator_traits<Alloc>::max_size(_impl); + } + bool operator==(const usm_allocator &other) const { return _impl == other._impl; } + bool operator!=(const usm_allocator &other) const { return _impl != other._impl; } + }; + + } // namespace deprecated + + inline void dpct_free(void *ptr, + const sycl::queue &q) + { + if (ptr) + { + sycl::free(ptr, q.get_context()); + } + } + + template <typename T> + inline auto get_memory(const void *x) + { + T *new_x = reinterpret_cast<T *>(const_cast<void *>(x)); + return new_x; + } + + template <typename T> + inline typename DataType<T>::T2 get_value(const T *s, sycl::queue &q) + { + using Ty = typename DataType<T>::T2; + Ty s_h; + if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only) + detail::dpct_memcpy(q, (void *)&s_h, (const void *)s, sizeof(T), device_to_host) + .wait(); + else + s_h = *reinterpret_cast<const Ty *>(s); + return s_h; + } + + } // namespace detail + + template <typename T> + inline auto get_value(const T *s, sycl::queue &q) + { + return detail::get_value(s, q); + } + + namespace detail + { + template <class Ta, class Tb, class Tc, class Ts> + inline void gemm_impl(sycl::queue &q, oneapi::mkl::transpose a_trans, + oneapi::mkl::transpose b_trans, int m, int n, int k, + const void *alpha, const void *a, int lda, const void *b, + int ldb, const void *beta, void *c, int ldc) + { + Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q); + Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q); + auto data_a = get_memory<const Ta>(a); + auto data_b = get_memory<const Tb>(b); + auto data_c = get_memory<Tc>(c); + oneapi::mkl::blas::column_major::gemm( + q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, + data_b, ldb, beta_value, data_c, ldc); + } + + template <typename VecT, class BinaryOperation, class = void> + class vectorized_binary + { + public: + inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) + { + VecT v4; + for (size_t i = 0; i < v4.size(); ++i) + { + v4[i] = binary_op(a[i], b[i]); + } + return v4; + } + }; + + template <typename VecT, class BinaryOperation> + class vectorized_binary< + VecT, BinaryOperation, + std::void_t<std::invoke_result_t<BinaryOperation, VecT, VecT>>> + { + public: + inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) + { + return binary_op(a, b).template as<VecT>(); + } + }; + + template <class Ta, class Tb, class Tc, class Ts> + inline void gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans, + oneapi::mkl::transpose b_trans, int m, int n, int k, + const void *alpha, const void **a, int lda, + const void **b, int ldb, const void *beta, void **c, + int ldc, int batch_size) + { + struct matrix_info_t + { + oneapi::mkl::transpose transpose_info[2]; + Ts value_info[2]; + std::int64_t size_info[3]; + std::int64_t ld_info[3]; + std::int64_t groupsize_info; + }; + + Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q); + Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q); + + matrix_info_t *matrix_info = + (matrix_info_t *)std::malloc(sizeof(matrix_info_t)); + matrix_info->transpose_info[0] = a_trans; + matrix_info->transpose_info[1] = b_trans; + matrix_info->value_info[0] = alpha_value; + matrix_info->value_info[1] = beta_value; + matrix_info->size_info[0] = m; + matrix_info->size_info[1] = n; + matrix_info->size_info[2] = k; + matrix_info->ld_info[0] = lda; + matrix_info->ld_info[1] = ldb; + matrix_info->ld_info[2] = ldc; + matrix_info->groupsize_info = batch_size; + + sycl::event e = oneapi::mkl::blas::column_major::gemm_batch( + q, matrix_info->transpose_info, matrix_info->transpose_info + 1, + matrix_info->size_info, matrix_info->size_info + 1, + matrix_info->size_info + 2, matrix_info->value_info, + reinterpret_cast<const Ta **>(a), matrix_info->ld_info, + reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1, + matrix_info->value_info + 1, reinterpret_cast<Tc **>(c), + matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info)); + + q.submit([&](sycl::handler &cgh) + { + cgh.depends_on(e); + cgh.host_task([=] { std::free(matrix_info); }); }); + } + + template <class Ta, class Tb, class Tc, class Ts> + inline void + gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans, + oneapi::mkl::transpose b_trans, int m, int n, + int k, const void *alpha, const void *a, int lda, + long long int stride_a, const void *b, int ldb, + long long int stride_b, const void *beta, void *c, + int ldc, long long int stride_c, int batch_size) + { + Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q); + Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q); + auto data_a = get_memory<const Ta>(a); + auto data_b = get_memory<const Tb>(b); + auto data_c = get_memory<Tc>(c); + oneapi::mkl::blas::column_major::gemm_batch( + q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, + stride_a, data_b, ldb, stride_b, beta_value, + data_c, ldc, stride_c, batch_size); + } + + } // namespace detail + + template <typename VecT, class BinaryOperation> + inline unsigned vectorized_binary(unsigned a, unsigned b, + const BinaryOperation binary_op) + { + sycl::vec<unsigned, 1> v0{a}, v1{b}; + auto v2 = v0.as<VecT>(); + auto v3 = v1.as<VecT>(); + auto v4 = + detail::vectorized_binary<VecT, BinaryOperation>()(v2, v3, binary_op); + v0 = v4.template as<sycl::vec<unsigned, 1>>(); + return v0; + } + + static void async_dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size, + memcpy_direction direction = automatic, + sycl::queue &q = dpct::get_default_queue()) + { + detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction); + } + + static inline unsigned int select_device(unsigned int id) + { + dev_mgr::instance().select_device(id); + return id; + } + + template <typename T> + T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask, + unsigned int logical_sub_group_size = 32) + { + unsigned int id = g.get_local_linear_id(); + unsigned int start_index = + id / logical_sub_group_size * logical_sub_group_size; + unsigned int target_offset = (id % logical_sub_group_size) ^ mask; + return sycl::select_from_group(g, x, + target_offset < logical_sub_group_size + ? start_index + target_offset + : id); + } + + template <typename T> + sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val) + { + return sycl::vec<T, 1>(val) + .template as<sycl::vec< + std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>() + .template convert<T>(); + } + + template <typename T1, typename T2> + using dot_product_acc_t = + std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>, + uint32_t, int32_t>; + + template <typename T1, typename T2, typename T3> + inline auto dp4a(T1 a, T2 b, T3 c) + { + dot_product_acc_t<T1, T2> res = c; + auto va = extract_and_sign_or_zero_extend4(a); + auto vb = extract_and_sign_or_zero_extend4(b); + res += va[0] * vb[0]; + res += va[1] * vb[1]; + res += va[2] * vb[2]; + res += va[3] * vb[3]; + return res; + } + + struct sub_sat + { + template <typename T> + auto operator()(const T x, const T y) const + { + return sycl::sub_sat(x, y); + } + }; + + template <typename S, typename T> + inline T vectorized_min(T a, T b) + { + sycl::vec<T, 1> v0{a}, v1{b}; + auto v2 = v0.template as<S>(); + auto v3 = v1.template as<S>(); + auto v4 = sycl::min(v2, v3); + v0 = v4.template as<sycl::vec<T, 1>>(); + return v0; + } + + inline float pow(const float a, const int b) { return sycl::pown(a, b); } + inline double pow(const double a, const int b) { return sycl::pown(a, b); } + inline float pow(const float a, const float b) { return sycl::pow(a, b); } + inline double pow(const double a, const double b) { return sycl::pow(a, b); } + template <typename T, typename U> + inline typename std::enable_if_t<std::is_floating_point_v<T>, T> + pow(const T a, const U b) + { + return sycl::pow(a, static_cast<T>(b)); + } + template <typename T, typename U> + inline typename std::enable_if_t<!std::is_floating_point_v<T>, double> + pow(const T a, const U b) + { + return sycl::pow(static_cast<double>(a), static_cast<double>(b)); + } + + inline double min(const double a, const float b) + { + return sycl::fmin(a, static_cast<double>(b)); + } + inline double min(const float a, const double b) + { + return sycl::fmin(static_cast<double>(a), b); + } + inline float min(const float a, const float b) { return sycl::fmin(a, b); } + inline double min(const double a, const double b) { return sycl::fmin(a, b); } + inline std::uint32_t min(const std::uint32_t a, const std::int32_t b) + { + return sycl::min(a, static_cast<std::uint32_t>(b)); + } + inline std::uint32_t min(const std::int32_t a, const std::uint32_t b) + { + return sycl::min(static_cast<std::uint32_t>(a), b); + } + inline std::int32_t min(const std::int32_t a, const std::int32_t b) + { + return sycl::min(a, b); + } + inline std::uint32_t min(const std::uint32_t a, const std::uint32_t b) + { + return sycl::min(a, b); + } + inline std::uint64_t min(const std::uint64_t a, const std::int64_t b) + { + return sycl::min(a, static_cast<std::uint64_t>(b)); + } + inline std::uint64_t min(const std::int64_t a, const std::uint64_t b) + { + return sycl::min(static_cast<std::uint64_t>(a), b); + } + inline std::int64_t min(const std::int64_t a, const std::int64_t b) + { + return sycl::min(a, b); + } + inline std::uint64_t min(const std::uint64_t a, const std::uint64_t b) + { + return sycl::min(a, b); + } + inline std::uint64_t min(const std::uint64_t a, const std::int32_t b) + { + return sycl::min(a, static_cast<std::uint64_t>(b)); + } + inline std::uint64_t min(const std::int32_t a, const std::uint64_t b) + { + return sycl::min(static_cast<std::uint64_t>(a), b); + } + inline std::uint64_t min(const std::uint64_t a, const std::uint32_t b) + { + return sycl::min(a, static_cast<std::uint64_t>(b)); + } + inline std::uint64_t min(const std::uint32_t a, const std::uint64_t b) + { + return sycl::min(static_cast<std::uint64_t>(a), b); + } + // max function overloads. + // For floating-point types, `float` or `double` arguments are acceptable. + // For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or + // `std::int64_t` type arguments are acceptable. + inline double max(const double a, const float b) + { + return sycl::fmax(a, static_cast<double>(b)); + } + inline double max(const float a, const double b) + { + return sycl::fmax(static_cast<double>(a), b); + } + inline float max(const float a, const float b) { return sycl::fmax(a, b); } + inline double max(const double a, const double b) { return sycl::fmax(a, b); } + inline std::uint32_t max(const std::uint32_t a, const std::int32_t b) + { + return sycl::max(a, static_cast<std::uint32_t>(b)); + } + inline std::uint32_t max(const std::int32_t a, const std::uint32_t b) + { + return sycl::max(static_cast<std::uint32_t>(a), b); + } + inline std::int32_t max(const std::int32_t a, const std::int32_t b) + { + return sycl::max(a, b); + } + inline std::uint32_t max(const std::uint32_t a, const std::uint32_t b) + { + return sycl::max(a, b); + } + inline std::uint64_t max(const std::uint64_t a, const std::int64_t b) + { + return sycl::max(a, static_cast<std::uint64_t>(b)); + } + inline std::uint64_t max(const std::int64_t a, const std::uint64_t b) + { + return sycl::max(static_cast<std::uint64_t>(a), b); + } + inline std::int64_t max(const std::int64_t a, const std::int64_t b) + { + return sycl::max(a, b); + } + inline std::uint64_t max(const std::uint64_t a, const std::uint64_t b) + { + return sycl::max(a, b); + } + inline std::uint64_t max(const std::uint64_t a, const std::int32_t b) + { + return sycl::max(a, static_cast<std::uint64_t>(b)); + } + inline std::uint64_t max(const std::int32_t a, const std::uint64_t b) + { + return sycl::max(static_cast<std::uint64_t>(a), b); + } + inline std::uint64_t max(const std::uint64_t a, const std::uint32_t b) + { + return sycl::max(a, static_cast<std::uint64_t>(b)); + } + inline std::uint64_t max(const std::uint32_t a, const std::uint64_t b) + { + return sycl::max(static_cast<std::uint64_t>(a), b); + } + + inline void + has_capability_or_fail(const sycl::device &dev, + const std::initializer_list<sycl::aspect> &props) + { + for (const auto &it : props) + { + if (dev.has(it)) + continue; + switch (it) + { + case sycl::aspect::fp64: + throw std::runtime_error("'double' is not supported in '" + + dev.get_info<sycl::info::device::name>() + + "' device"); + break; + case sycl::aspect::fp16: + throw std::runtime_error("'half' is not supported in '" + + dev.get_info<sycl::info::device::name>() + + "' device"); + break; + default: +#define __SYCL_ASPECT(ASPECT, ID) \ + case sycl::aspect::ASPECT: \ + return #ASPECT; +#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID) +#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE) + auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string + { + switch (AspectNum) + { +#include <sycl/info/aspects.def> +#include <sycl/info/aspects_deprecated.def> + default: + return "unknown aspect"; + } + }; +#undef __SYCL_ASPECT_DEPRECATED_ALIAS +#undef __SYCL_ASPECT_DEPRECATED +#undef __SYCL_ASPECT + throw std::runtime_error( + "'" + getAspectNameStr(it) + "' is not supported in '" + + dev.get_info<sycl::info::device::name>() + "' device"); + } + break; + } + } + + static inline unsigned int get_current_device_id() + { + return dev_mgr::instance().current_device_id(); + } + + static inline device_ext &get_current_device() + { + return dev_mgr::instance().current_device(); + } + + static inline device_ext &get_device(unsigned int id) + { + return dev_mgr::instance().get_device(id); + } + + static inline sycl::queue &get_in_order_queue() + { + return dev_mgr::instance().current_device().in_order_queue(); + } + + static sycl::event + dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size, + memcpy_direction direction, + const std::vector<sycl::event> &dep_events = {}) + { + if (!size) + return sycl::event{}; + return q.memcpy(to_ptr, from_ptr, size, dep_events); + GGML_UNUSED(direction); + } + + // Get actual copy range and make sure it will not exceed range. + static inline size_t get_copy_range(sycl::range<3> size, size_t slice, + size_t pitch) + { + return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0); + } + + static inline size_t get_offset(sycl::id<3> id, size_t slice, + size_t pitch) + { + return slice * id.get(2) + pitch * id.get(1) + id.get(0); + } + + /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr + /// and \p from_range to another specified by \p to_ptr and \p to_range. + static inline std::vector<sycl::event> + dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, + sycl::range<3> to_range, sycl::range<3> from_range, + sycl::id<3> to_id, sycl::id<3> from_id, + sycl::range<3> size, memcpy_direction direction, + const std::vector<sycl::event> &dep_events = {}) + { + // RAII for host pointer + class host_buffer + { + void *_buf; + size_t _size; + sycl::queue &_q; + const std::vector<sycl::event> &_deps; // free operation depends + + public: + host_buffer(size_t size, sycl::queue &q, + const std::vector<sycl::event> &deps) + : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {} + void *get_ptr() const { return _buf; } + size_t get_size() const { return _size; } + ~host_buffer() + { + if (_buf) + { + _q.submit([&](sycl::handler &cgh) + { + cgh.depends_on(_deps); + cgh.host_task([buf = _buf] { std::free(buf); }); }); + } + } + }; + std::vector<sycl::event> event_list; + + size_t to_slice = to_range.get(1) * to_range.get(0), + from_slice = from_range.get(1) * from_range.get(0); + unsigned char *to_surface = + (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0)); + const unsigned char *from_surface = + (const unsigned char *)from_ptr + + get_offset(from_id, from_slice, from_range.get(0)); + + if (to_slice == from_slice && to_slice == size.get(1) * size.get(0)) + { + return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2), + direction, dep_events)}; + } + direction = detail::deduce_memcpy_direction(q, to_ptr, from_ptr, direction); + size_t size_slice = size.get(1) * size.get(0); + switch (direction) + { + case host_to_host: + for (size_t z = 0; z < size.get(2); ++z) + { + unsigned char *to_ptr = to_surface; + const unsigned char *from_ptr = from_surface; + if (to_range.get(0) == from_range.get(0) && + to_range.get(0) == size.get(0)) + { + event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice, + direction, dep_events)); + } + else + { + for (size_t y = 0; y < size.get(1); ++y) + { + event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0), + direction, dep_events)); + to_ptr += to_range.get(0); + from_ptr += from_range.get(0); + } + } + to_surface += to_slice; + from_surface += from_slice; + } + break; + case host_to_device: + { + host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q, + event_list); + std::vector<sycl::event> host_events; + if (to_slice == size_slice) + { + // Copy host data to a temp host buffer with the shape of target. + host_events = + dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range, + sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, + host_to_host, dep_events); + } + else + { + // Copy host data to a temp host buffer with the shape of target. + host_events = dpct_memcpy( + q, buf.get_ptr(), from_surface, to_range, from_range, + sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host, + // If has padding data, not sure whether it is useless. So fill temp + // buffer with it. + std::vector<sycl::event>{ + dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(), + device_to_host, dep_events)}); + } + // Copy from temp host buffer to device with only one submit. + event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(), + buf.get_size(), host_to_device, + host_events)); + break; + } + case device_to_host: + { + host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q, + event_list); + // Copy from host temp buffer to host target with reshaping. + event_list = dpct_memcpy( + q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0), + sycl::id<3>(0, 0, 0), size, host_to_host, + // Copy from device to temp host buffer with only one submit. + std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface, + buf.get_size(), + device_to_host, dep_events)}); + break; + } + case device_to_device: + event_list.push_back(q.submit([&](sycl::handler &cgh) + { + cgh.depends_on(dep_events); + cgh.parallel_for<class dpct_memcpy_3d_detail>( + size, + [=](sycl::id<3> id) { + to_surface[get_offset(id, to_slice, to_range.get(0))] = + from_surface[get_offset(id, from_slice, from_range.get(0))]; + }); })); + break; + default: + throw std::runtime_error("dpct_memcpy: invalid direction value"); + } + return event_list; + } + + /// memcpy 2D/3D matrix specified by pitched_data. + static inline std::vector<sycl::event> + dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id, + pitched_data from, sycl::id<3> from_id, sycl::range<3> size, + memcpy_direction direction = automatic) + { + return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(), + sycl::range<3>(to.get_pitch(), to.get_y(), 1), + sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id, + size, direction); + } + + /// memcpy 2D matrix with pitch. + static inline std::vector<sycl::event> + dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, + size_t to_pitch, size_t from_pitch, size_t x, size_t y, + memcpy_direction direction = automatic) + { + return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1), + sycl::range<3>(from_pitch, y, 1), + sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), + sycl::range<3>(x, y, 1), direction); + } + + inline void gemm(sycl::queue &q, oneapi::mkl::transpose a_trans, + oneapi::mkl::transpose b_trans, int m, int n, int k, + const void *alpha, const void *a, library_data_t a_type, + int lda, const void *b, library_data_t b_type, int ldb, + const void *beta, void *c, library_data_t c_type, int ldc, + library_data_t scaling_type) + { + if (scaling_type == library_data_t::real_float && + c_type == library_data_t::complex_float) + { + scaling_type = library_data_t::complex_float; + } + else if (scaling_type == library_data_t::real_double && + c_type == library_data_t::complex_double) + { + scaling_type = library_data_t::complex_double; + } + + std::uint64_t key = + detail::get_type_combination_id(a_type, b_type, c_type, scaling_type); + switch (key) + { + case detail::get_type_combination_id( + library_data_t::real_float, library_data_t::real_float, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_impl<float, float, float, float>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_double, library_data_t::real_double, + library_data_t::real_double, library_data_t::real_double): + { + detail::gemm_impl<double, double, double, double>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::complex_float, library_data_t::complex_float, + library_data_t::complex_float, library_data_t::complex_float): + { + detail::gemm_impl<std::complex<float>, std::complex<float>, + std::complex<float>, std::complex<float>>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::complex_double, library_data_t::complex_double, + library_data_t::complex_double, library_data_t::complex_double): + { + detail::gemm_impl<std::complex<double>, std::complex<double>, + std::complex<double>, std::complex<double>>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_half, library_data_t::real_half): + { + detail::gemm_impl<sycl::half, sycl::half, sycl::half, + sycl::half>(q, a_trans, b_trans, m, n, k, alpha, a, + lda, b, ldb, beta, c, ldc); + break; + } +#ifdef __INTEL_MKL__ + case detail::get_type_combination_id( + library_data_t::real_bfloat16, library_data_t::real_bfloat16, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, + float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, + ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_impl<sycl::half, sycl::half, float, float>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_half, library_data_t::real_float): + { + float alpha_value = + dpct::get_value(reinterpret_cast<const float *>(alpha), q); + float beta_value = + dpct::get_value(reinterpret_cast<const float *>(beta), q); + sycl::half alpha_half(alpha_value); + sycl::half beta_half(beta_value); + detail::gemm_impl<sycl::half, sycl::half, sycl::half, + sycl::half>(q, a_trans, b_trans, m, n, k, &alpha_half, + a, lda, b, ldb, &beta_half, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_int8, library_data_t::real_int8, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_impl<std::int8_t, std::int8_t, float, float>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_bfloat16, library_data_t::real_bfloat16, + library_data_t::real_bfloat16, library_data_t::real_float): + { + detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, + oneapi::mkl::bfloat16, float>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_int8, library_data_t::real_int8, + library_data_t::real_int32, library_data_t::real_int32): + { + float alpha_float = + dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q); + float beta_float = + dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q); + detail::gemm_impl<std::int8_t, std::int8_t, std::int32_t, float>( + q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc); + break; + } +#endif // __INTEL_MKL__ + default: + throw std::runtime_error("the combination of data type is unsupported"); + } + } // gemm() + + /// Computes a batch of matrix-matrix product with general matrices. + /// \param [in] q The queue where the routine should be executed. + /// \param [in] a_trans Specifies the operation applied to A. + /// \param [in] b_trans Specifies the operation applied to B. + /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C. + /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C. + /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B). + /// \param [in] alpha Scaling factor for the matrix-matrix product. + /// \param [in] a Input matrix A. + /// \param [in] a_type Data type of the matrix A. + /// \param [in] lda Leading dimension of A. + /// \param [in] b Input matrix B. + /// \param [in] b_type Data type of the matrix B. + /// \param [in] ldb Leading dimension of B. + /// \param [in] beta Scaling factor for matrix C. + /// \param [in, out] c Input/Output matrix C. + /// \param [in] c_type Data type of the matrix C. + /// \param [in] ldc Leading dimension of C. + /// \param [in] batch_size Specifies the number of matrix multiply operations to perform. + /// \param [in] scaling_type Data type of the scaling factors. + inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans, + oneapi::mkl::transpose b_trans, int m, int n, int k, + const void *alpha, const void *a[], + library_data_t a_type, int lda, const void *b[], + library_data_t b_type, int ldb, const void *beta, + void *c[], library_data_t c_type, int ldc, + int batch_size, library_data_t scaling_type) + { + if (scaling_type == library_data_t::real_float && + c_type == library_data_t::complex_float) + { + scaling_type = library_data_t::complex_float; + } + else if (scaling_type == library_data_t::real_double && + c_type == library_data_t::complex_double) + { + scaling_type = library_data_t::complex_double; + } + + std::uint64_t key = + detail::get_type_combination_id(a_type, b_type, c_type, scaling_type); + switch (key) + { + case detail::get_type_combination_id( + library_data_t::real_float, library_data_t::real_float, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_batch_impl<float, float, float, float>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_double, library_data_t::real_double, + library_data_t::real_double, library_data_t::real_double): + { + detail::gemm_batch_impl<double, double, double, double>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::complex_float, library_data_t::complex_float, + library_data_t::complex_float, library_data_t::complex_float): + { + detail::gemm_batch_impl<std::complex<float>, std::complex<float>, + std::complex<float>, std::complex<float>>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::complex_double, library_data_t::complex_double, + library_data_t::complex_double, library_data_t::complex_double): + { + detail::gemm_batch_impl<std::complex<double>, std::complex<double>, + std::complex<double>, std::complex<double>>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_half, library_data_t::real_half): + { + detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, + sycl::half>(q, a_trans, b_trans, m, n, k, alpha, + a, lda, b, ldb, beta, c, ldc, + batch_size); + break; + } +#ifdef __INTEL_MKL__ + case detail::get_type_combination_id( + library_data_t::real_bfloat16, library_data_t::real_bfloat16, + library_data_t::real_bfloat16, library_data_t::real_float): + { + detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, + oneapi::mkl::bfloat16, float>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_bfloat16, library_data_t::real_bfloat16, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, + float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, batch_size); + break; + } +#endif + case detail::get_type_combination_id( + library_data_t::real_int8, library_data_t::real_int8, + library_data_t::real_int32, library_data_t::real_int32): + { + float alpha_float = + dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q); + float beta_float = + dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q); + detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t, + float>(q, a_trans, b_trans, m, n, k, &alpha_float, + a, lda, b, ldb, &beta_float, c, ldc, + batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_int8, library_data_t::real_int8, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_batch_impl<sycl::half, sycl::half, float, float>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_half, library_data_t::real_float): + { + float alpha_value = + dpct::get_value(reinterpret_cast<const float *>(alpha), q); + float beta_value = + dpct::get_value(reinterpret_cast<const float *>(beta), q); + sycl::half alpha_half(alpha_value); + sycl::half beta_half(beta_value); + detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>( + q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc, + batch_size); + break; + } + default: + throw std::runtime_error("the combination of data type is unsupported"); + } + } + + /// Computes a batch of matrix-matrix product with general matrices. + /// \param [in] q The queue where the routine should be executed. + /// \param [in] a_trans Specifies the operation applied to A. + /// \param [in] b_trans Specifies the operation applied to B. + /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C. + /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C. + /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B). + /// \param [in] alpha Scaling factor for the matrix-matrix product. + /// \param [in] a Input matrix A. + /// \param [in] a_type Data type of the matrix A. + /// \param [in] lda Leading dimension of A. + /// \param [in] stride_a Stride between the different A matrices. + /// \param [in] b Input matrix B. + /// \param [in] b_type Data type of the matrix B. + /// \param [in] ldb Leading dimension of B. + /// \param [in] stride_b Stride between the different B matrices. + /// \param [in] beta Scaling factor for matrix C. + /// \param [in, out] c Input/Output matrix C. + /// \param [in] c_type Data type of the matrix C. + /// \param [in] ldc Leading dimension of C. + /// \param [in] stride_c Stride between the different C matrices. + /// \param [in] batch_size Specifies the number of matrix multiply operations to perform. + /// \param [in] scaling_type Data type of the scaling factors. + inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans, + oneapi::mkl::transpose b_trans, int m, int n, int k, + const void *alpha, const void *a, library_data_t a_type, + int lda, long long int stride_a, const void *b, + library_data_t b_type, int ldb, long long int stride_b, + const void *beta, void *c, library_data_t c_type, + int ldc, long long int stride_c, int batch_size, + library_data_t scaling_type) + { + if (scaling_type == library_data_t::real_float && + c_type == library_data_t::complex_float) + { + scaling_type = library_data_t::complex_float; + } + else if (scaling_type == library_data_t::real_double && + c_type == library_data_t::complex_double) + { + scaling_type = library_data_t::complex_double; + } + + std::uint64_t key = + detail::get_type_combination_id(a_type, b_type, c_type, scaling_type); + switch (key) + { + case detail::get_type_combination_id( + library_data_t::real_float, library_data_t::real_float, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_batch_impl<float, float, float, float>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_double, library_data_t::real_double, + library_data_t::real_double, library_data_t::real_double): + { + detail::gemm_batch_impl<double, double, double, double>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::complex_float, library_data_t::complex_float, + library_data_t::complex_float, library_data_t::complex_float): + { + detail::gemm_batch_impl<std::complex<float>, std::complex<float>, + std::complex<float>, std::complex<float>>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::complex_double, library_data_t::complex_double, + library_data_t::complex_double, library_data_t::complex_double): + { + detail::gemm_batch_impl<std::complex<double>, std::complex<double>, + std::complex<double>, std::complex<double>>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_half, library_data_t::real_half): + { + detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, + sycl::half>(q, a_trans, b_trans, m, n, k, alpha, + a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } +#ifdef __INTEL_MKL__ + case detail::get_type_combination_id( + library_data_t::real_bfloat16, library_data_t::real_bfloat16, + library_data_t::real_bfloat16, library_data_t::real_float): + { + detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, + oneapi::mkl::bfloat16, float>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_bfloat16, library_data_t::real_bfloat16, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, + float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, + stride_a, b, ldb, stride_b, beta, c, ldc, + stride_c, batch_size); + break; + } +#endif + case detail::get_type_combination_id( + library_data_t::real_int8, library_data_t::real_int8, + library_data_t::real_int32, library_data_t::real_int32): + { + detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t, + std::int32_t>(q, a_trans, b_trans, m, n, k, alpha, + a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_int8, library_data_t::real_int8, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_batch_impl<sycl::half, sycl::half, float, float>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_half, library_data_t::real_float): + { + float alpha_value = + dpct::get_value(reinterpret_cast<const float *>(alpha), q); + float beta_value = + dpct::get_value(reinterpret_cast<const float *>(beta), q); + sycl::half alpha_half(alpha_value); + sycl::half beta_half(beta_value); + detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>( + q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, stride_a, b, ldb, stride_b, + &beta_half, c, ldc, stride_c, batch_size); + break; + } + default: + throw std::runtime_error("the combination of data type is unsupported"); + } + } + + static inline void + async_dpct_memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr, + size_t from_pitch, size_t x, size_t y, + memcpy_direction direction = automatic, + sycl::queue &q = get_default_queue()) + { + detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y, + direction); + } + + using err0 = detail::generic_error_type<struct err0_tag, int>; + using err1 = detail::generic_error_type<struct err1_tag, int>; + + static inline void dpct_free(void *ptr, sycl::queue &q = get_default_queue()) { + detail::dpct_free(ptr, q); + } + + /// dpct accessor used as device function parameter. + template <class T, memory_region Memory, size_t Dimension> class accessor; + template <class T, memory_region Memory> class accessor<T, Memory, 3> { + public: + using memory_t = detail::memory_traits<Memory, T>; + using element_t = typename memory_t::element_t; + using pointer_t = typename memory_t::pointer_t; + using accessor_t = typename memory_t::template accessor_t<3>; + accessor(pointer_t data, const sycl::range<3> &in_range) + : _data(data), _range(in_range) {} + template <memory_region M = Memory> + accessor(typename std::enable_if<M != local, const accessor_t>::type &acc) + : accessor(acc, acc.get_range()) {} + accessor(const accessor_t &acc, const sycl::range<3> &in_range) + : accessor(acc.get_pointer(), in_range) {} + accessor<T, Memory, 2> operator[](size_t index) const { + sycl::range<2> sub(_range.get(1), _range.get(2)); + return accessor<T, Memory, 2>(_data + index * sub.size(), sub); + } + + pointer_t get_ptr() const { return _data; } + + private: + pointer_t _data; + sycl::range<3> _range; + }; + template <class T, memory_region Memory> class accessor<T, Memory, 2> { + public: + using memory_t = detail::memory_traits<Memory, T>; + using element_t = typename memory_t::element_t; + using pointer_t = typename memory_t::pointer_t; + using accessor_t = typename memory_t::template accessor_t<2>; + accessor(pointer_t data, const sycl::range<2> &in_range) + : _data(data), _range(in_range) {} + template <memory_region M = Memory> + accessor(typename std::enable_if<M != local, const accessor_t>::type &acc) + : accessor(acc, acc.get_range()) {} + accessor(const accessor_t &acc, const sycl::range<2> &in_range) + : accessor(acc.get_pointer(), in_range) {} + + pointer_t operator[](size_t index) const { + return _data + _range.get(1) * index; + } + + pointer_t get_ptr() const { return _data; } + + private: + pointer_t _data; + sycl::range<2> _range; + }; + + namespace detail { + /// Device variable with address space of shared, global or constant. + template <class T, memory_region Memory, size_t Dimension> class device_memory { + public: + using accessor_t = + typename detail::memory_traits<Memory, + T>::template accessor_t<Dimension>; + using value_t = typename detail::memory_traits<Memory, T>::value_t; + using dpct_accessor_t = dpct::accessor<T, Memory, Dimension>; + + device_memory() : device_memory(sycl::range<Dimension>(1)) {} + + /// Constructor of 1-D array with initializer list + device_memory(const sycl::range<Dimension> &in_range, + std::initializer_list<value_t> &&init_list) + : device_memory(in_range) { + assert(init_list.size() <= in_range.size()); + _host_ptr = (value_t *)std::malloc(_size); + std::memset(_host_ptr, 0, _size); + std::memcpy(_host_ptr, init_list.begin(), init_list.size() * sizeof(T)); + } + + /// Constructor of 2-D array with initializer list + template <size_t D = Dimension> + device_memory( + const typename std::enable_if<D == 2, sycl::range<2>>::type &in_range, + std::initializer_list<std::initializer_list<value_t>> &&init_list) + : device_memory(in_range) { + assert(init_list.size() <= in_range[0]); + _host_ptr = (value_t *)std::malloc(_size); + std::memset(_host_ptr, 0, _size); + auto tmp_data = _host_ptr; + for (auto sub_list : init_list) { + assert(sub_list.size() <= in_range[1]); + std::memcpy(tmp_data, sub_list.begin(), + sub_list.size() * sizeof(T)); + tmp_data += in_range[1]; + } + } + + /// Constructor with range + device_memory(const sycl::range<Dimension> &range_in) + : _size(range_in.size() * sizeof(T)), _range(range_in), + _reference(false), _host_ptr(nullptr), _device_ptr(nullptr) { + static_assert( + (Memory == global) || (Memory == constant) || (Memory == shared), + "device memory region should be global, constant or shared"); + // Make sure that singleton class mem_mgr and dev_mgr will destruct + // later than this. + detail::mem_mgr::instance(); + dev_mgr::instance(); + } + + /// Constructor with range + template <class... Args> + device_memory(Args... Arguments) + : device_memory(sycl::range<Dimension>(Arguments...)) {} + + ~device_memory() { + if (_device_ptr && !_reference) + dpct::dpct_free(_device_ptr); + if (_host_ptr) + std::free(_host_ptr); + } + + /// Allocate memory with default queue, and init memory if has initial + /// value. + void init() { init(dpct::get_default_queue()); } + /// Allocate memory with specified queue, and init memory if has initial + /// value. + void init(sycl::queue &q) { + if (_device_ptr) + return; + if (!_size) + return; + allocate_device(q); + if (_host_ptr) + detail::dpct_memcpy(q, _device_ptr, _host_ptr, _size, + host_to_device); + } + + /// The variable is assigned to a device pointer. + void assign(value_t *src, size_t size) { + this->~device_memory(); + new (this) device_memory(src, size); + } + + /// Get memory pointer of the memory object, which is virtual pointer when + /// usm is not used, and device pointer when usm is used. + value_t *get_ptr() { return get_ptr(get_default_queue()); } + /// Get memory pointer of the memory object, which is virtual pointer when + /// usm is not used, and device pointer when usm is used. + value_t *get_ptr(sycl::queue &q) { + init(q); + return _device_ptr; + } + + /// Get the device memory object size in bytes. + size_t get_size() { return _size; } + + template <size_t D = Dimension> + typename std::enable_if<D == 1, T>::type &operator[](size_t index) { + init(); + return _device_ptr[index]; + } + + /// Get dpct::accessor with dimension info for the device memory object + /// when usm is used and dimension is greater than 1. + template <size_t D = Dimension> + typename std::enable_if<D != 1, dpct_accessor_t>::type + get_access([[maybe_unused]] sycl::handler &cgh) { + return dpct_accessor_t((T *)_device_ptr, _range); + } + + private: + device_memory(value_t *memory_ptr, size_t size) + : _size(size), _range(size / sizeof(T)), _reference(true), + _device_ptr(memory_ptr) {} + + void allocate_device(sycl::queue &q) { + #ifndef DPCT_USM_LEVEL_NONE + if (Memory == shared) { + _device_ptr = (value_t *)sycl::malloc_shared(_size, q.get_device(), + q.get_context()); + return; + } + #ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY + if (Memory == constant) { + _device_ptr = (value_t *)sycl::malloc_device( + _size, q.get_device(), q.get_context(), + sycl::ext::oneapi::property::usm::device_read_only()); + return; + } + #endif + #endif + _device_ptr = (value_t *)detail::dpct_malloc(_size, q); + } + + size_t _size; + sycl::range<Dimension> _range; + bool _reference; + value_t *_host_ptr; + value_t *_device_ptr; + }; + template <class T, memory_region Memory> + class device_memory<T, Memory, 0> : public device_memory<T, Memory, 1> { + public: + using base = device_memory<T, Memory, 1>; + using value_t = typename base::value_t; + using accessor_t = + typename detail::memory_traits<Memory, T>::template accessor_t<0>; + + /// Constructor with initial value. + device_memory(const value_t &val) : base(sycl::range<1>(1), {val}) {} + + /// Default constructor + device_memory() : base(1) {} + }; + } // namespace detail + + template <class T, size_t Dimension> + using global_memory = detail::device_memory<T, global, Dimension>; + template <class T, size_t Dimension> + using constant_memory = detail::device_memory<T, constant, Dimension>; + template <class T, size_t Dimension> + using shared_memory = detail::device_memory<T, shared, Dimension>; + + + template <typename T, + sycl::access::address_space addressSpace = + sycl::access::address_space::global_space, + sycl::memory_order memoryOrder = sycl::memory_order::relaxed, + sycl::memory_scope memoryScope = sycl::memory_scope::device> + inline T atomic_fetch_add(T *addr, T operand) { + auto atm = + sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]); + return atm.fetch_add(operand); + } + + template <sycl::access::address_space addressSpace = + sycl::access::address_space::global_space, + sycl::memory_order memoryOrder = sycl::memory_order::relaxed, + sycl::memory_scope memoryScope = sycl::memory_scope::device, + typename T1, typename T2> + inline T1 atomic_fetch_add(T1 *addr, T2 operand) { + auto atm = + sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]); + return atm.fetch_add(operand); + } + + template <typename T, sycl::access::address_space addressSpace = + sycl::access::address_space::global_space> + inline T atomic_fetch_add(T *addr, T operand, + sycl::memory_order memoryOrder) { + switch (memoryOrder) { + case sycl::memory_order::relaxed: + return atomic_fetch_add<T, addressSpace, sycl::memory_order::relaxed, + sycl::memory_scope::device>(addr, operand); + case sycl::memory_order::acq_rel: + return atomic_fetch_add<T, addressSpace, sycl::memory_order::acq_rel, + sycl::memory_scope::device>(addr, operand); + case sycl::memory_order::seq_cst: + return atomic_fetch_add<T, addressSpace, sycl::memory_order::seq_cst, + sycl::memory_scope::device>(addr, operand); + default: + assert(false && "Invalid memory_order for atomics. Valid memory_order for " + "atomics are: sycl::memory_order::relaxed, " + "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!"); + } + } + + template <sycl::access::address_space addressSpace = + sycl::access::address_space::global_space, + typename T1, typename T2> + inline T1 atomic_fetch_add(T1 *addr, T2 operand, + sycl::memory_order memoryOrder) { + atomic_fetch_add<T1, addressSpace>(addr, operand, memoryOrder); + } + +} // COPY from DPCT head files + +#endif // GGML_SYCL_DPCT_HELPER_HPP diff --git a/ggml/src/ggml-sycl/mmq.cpp b/ggml/src/ggml-sycl/mmq.cpp new file mode 100644 index 00000000..3107ba91 --- /dev/null +++ b/ggml/src/ggml-sycl/mmq.cpp @@ -0,0 +1,3031 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#include "mmq.hpp" +#include "vecdotq.hpp" + +typedef void (*allocate_tiles_sycl_t)( + int** x_ql, + sycl::half2** x_dm, + int** x_qh, + int** x_sc); +typedef void (*load_tiles_sycl_t)( + const void* __restrict__ vx, + int* __restrict__ x_ql, + sycl::half2* __restrict__ x_dm, + int* __restrict__ x_qh, + int* __restrict__ x_sc, + const int& i_offset, + const int& i_max, + const int& k, + const int& blocks_per_row); +typedef float (*vec_dot_q_mul_mat_sycl_t)( + const int* __restrict__ x_ql, + const sycl::half2* __restrict__ x_dm, + const int* __restrict__ x_qh, + const int* __restrict__ x_sc, + const int* __restrict__ y_qs, + const sycl::half2* __restrict__ y_ms, + const int& i, + const int& j, + const int& k); + + +template <int mmq_y> +static __dpct_inline__ void +allocate_tiles_q4_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_qs_q4_0, float *tile_x_d_q4_0) { + (void)x_qh; (void)x_sc; + + *x_ql = tile_x_qs_q4_0; + *x_dm = (sycl::half2 *)tile_x_d_q4_0; +} + +template <int mmq_y, int nwarps, bool need_check> +static __dpct_inline__ void +load_tiles_q4_0(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; (void)x_sc; + GGML_SYCL_ASSUME(i_offset >= 0); + GGML_SYCL_ASSUME(i_offset < nwarps); + GGML_SYCL_ASSUME(k >= 0); + GGML_SYCL_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI4_0; + const int kqsx = k % QI4_0; + + const block_q4_0 * bx0 = (const block_q4_0 *) vx; + + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) { + int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d; + } +} + +static __dpct_inline__ float vec_dot_q4_0_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; (void)x_sc; + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const float * x_dmf = (const float *) x_dm; + + int u[2*VDR_Q4_0_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; + } + + return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ> + (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0], + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +template <int mmq_y> +static __dpct_inline__ void +allocate_tiles_q4_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_qs_q4_1, sycl::half2 *tile_x_dm_q4_1) { + (void)x_qh; (void)x_sc; + + *x_ql = tile_x_qs_q4_1; + *x_dm = tile_x_dm_q4_1; +} + + +template <int mmq_y, int nwarps, bool need_check> +static __dpct_inline__ void +load_tiles_q4_1(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; (void)x_sc; + + GGML_SYCL_ASSUME(i_offset >= 0); + GGML_SYCL_ASSUME(i_offset < nwarps); + GGML_SYCL_ASSUME(k >= 0); + GGML_SYCL_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI4_1; + const int kqsx = k % QI4_1; + + const block_q4_1 * bx0 = (const block_q4_1 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_1; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) { + int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm; + } +} + +static __dpct_inline__ float vec_dot_q4_1_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; (void)x_sc; + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + + int u[2*VDR_Q4_1_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; + } + + return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ> + (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +template <int mmq_y> +static __dpct_inline__ void +allocate_tiles_q5_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql_q5_0, float *tile_x_d_q5_0) { + (void)x_qh; (void)x_sc; + + *x_ql = tile_x_ql_q5_0; + *x_dm = (sycl::half2 *)tile_x_d_q5_0; +} + +template <int mmq_y, int nwarps, bool need_check> +static __dpct_inline__ void +load_tiles_q5_0(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; (void)x_sc; + + GGML_SYCL_ASSUME(i_offset >= 0); + GGML_SYCL_ASSUME(i_offset < nwarps); + GGML_SYCL_ASSUME(k >= 0); + GGML_SYCL_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_0; + const int kqsx = k % QI5_0; + + const block_q5_0 * bx0 = (const block_q5_0 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx; + + const int ql = get_int_from_uint8(bxi->qs, kqsx); + const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0)); + + int qs0 = (ql >> 0) & 0x0F0F0F0F; + qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 + qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 + qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 + qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 + qs0 = dpct::vectorized_binary<sycl::char4>( + qs0, 0x10101010, dpct::sub_sat()); // subtract 16 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + + int qs1 = (ql >> 4) & 0x0F0F0F0F; + qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 + qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 + qs1 = dpct::vectorized_binary<sycl::char4>( + qs1, 0x10101010, dpct::sub_sat()); // subtract 16 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_0; + const int kbxd = k % blocks_per_tile_x_row; + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) { + int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d; + } +} + +static __dpct_inline__ float vec_dot_q5_0_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; (void)x_sc; + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + int u[2*VDR_Q5_0_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; + } + + return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ> + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +template <int mmq_y> +static __dpct_inline__ void +allocate_tiles_q5_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql_q5_1, sycl::half2 *tile_x_dm_q5_1) { + (void)x_qh; (void)x_sc; + + *x_ql = tile_x_ql_q5_1; + *x_dm = tile_x_dm_q5_1; +} + +template <int mmq_y, int nwarps, bool need_check> +static __dpct_inline__ void +load_tiles_q5_1(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; (void)x_sc; + + GGML_SYCL_ASSUME(i_offset >= 0); + GGML_SYCL_ASSUME(i_offset < nwarps); + GGML_SYCL_ASSUME(k >= 0); + GGML_SYCL_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_1; + const int kqsx = k % QI5_1; + + const block_q5_1 * bx0 = (const block_q5_1 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx; + + const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1)); + + int qs0 = (ql >> 0) & 0x0F0F0F0F; + qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 + qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 + qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 + qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + + int qs1 = (ql >> 4) & 0x0F0F0F0F; + qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 + qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_1; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) { + int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm; + } +} + +static __dpct_inline__ float vec_dot_q5_1_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; (void)x_sc; + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; + + int u[2*VDR_Q5_1_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE]; + } + + return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ> + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +template <int mmq_y> +static __dpct_inline__ void +allocate_tiles_q8_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_qs_q8_0, float *tile_x_d_q8_0) { + (void)x_qh; (void)x_sc; + + *x_ql = tile_x_qs_q8_0; + *x_dm = (sycl::half2 *)tile_x_d_q8_0; +} + +template <int mmq_y, int nwarps, bool need_check> +static __dpct_inline__ void +load_tiles_q8_0(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; (void)x_sc; + + GGML_SYCL_ASSUME(i_offset >= 0); + GGML_SYCL_ASSUME(i_offset < nwarps); + GGML_SYCL_ASSUME(k >= 0); + GGML_SYCL_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI8_0; + const int kqsx = k % QI8_0; + float * x_dmf = (float *) x_dm; + + const block_q8_0 * bx0 = (const block_q8_0 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) { + int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d; + } +} + +static __dpct_inline__ float vec_dot_q8_0_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; (void)x_sc; + + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ> + (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0], + y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]); +} + +template <int mmq_y> +static __dpct_inline__ void +allocate_tiles_q2_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql_q2_K, sycl::half2 *tile_x_dm_q2_K, + int *tile_x_sc_q2_K) { + (void)x_qh; + + *x_ql = tile_x_ql_q2_K; + *x_dm = tile_x_dm_q2_K; + *x_sc = tile_x_sc_q2_K; +} + +template <int mmq_y, int nwarps, bool need_check> +static __dpct_inline__ void +load_tiles_q2_K(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; + + GGML_SYCL_ASSUME(i_offset >= 0); + GGML_SYCL_ASSUME(i_offset < nwarps); + GGML_SYCL_ASSUME(k >= 0); + GGML_SYCL_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI2_K; + const int kqsx = k % QI2_K; + + const block_q2_K * bx0 = (const block_q2_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI2_K; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) { + int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { + int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4); + + x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4)); + } +} + +#define VDR_Q2_K_Q8_1_MMQ 2 +// contiguous u/y values +static __dpct_inline__ float +vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, + const uint8_t *__restrict__ scales, + const sycl::half2 &dm2, const float &d8) { + + int sumi_d = 0; + int sumi_m = 0; + +#pragma unroll + for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) { + int sumi_d_sc = 0; + + const int sc = scales[i0 / (QI8_1/2)]; + + // fill int with 4x m + int m = sc >> 4; + m |= m << 8; + m |= m << 16; + +#pragma unroll + for (int i = i0; i < i0 + QI8_1/2; ++i) { + sumi_d_sc = dpct::dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product + sumi_m = dpct::dp4a(m, u[i], + sumi_m); // multiply sum of q8_1 values with m + } + + sumi_d += sumi_d_sc * (sc & 0xF); + } + + const sycl::float2 dm2f = + dm2.convert<float, sycl::rounding_mode::automatic>(); + + return d8 * (dm2f.x() * sumi_d - dm2f.y() * sumi_m); +} + +static __dpct_inline__ float vec_dot_q2_K_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; + + const int kbx = k / QI2_K; + const int ky = (k % QI2_K) * QR2_K; + const float * y_df = (const float *) y_ds; + + int v[QR2_K*VDR_Q2_K_Q8_1_MMQ]; + + const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); + const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2)); + +#pragma unroll + for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) { + v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303; + } + + const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; + + const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE; + return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); +} + +template <int mmq_y> +static __dpct_inline__ void +allocate_tiles_q3_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql_q3_K, sycl::half2 *tile_x_dm_q3_K, + int *tile_x_qh_q3_K, int *tile_x_sc_q3_K) { + + *x_ql = tile_x_ql_q3_K; + *x_dm = tile_x_dm_q3_K; + *x_qh = tile_x_qh_q3_K; + *x_sc = tile_x_sc_q3_K; +} + +template <int mmq_y, int nwarps, bool need_check> +static __dpct_inline__ void +load_tiles_q3_K(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + + GGML_SYCL_ASSUME(i_offset >= 0); + GGML_SYCL_ASSUME(i_offset < nwarps); + GGML_SYCL_ASSUME(k >= 0); + GGML_SYCL_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI3_K; + const int kqsx = k % QI3_K; + + const block_q3_K * bx0 = (const block_q3_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI3_K; + const int kbxd = k % blocks_per_tile_x_row; + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) { + int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) { + int i = i0 + i_offset * 2 + k / (WARP_SIZE/2); + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2); + + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2)); + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { + int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4); + + const int ksc = k % (QI3_K/4); + + const int ksc_low = ksc % (QI3_K/8); + const int shift_low = 4 * (ksc / (QI3_K/8)); + const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F; + + const int ksc_high = QI3_K/8; + const int shift_high = 2 * ksc; + const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030; + + const int sc = dpct::vectorized_binary<sycl::char4>( + sc_low | sc_high, 0x20202020, dpct::sub_sat()); + + x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc; + } +} + +#define VDR_Q3_K_Q8_1_MMQ 2 +// contiguous u/y values +static __dpct_inline__ float +vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, + const int8_t *__restrict__ scales, const float &d3, + const float &d8) { + + int sumi = 0; + +#pragma unroll + for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) { + int sumi_sc = 0; + + for (int i = i0; i < i0 + QI8_1/2; ++i) { + sumi_sc = dpct::dp4a(v[i], u[i], sumi_sc); // SIMD dot product + } + + sumi += sumi_sc * scales[i0 / (QI8_1/2)]; + } + + return d3*d8 * sumi; +} + +static __dpct_inline__ float vec_dot_q3_K_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + + const int kbx = k / QI3_K; + const int ky = (k % QI3_K) * QR3_K; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; + + int v[QR3_K*VDR_Q3_K_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) { + const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); + const int shift = 2 * ((ky % 32) / 8); + const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303; + + const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); + const int vlh = (vh << 2) & 0x04040404; + + v[l] = dpct::vectorized_binary<sycl::char4>(vll, vlh, dpct::sub_sat()); + } + + const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE; + return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); +} + +template <int mmq_y> +static __dpct_inline__ void +allocate_tiles_q4_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql_q4_K, sycl::half2 *tile_x_dm_q4_K, + int *tile_x_sc_q4_K) { + (void)x_qh; + + *x_ql = tile_x_ql_q4_K; + *x_dm = tile_x_dm_q4_K; + *x_sc = tile_x_sc_q4_K; +} + +template <int mmq_y, int nwarps, bool need_check> +static __dpct_inline__ void +load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; + + GGML_SYCL_ASSUME(i_offset >= 0); + GGML_SYCL_ASSUME(i_offset < nwarps); + GGML_SYCL_ASSUME(k >= 0); + GGML_SYCL_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI4_K; // == 0 if QK_K == 256 + const int kqsx = k % QI4_K; // == k if QK_K == 256 + + const block_q4_K * bx0 = (const block_q4_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) { + int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; + +#if QK_K == 256 + x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; +#else + x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]}; +#endif + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8); + + const int * scales = (const int *) bxi->scales; + + const int ksc = k % (WARP_SIZE/8); + + // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 + int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits + scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits + + x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + } +} + + +#define VDR_Q4_K_Q8_1_MMQ 8 + +// contiguous u/y values +static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq( + const int *__restrict__ v, const int *__restrict__ u, + const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, + const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) { + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = dpct::dp4a((v[j] >> (4 * i)) & 0x0F0F0F0F, + u[i * QI8_1 + j], sumi_d); // SIMD dot product + } + + const sycl::float2 ds8f = + ds8[i].convert<float, sycl::rounding_mode::automatic>(); + + sumf_d += ds8f.x() * (sc[i] * sumi_d); + sumf_m += ds8f.y() * m[i]; // sum of q8_1 block * q4_K min val + } + + const sycl::float2 dm4f = + dm4.convert<float, sycl::rounding_mode::automatic>(); + + return dm4f.x() * sumf_d - dm4f.y() * sumf_m; +} + + +static __dpct_inline__ float vec_dot_q4_K_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; + + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); + + const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE; + return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); +} + +template <int mmq_y> +static __dpct_inline__ void +allocate_tiles_q5_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql_q5_K, sycl::half2 *tile_x_dm_q5_K, + int *tile_x_sc_q5_K) { + (void)x_qh; + + *x_ql = tile_x_ql_q5_K; + *x_dm = tile_x_dm_q5_K; + *x_sc = tile_x_sc_q5_K; +} + +template <int mmq_y, int nwarps, bool need_check> +static __dpct_inline__ void +load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; + + GGML_SYCL_ASSUME(i_offset >= 0); + GGML_SYCL_ASSUME(i_offset < nwarps); + GGML_SYCL_ASSUME(k >= 0); + GGML_SYCL_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_K; // == 0 if QK_K == 256 + const int kqsx = k % QI5_K; // == k if QK_K == 256 + + const block_q5_K * bx0 = (const block_q5_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx; + const int ky = QR5_K*kqsx; + + const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int ql0 = (ql >> 0) & 0x0F0F0F0F; + const int ql1 = (ql >> 4) & 0x0F0F0F0F; + + const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4)); + const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010; + const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010; + + const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0; + const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4); + + x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0; + x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) { + int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; + +#if QK_K == 256 + x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; +#endif + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8); + + const int * scales = (const int *) bxi->scales; + + const int ksc = k % (WARP_SIZE/8); + + // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 + int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits + scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits + + x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + } +} + +#define VDR_Q5_K_Q8_1_MMQ 8 + +// contiguous u/y values +static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq( + const int *__restrict__ v, const int *__restrict__ u, + const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, + const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) { + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = dpct::dp4a(v[i * QI8_1 + j], u[i * QI8_1 + j], + sumi_d); // SIMD dot product + } + + const sycl::float2 ds8f = + ds8[i].convert<float, sycl::rounding_mode::automatic>(); + + sumf_d += ds8f.x() * (sc[i] * sumi_d); + sumf_m += ds8f.y() * m[i]; // sum of q8_1 block * q4_K min val + } + + const sycl::float2 dm4f = + dm4.convert<float, sycl::rounding_mode::automatic>(); + + return dm4f.x() * sumf_d - dm4f.y() * sumf_m; +} + +static __dpct_inline__ float vec_dot_q5_K_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; + + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); + + const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; + const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE; + return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); +} + +template <int mmq_y> +static __dpct_inline__ void +allocate_tiles_q6_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) { + (void)x_qh; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template <int mmq_y, int nwarps, bool need_check> +static __dpct_inline__ void +load_tiles_q6_K(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; + + GGML_SYCL_ASSUME(i_offset >= 0); + GGML_SYCL_ASSUME(i_offset < nwarps); + GGML_SYCL_ASSUME(k >= 0); + GGML_SYCL_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI6_K; // == 0 if QK_K == 256 + const int kqsx = k % QI6_K; // == k if QK_K == 256 + + const block_q6_K * bx0 = (const block_q6_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx; + const int ky = QR6_K*kqsx; + + const int ql = get_int_from_uint8(bxi->ql, kqsx); + const int ql0 = (ql >> 0) & 0x0F0F0F0F; + const int ql1 = (ql >> 4) & 0x0F0F0F0F; + + const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)); + const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030; + const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030; + + const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0; + const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2); + + x_ql[i * (2 * WARP_SIZE + 1) + kq0] = + dpct::vectorized_binary<sycl::char4>(ql0 | qh0, 0x20202020, + dpct::sub_sat()); + x_ql[i * (2 * WARP_SIZE + 1) + kq1] = + dpct::vectorized_binary<sycl::char4>(ql1 | qh1, 0x20202020, + dpct::sub_sat()); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) { + int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4; + + x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8)); + } +} + +#define VDR_Q6_K_Q8_1_MMQ 8 + +// contiguous u/y values +static __dpct_inline__ float +vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, + const int8_t *__restrict__ sc, const float &d6, + const float *__restrict__ d8) { + + float sumf_d = 0.0f; + +#pragma unroll + for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) { + sycl::int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale + +#pragma unroll + for (int i = i0; i < i0 + 2; ++i) { + sumi_d.x() = dpct::dp4a(v[2 * i + 0], u[2 * i + 0], + sumi_d.x()); // SIMD dot product + sumi_d.x() = dpct::dp4a(v[2 * i + 1], u[2 * i + 1], + sumi_d.x()); // SIMD dot product + + sumi_d.y() = dpct::dp4a(v[2 * i + 4], u[2 * i + 4], + sumi_d.y()); // SIMD dot product + sumi_d.y() = dpct::dp4a(v[2 * i + 5], u[2 * i + 5], + sumi_d.y()); // SIMD dot product + } + + sumf_d += d8[i0 / 4] * + (sc[i0 / 2 + 0] * sumi_d.x() + sc[i0 / 2 + 1] * sumi_d.y()); + } + + return d6 * sumf_d; +} + +static __dpct_inline__ float vec_dot_q6_K_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; + + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]); + + const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k; + const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE; + return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); +} + +template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, + int mmq_y, int nwarps, load_tiles_sycl_t load_tiles, int vdr, + vec_dot_q_mul_mat_sycl_t vec_dot> +/* +DPCT1110:8: The total declared local variable size in device function mul_mat_q +exceeds 128 bytes and may cause high register pressure. Consult with your +hardware vendor to find the total register size available and adjust the code, +or use smaller sub-group size to avoid high register pressure. +*/ +static __dpct_inline__ void +mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy, + float *__restrict__ dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, + int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_qh, + int *tile_x_sc, const sycl::nd_item<3> &item_ct1, int *tile_y_qs, + sycl::half2 *tile_y_ds) { + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + const int blocks_per_row_x = ncols_x / qk; + const int blocks_per_col_y = nrows_y / QK8_1; + const int blocks_per_warp = WARP_SIZE / qi; + + const int & ncols_dst = ncols_y; + + const int row_dst_0 = item_ct1.get_group(2) * mmq_y; + const int & row_x_0 = row_dst_0; + + const int col_dst_0 = item_ct1.get_group(1) * mmq_x; + const int & col_y_0 = col_dst_0; + + float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}}; + + for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { + + load_tiles(x + row_x_0 * blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, + tile_x_qh, tile_x_sc, item_ct1.get_local_id(1), + nrows_x - row_x_0 - 1, item_ct1.get_local_id(2), + blocks_per_row_x); + +#pragma unroll + for (int ir = 0; ir < qr; ++ir) { + const int kqs = ir * WARP_SIZE + item_ct1.get_local_id(2); + const int kbxd = kqs / QI8_1; + +#pragma unroll + for (int i = 0; i < mmq_x; i += nwarps) { + const int col_y_eff = dpct::min( + (unsigned int)(col_y_0 + item_ct1.get_local_id(1) + i), + ncols_y - 1); // to prevent out-of-bounds memory accesses + + const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd]; + + const int index_y = (item_ct1.get_local_id(1) + i) * WARP_SIZE + + kqs % WARP_SIZE; + tile_y_qs[index_y] = get_int_from_int8_aligned( + by0->qs, item_ct1.get_local_id(2) % QI8_1); + } + +#pragma unroll + for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { + const int ids = + (ids0 + item_ct1.get_local_id(1) * QI8_1 + + item_ct1.get_local_id(2) / (WARP_SIZE / QI8_1)) % + mmq_x; + const int kby = item_ct1.get_local_id(2) % (WARP_SIZE / QI8_1); + const int col_y_eff = sycl::min(col_y_0 + ids, ncols_y - 1); + + // if the sum is not needed it's faster to transform the scale to f32 ahead of time + const sycl::half2 *dsi_src = + &y[col_y_eff * blocks_per_col_y + ib0 * (qk / QK8_1) + + ir * (WARP_SIZE / QI8_1) + kby] + .ds; + sycl::half2 *dsi_dst = + &tile_y_ds[ids * (WARP_SIZE / QI8_1) + kby]; + if (need_sum) { + *dsi_dst = *dsi_src; + } else { + float * dfi_dst = (float *) dsi_dst; + *dfi_dst = (*dsi_src)[0]; + } + } + + /* + DPCT1118:9: SYCL group functions and algorithms must be encountered + in converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:56: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + +// #pragma unroll // unrolling this loop causes too much register pressure + for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) { +#pragma unroll + for (int j = 0; j < mmq_x; j += nwarps) { +#pragma unroll + for (int i = 0; i < mmq_y; i += WARP_SIZE) { + sum[i / WARP_SIZE][j / nwarps] += vec_dot( + tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, + tile_y_qs, tile_y_ds, item_ct1.get_local_id(2) + i, + item_ct1.get_local_id(1) + j, k); + } + } + } + + /* + DPCT1118:10: SYCL group functions and algorithms must be encountered + in converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:57: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + } + } + +#pragma unroll + for (int j = 0; j < mmq_x; j += nwarps) { + const int col_dst = col_dst_0 + j + item_ct1.get_local_id(1); + + if (col_dst >= ncols_dst) { + return; + } + +#pragma unroll + for (int i = 0; i < mmq_y; i += WARP_SIZE) { + const int row_dst = row_dst_0 + item_ct1.get_local_id(2) + i; + + if (row_dst >= nrows_dst) { + continue; + } + + dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps]; + } + } +} + +#define MMQ_X_Q4_0_RDNA2 64 +#define MMQ_Y_Q4_0_RDNA2 128 +#define NWARPS_Q4_0_RDNA2 8 +#define MMQ_X_Q4_0_RDNA1 64 +#define MMQ_Y_Q4_0_RDNA1 64 +#define NWARPS_Q4_0_RDNA1 8 +#if defined(SYCL_USE_XMX) +#define MMQ_X_Q4_0_AMPERE 4 +#define MMQ_Y_Q4_0_AMPERE 32 +#define NWARPS_Q4_0_AMPERE 4 +#else +#define MMQ_X_Q4_0_AMPERE 64 +#define MMQ_Y_Q4_0_AMPERE 128 +#define NWARPS_Q4_0_AMPERE 4 +#endif +#define MMQ_X_Q4_0_PASCAL 64 +#define MMQ_Y_Q4_0_PASCAL 64 +#define NWARPS_Q4_0_PASCAL 8 + +template <bool need_check> static void + mul_mat_q4_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_0, float *tile_x_d_q4_0, + int *tile_y_qs, sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +//sycl_todo: change according to hardware + + const int mmq_x = MMQ_X_Q4_0_AMPERE; + const int mmq_y = MMQ_Y_Q4_0_AMPERE; + const int nwarps = NWARPS_Q4_0_AMPERE; + allocate_tiles_q4_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_qs_q4_0, tile_x_d_q4_0); + mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, + load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, + vec_dot_q4_0_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); +} + +#define MMQ_X_Q4_1_RDNA2 64 +#define MMQ_Y_Q4_1_RDNA2 128 +#define NWARPS_Q4_1_RDNA2 8 +#define MMQ_X_Q4_1_RDNA1 64 +#define MMQ_Y_Q4_1_RDNA1 64 +#define NWARPS_Q4_1_RDNA1 8 +#if defined(SYCL_USE_XMX) +#define MMQ_X_Q4_1_AMPERE 4 +#define MMQ_Y_Q4_1_AMPERE 32 +#define NWARPS_Q4_1_AMPERE 4 +#else +#define MMQ_X_Q4_1_AMPERE 64 +#define MMQ_Y_Q4_1_AMPERE 128 +#define NWARPS_Q4_1_AMPERE 4 +#endif +#define MMQ_X_Q4_1_PASCAL 64 +#define MMQ_Y_Q4_1_PASCAL 64 +#define NWARPS_Q4_1_PASCAL 8 + +template <bool need_check> static void + mul_mat_q4_1( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_1, + sycl::half2 *tile_x_dm_q4_1, int *tile_y_qs, sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +//sycl_todo: change according to hardware + const int mmq_x = MMQ_X_Q4_1_AMPERE; + const int mmq_y = MMQ_Y_Q4_1_AMPERE; + const int nwarps = NWARPS_Q4_1_AMPERE; + allocate_tiles_q4_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_qs_q4_1, tile_x_dm_q4_1); + mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, + load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, + vec_dot_q4_1_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); +} + +#define MMQ_X_Q5_0_RDNA2 64 +#define MMQ_Y_Q5_0_RDNA2 128 +#define NWARPS_Q5_0_RDNA2 8 +#define MMQ_X_Q5_0_RDNA1 64 +#define MMQ_Y_Q5_0_RDNA1 64 +#define NWARPS_Q5_0_RDNA1 8 +#if defined(SYCL_USE_XMX) +#define MMQ_X_Q5_0_AMPERE 4 +#define MMQ_Y_Q5_0_AMPERE 32 +#define NWARPS_Q5_0_AMPERE 4 +#else +#define MMQ_X_Q5_0_AMPERE 128 +#define MMQ_Y_Q5_0_AMPERE 64 +#define NWARPS_Q5_0_AMPERE 4 +#endif +#define MMQ_X_Q5_0_PASCAL 64 +#define MMQ_Y_Q5_0_PASCAL 64 +#define NWARPS_Q5_0_PASCAL 8 + +template <bool need_check> static void + mul_mat_q5_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_0, float *tile_x_d_q5_0, + int *tile_y_qs, sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +//sycl_todo: change according to hardware + const int mmq_x = MMQ_X_Q5_0_AMPERE; + const int mmq_y = MMQ_Y_Q5_0_AMPERE; + const int nwarps = NWARPS_Q5_0_AMPERE; + allocate_tiles_q5_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_ql_q5_0, tile_x_d_q5_0); + mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, + load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, + vec_dot_q5_0_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); +} + +#define MMQ_X_Q5_1_RDNA2 64 +#define MMQ_Y_Q5_1_RDNA2 128 +#define NWARPS_Q5_1_RDNA2 8 +#define MMQ_X_Q5_1_RDNA1 64 +#define MMQ_Y_Q5_1_RDNA1 64 +#define NWARPS_Q5_1_RDNA1 8 +#if defined(SYCL_USE_XMX) +#define MMQ_X_Q5_1_AMPERE 4 +#define MMQ_Y_Q5_1_AMPERE 32 +#define NWARPS_Q5_1_AMPERE 4 +#else +#define MMQ_X_Q5_1_AMPERE 128 +#define MMQ_Y_Q5_1_AMPERE 64 +#define NWARPS_Q5_1_AMPERE 4 +#endif +#define MMQ_X_Q5_1_PASCAL 64 +#define MMQ_Y_Q5_1_PASCAL 64 +#define NWARPS_Q5_1_PASCAL 8 + +template <bool need_check> static void +mul_mat_q5_1( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_1, + sycl::half2 *tile_x_dm_q5_1, int *tile_y_qs, sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +//sycl_todo: change according to hardware + const int mmq_x = MMQ_X_Q5_1_AMPERE; + const int mmq_y = MMQ_Y_Q5_1_AMPERE; + const int nwarps = NWARPS_Q5_1_AMPERE; + allocate_tiles_q5_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_ql_q5_1, tile_x_dm_q5_1); + mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, + load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, + vec_dot_q5_1_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); +} + +#define MMQ_X_Q8_0_RDNA2 64 +#define MMQ_Y_Q8_0_RDNA2 128 +#define NWARPS_Q8_0_RDNA2 8 +#define MMQ_X_Q8_0_RDNA1 64 +#define MMQ_Y_Q8_0_RDNA1 64 +#define NWARPS_Q8_0_RDNA1 8 +#if defined(SYCL_USE_XMX) +#define MMQ_X_Q8_0_AMPERE 4 +#define MMQ_Y_Q8_0_AMPERE 32 +#define NWARPS_Q8_0_AMPERE 4 +#else +#define MMQ_X_Q8_0_AMPERE 128 +#define MMQ_Y_Q8_0_AMPERE 64 +#define NWARPS_Q8_0_AMPERE 4 +#endif +#define MMQ_X_Q8_0_PASCAL 64 +#define MMQ_Y_Q8_0_PASCAL 64 +#define NWARPS_Q8_0_PASCAL 8 + +template <bool need_check> static void + mul_mat_q8_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q8_0, float *tile_x_d_q8_0, + int *tile_y_qs, sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +//sycl_todo: change according to hardware + const int mmq_x = MMQ_X_Q8_0_AMPERE; + const int mmq_y = MMQ_Y_Q8_0_AMPERE; + const int nwarps = NWARPS_Q8_0_AMPERE; + allocate_tiles_q8_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_qs_q8_0, tile_x_d_q8_0); + mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, + load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, + vec_dot_q8_0_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); +} + +#define MMQ_X_Q2_K_RDNA2 64 +#define MMQ_Y_Q2_K_RDNA2 128 +#define NWARPS_Q2_K_RDNA2 8 +#define MMQ_X_Q2_K_RDNA1 128 +#define MMQ_Y_Q2_K_RDNA1 32 +#define NWARPS_Q2_K_RDNA1 8 +#if defined(SYCL_USE_XMX) +#define MMQ_X_Q2_K_AMPERE 4 +#define MMQ_Y_Q2_K_AMPERE 32 +#define NWARPS_Q2_K_AMPERE 4 +#else +#define MMQ_X_Q2_K_AMPERE 64 +#define MMQ_Y_Q2_K_AMPERE 128 +#define NWARPS_Q2_K_AMPERE 4 +#endif +#define MMQ_X_Q2_K_PASCAL 64 +#define MMQ_Y_Q2_K_PASCAL 64 +#define NWARPS_Q2_K_PASCAL 8 + +template <bool need_check> static void +mul_mat_q2_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q2_K, + sycl::half2 *tile_x_dm_q2_K, int *tile_x_sc_q2_K, int *tile_y_qs, + sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +//sycl_todo: change according to hardware + const int mmq_x = MMQ_X_Q2_K_AMPERE; + const int mmq_y = MMQ_Y_Q2_K_AMPERE; + const int nwarps = NWARPS_Q2_K_AMPERE; + allocate_tiles_q2_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_ql_q2_K, tile_x_dm_q2_K, tile_x_sc_q2_K); + mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, + load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, + vec_dot_q2_K_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); +} + +#define MMQ_X_Q3_K_RDNA2 128 +#define MMQ_Y_Q3_K_RDNA2 64 +#define NWARPS_Q3_K_RDNA2 8 +#define MMQ_X_Q3_K_RDNA1 32 +#define MMQ_Y_Q3_K_RDNA1 128 +#define NWARPS_Q3_K_RDNA1 8 +#if defined(SYCL_USE_XMX) +#define MMQ_X_Q3_K_AMPERE 4 +#define MMQ_Y_Q3_K_AMPERE 32 +#define NWARPS_Q3_K_AMPERE 4 +#else +#define MMQ_X_Q3_K_AMPERE 128 +#define MMQ_Y_Q3_K_AMPERE 128 +#define NWARPS_Q3_K_AMPERE 4 +#endif +#define MMQ_X_Q3_K_PASCAL 64 +#define MMQ_Y_Q3_K_PASCAL 64 +#define NWARPS_Q3_K_PASCAL 8 + +template <bool need_check> static void +mul_mat_q3_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q3_K, + sycl::half2 *tile_x_dm_q3_K, int *tile_x_qh_q3_K, int *tile_x_sc_q3_K, + int *tile_y_qs, sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +//sycl_todo: change according to hardware + const int mmq_x = MMQ_X_Q3_K_AMPERE; + const int mmq_y = MMQ_Y_Q3_K_AMPERE; + const int nwarps = NWARPS_Q3_K_AMPERE; + allocate_tiles_q3_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_ql_q3_K, tile_x_dm_q3_K, tile_x_qh_q3_K, + tile_x_sc_q3_K); + mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, + load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, + vec_dot_q3_K_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); +} + +#define MMQ_X_Q4_K_RDNA2 64 +#define MMQ_Y_Q4_K_RDNA2 128 +#define NWARPS_Q4_K_RDNA2 8 +#define MMQ_X_Q4_K_RDNA1 32 +#define MMQ_Y_Q4_K_RDNA1 64 +#define NWARPS_Q4_K_RDNA1 8 +#if defined(SYCL_USE_XMX) +#define MMQ_X_Q4_K_AMPERE 4 +#define MMQ_Y_Q4_K_AMPERE 32 +#define NWARPS_Q4_K_AMPERE 4 +#else +#define MMQ_X_Q4_K_AMPERE 64 +#define MMQ_Y_Q4_K_AMPERE 128 +#define NWARPS_Q4_K_AMPERE 4 +#endif +#define MMQ_X_Q4_K_PASCAL 64 +#define MMQ_Y_Q4_K_PASCAL 64 +#define NWARPS_Q4_K_PASCAL 8 + +template <bool need_check> static void + mul_mat_q4_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q4_K, + sycl::half2 *tile_x_dm_q4_K, int *tile_x_sc_q4_K, int *tile_y_qs, + sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +//sycl_todo: change according to hardware + const int mmq_x = MMQ_X_Q4_K_AMPERE; + const int mmq_y = MMQ_Y_Q4_K_AMPERE; + const int nwarps = NWARPS_Q4_K_AMPERE; + allocate_tiles_q4_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_ql_q4_K, tile_x_dm_q4_K, tile_x_sc_q4_K); + mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, + load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, + vec_dot_q4_K_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); +} + +#define MMQ_X_Q5_K_RDNA2 64 +#define MMQ_Y_Q5_K_RDNA2 128 +#define NWARPS_Q5_K_RDNA2 8 +#define MMQ_X_Q5_K_RDNA1 32 +#define MMQ_Y_Q5_K_RDNA1 64 +#define NWARPS_Q5_K_RDNA1 8 +#if defined(SYCL_USE_XMX) +#define MMQ_X_Q5_K_AMPERE 4 +#define MMQ_Y_Q5_K_AMPERE 32 +#define NWARPS_Q5_K_AMPERE 4 +#else +#define MMQ_X_Q5_K_AMPERE 64 +#define MMQ_Y_Q5_K_AMPERE 128 +#define NWARPS_Q5_K_AMPERE 4 +#endif +#define MMQ_X_Q5_K_PASCAL 64 +#define MMQ_Y_Q5_K_PASCAL 64 +#define NWARPS_Q5_K_PASCAL 8 + +template <bool need_check> static void +mul_mat_q5_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_K, + sycl::half2 *tile_x_dm_q5_K, int *tile_x_sc_q5_K, int *tile_y_qs, + sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +//sycl_todo: change according to hardware + const int mmq_x = MMQ_X_Q5_K_AMPERE; + const int mmq_y = MMQ_Y_Q5_K_AMPERE; + const int nwarps = NWARPS_Q5_K_AMPERE; + allocate_tiles_q5_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_ql_q5_K, tile_x_dm_q5_K, tile_x_sc_q5_K); + mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, + load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, + vec_dot_q5_K_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); +} + +#define MMQ_X_Q6_K_RDNA2 64 +#define MMQ_Y_Q6_K_RDNA2 128 +#define NWARPS_Q6_K_RDNA2 8 +#define MMQ_X_Q6_K_RDNA1 32 +#define MMQ_Y_Q6_K_RDNA1 64 +#define NWARPS_Q6_K_RDNA1 8 +#if defined(SYCL_USE_XMX) +#define MMQ_X_Q6_K_AMPERE 4 +#define MMQ_Y_Q6_K_AMPERE 32 +#define NWARPS_Q6_K_AMPERE 4 +#else +#define MMQ_X_Q6_K_AMPERE 64 +#define MMQ_Y_Q6_K_AMPERE 64 +#define NWARPS_Q6_K_AMPERE 4 +#endif +#define MMQ_X_Q6_K_PASCAL 64 +#define MMQ_Y_Q6_K_PASCAL 64 +#define NWARPS_Q6_K_PASCAL 8 + +template <bool need_check> static void + mul_mat_q6_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm, + int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) { + // int * tile_x_ql = nullptr; + // sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + // int * tile_x_sc = nullptr; + +//sycl_todo: change according to hardware + const int mmq_x = MMQ_X_Q6_K_AMPERE; + const int mmq_y = MMQ_Y_Q6_K_AMPERE; + const int nwarps = NWARPS_Q6_K_AMPERE; + allocate_tiles_q6_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_ql, tile_x_dm, tile_x_sc); + mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, + load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, + vec_dot_q6_K_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); +} + +static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const int compute_capability = ggml_sycl_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= VER_GEN13) { + mmq_x = MMQ_X_Q4_0_RDNA2; + mmq_y = MMQ_Y_Q4_0_RDNA2; + nwarps = NWARPS_Q4_0_RDNA2; + } else if (compute_capability >= VER_GEN12) { + mmq_x = MMQ_X_Q4_0_RDNA1; + mmq_y = MMQ_Y_Q4_0_RDNA1; + nwarps = NWARPS_Q4_0_RDNA1; + } else if (compute_capability >= VER_GEN9) { + mmq_x = MMQ_X_Q4_0_AMPERE; + mmq_y = MMQ_Y_Q4_0_AMPERE; + nwarps = NWARPS_Q4_0_AMPERE; + } else if (compute_capability >= VER_4VEC) { + mmq_x = MMQ_X_Q4_0_PASCAL; + mmq_y = MMQ_Y_Q4_0_PASCAL; + nwarps = NWARPS_Q4_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:20: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI4_0) + mmq_y / QI4_0), + cgh); + sycl::local_accessor<int, 1> tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_0<need_check>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + get_pointer(tile_x_qs_q4_0_acc_ct1), + get_pointer(tile_x_d_q4_0_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:21: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI4_0) + mmq_y / QI4_0), + cgh); + sycl::local_accessor<int, 1> tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_0<need_check>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + get_pointer(tile_x_qs_q4_0_acc_ct1), + get_pointer(tile_x_d_q4_0_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const int compute_capability = ggml_sycl_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= VER_GEN13) { + mmq_x = MMQ_X_Q4_1_RDNA2; + mmq_y = MMQ_Y_Q4_1_RDNA2; + nwarps = NWARPS_Q4_1_RDNA2; + } else if (compute_capability >= VER_GEN12) { + mmq_x = MMQ_X_Q4_1_RDNA1; + mmq_y = MMQ_Y_Q4_1_RDNA1; + nwarps = NWARPS_Q4_1_RDNA1; + } else if (compute_capability >= VER_GEN9) { + mmq_x = MMQ_X_Q4_1_AMPERE; + mmq_y = MMQ_Y_Q4_1_AMPERE; + nwarps = NWARPS_Q4_1_AMPERE; + } else if (compute_capability >= VER_4VEC) { + mmq_x = MMQ_X_Q4_1_PASCAL; + mmq_y = MMQ_Y_Q4_1_PASCAL; + nwarps = NWARPS_Q4_1_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:22: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh); + sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI4_1) + mmq_y / QI4_1), + cgh); + sycl::local_accessor<int, 1> tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_1<need_check>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + get_pointer(tile_x_qs_q4_1_acc_ct1), + get_pointer(tile_x_dm_q4_1_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:23: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh); + sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI4_1) + mmq_y / QI4_1), + cgh); + sycl::local_accessor<int, 1> tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_1<need_check>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + get_pointer(tile_x_qs_q4_1_acc_ct1), + get_pointer(tile_x_dm_q4_1_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const int compute_capability = ggml_sycl_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= VER_GEN13) { + mmq_x = MMQ_X_Q5_0_RDNA2; + mmq_y = MMQ_Y_Q5_0_RDNA2; + nwarps = NWARPS_Q5_0_RDNA2; + } else if (compute_capability >= VER_GEN12) { + mmq_x = MMQ_X_Q5_0_RDNA1; + mmq_y = MMQ_Y_Q5_0_RDNA1; + nwarps = NWARPS_Q5_0_RDNA1; + } else if (compute_capability >= VER_GEN9) { + mmq_x = MMQ_X_Q5_0_AMPERE; + mmq_y = MMQ_Y_Q5_0_AMPERE; + nwarps = NWARPS_Q5_0_AMPERE; + } else if (compute_capability >= VER_4VEC) { + mmq_x = MMQ_X_Q5_0_PASCAL; + mmq_y = MMQ_Y_Q5_0_PASCAL; + nwarps = NWARPS_Q5_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:24: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI5_0) + mmq_y / QI5_0), + cgh); + sycl::local_accessor<int, 1> tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_0<need_check>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + get_pointer(tile_x_ql_q5_0_acc_ct1), + get_pointer(tile_x_d_q5_0_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:25: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI5_0) + mmq_y / QI5_0), + cgh); + sycl::local_accessor<int, 1> tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_0<need_check>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + get_pointer(tile_x_ql_q5_0_acc_ct1), + get_pointer(tile_x_d_q5_0_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const int compute_capability = ggml_sycl_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= VER_GEN13) { + mmq_x = MMQ_X_Q5_1_RDNA2; + mmq_y = MMQ_Y_Q5_1_RDNA2; + nwarps = NWARPS_Q5_1_RDNA2; + } else if (compute_capability >= VER_GEN12) { + mmq_x = MMQ_X_Q5_1_RDNA1; + mmq_y = MMQ_Y_Q5_1_RDNA1; + nwarps = NWARPS_Q5_1_RDNA1; + } else if (compute_capability >= VER_GEN9) { + mmq_x = MMQ_X_Q5_1_AMPERE; + mmq_y = MMQ_Y_Q5_1_AMPERE; + nwarps = NWARPS_Q5_1_AMPERE; + } else if (compute_capability >= VER_4VEC) { + mmq_x = MMQ_X_Q5_1_PASCAL; + mmq_y = MMQ_Y_Q5_1_PASCAL; + nwarps = NWARPS_Q5_1_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:26: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI5_1) + mmq_y / QI5_1), + cgh); + sycl::local_accessor<int, 1> tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_1<need_check>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + get_pointer(tile_x_ql_q5_1_acc_ct1), + get_pointer(tile_x_dm_q5_1_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:27: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI5_1) + mmq_y / QI5_1), + cgh); + sycl::local_accessor<int, 1> tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_1<need_check>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + get_pointer(tile_x_ql_q5_1_acc_ct1), + get_pointer(tile_x_dm_q5_1_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const int compute_capability = ggml_sycl_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= VER_GEN13) { + mmq_x = MMQ_X_Q8_0_RDNA2; + mmq_y = MMQ_Y_Q8_0_RDNA2; + nwarps = NWARPS_Q8_0_RDNA2; + } else if (compute_capability >= VER_GEN12) { + mmq_x = MMQ_X_Q8_0_RDNA1; + mmq_y = MMQ_Y_Q8_0_RDNA1; + nwarps = NWARPS_Q8_0_RDNA1; + } else if (compute_capability >= VER_GEN9) { + mmq_x = MMQ_X_Q8_0_AMPERE; + mmq_y = MMQ_Y_Q8_0_AMPERE; + nwarps = NWARPS_Q8_0_AMPERE; + } else if (compute_capability >= VER_4VEC) { + mmq_x = MMQ_X_Q8_0_PASCAL; + mmq_y = MMQ_Y_Q8_0_PASCAL; + nwarps = NWARPS_Q8_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:28: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI8_0) + mmq_y / QI8_0), + cgh); + sycl::local_accessor<int, 1> tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q8_0<need_check>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + get_pointer(tile_x_qs_q8_0_acc_ct1), + get_pointer(tile_x_d_q8_0_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:29: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI8_0) + mmq_y / QI8_0), + cgh); + sycl::local_accessor<int, 1> tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q8_0<need_check>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + get_pointer(tile_x_qs_q8_0_acc_ct1), + get_pointer(tile_x_d_q8_0_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const int compute_capability = ggml_sycl_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= VER_GEN13) { + mmq_x = MMQ_X_Q2_K_RDNA2; + mmq_y = MMQ_Y_Q2_K_RDNA2; + nwarps = NWARPS_Q2_K_RDNA2; + } else if (compute_capability >= VER_GEN12) { + mmq_x = MMQ_X_Q2_K_RDNA1; + mmq_y = MMQ_Y_Q2_K_RDNA1; + nwarps = NWARPS_Q2_K_RDNA1; + } else if (compute_capability >= VER_GEN9) { + mmq_x = MMQ_X_Q2_K_AMPERE; + mmq_y = MMQ_Y_Q2_K_AMPERE; + nwarps = NWARPS_Q2_K_AMPERE; + } else if (compute_capability >= VER_4VEC) { + mmq_x = MMQ_X_Q2_K_PASCAL; + mmq_y = MMQ_Y_Q2_K_PASCAL; + nwarps = NWARPS_Q2_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:30: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI2_K) + mmq_y / QI2_K), + cgh); + sycl::local_accessor<int, 1> tile_x_sc_q2_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh); + sycl::local_accessor<int, 1> tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q2_K<need_check>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + get_pointer(tile_x_ql_q2_K_acc_ct1), + get_pointer(tile_x_dm_q2_K_acc_ct1), + get_pointer(tile_x_sc_q2_K_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:31: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI2_K) + mmq_y / QI2_K), + cgh); + sycl::local_accessor<int, 1> tile_x_sc_q2_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh); + sycl::local_accessor<int, 1> tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q2_K<need_check>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + get_pointer(tile_x_ql_q2_K_acc_ct1), + get_pointer(tile_x_dm_q2_K_acc_ct1), + get_pointer(tile_x_sc_q2_K_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + +#if QK_K == 256 + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const int compute_capability = ggml_sycl_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= VER_GEN13) { + mmq_x = MMQ_X_Q3_K_RDNA2; + mmq_y = MMQ_Y_Q3_K_RDNA2; + nwarps = NWARPS_Q3_K_RDNA2; + } else if (compute_capability >= VER_GEN12) { + mmq_x = MMQ_X_Q3_K_RDNA1; + mmq_y = MMQ_Y_Q3_K_RDNA1; + nwarps = NWARPS_Q3_K_RDNA1; + } else if (compute_capability >= VER_GEN9) { + mmq_x = MMQ_X_Q3_K_AMPERE; + mmq_y = MMQ_Y_Q3_K_AMPERE; + nwarps = NWARPS_Q3_K_AMPERE; + } else if (compute_capability >= VER_4VEC) { + mmq_x = MMQ_X_Q3_K_PASCAL; + mmq_y = MMQ_Y_Q3_K_PASCAL; + nwarps = NWARPS_Q3_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:32: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI3_K) + mmq_y / QI3_K), + cgh); + sycl::local_accessor<int, 1> tile_x_qh_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 2) + mmq_y / 2), cgh); + sycl::local_accessor<int, 1> tile_x_sc_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh); + sycl::local_accessor<int, 1> tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q3_K<need_check>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + get_pointer(tile_x_ql_q3_K_acc_ct1), + get_pointer(tile_x_dm_q3_K_acc_ct1), + get_pointer(tile_x_qh_q3_K_acc_ct1), + get_pointer(tile_x_sc_q3_K_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:33: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI3_K) + mmq_y / QI3_K), + cgh); + sycl::local_accessor<int, 1> tile_x_qh_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 2) + mmq_y / 2), cgh); + sycl::local_accessor<int, 1> tile_x_sc_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh); + sycl::local_accessor<int, 1> tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q3_K<need_check>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + get_pointer(tile_x_ql_q3_K_acc_ct1), + get_pointer(tile_x_dm_q3_K_acc_ct1), + get_pointer(tile_x_qh_q3_K_acc_ct1), + get_pointer(tile_x_sc_q3_K_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); + }); + }); + } + } +#endif +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const int compute_capability = ggml_sycl_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= VER_GEN13) { + mmq_x = MMQ_X_Q4_K_RDNA2; + mmq_y = MMQ_Y_Q4_K_RDNA2; + nwarps = NWARPS_Q4_K_RDNA2; + } else if (compute_capability >= VER_GEN12) { + mmq_x = MMQ_X_Q4_K_RDNA1; + mmq_y = MMQ_Y_Q4_K_RDNA1; + nwarps = NWARPS_Q4_K_RDNA1; + } else if (compute_capability >= VER_GEN9) { + mmq_x = MMQ_X_Q4_K_AMPERE; + mmq_y = MMQ_Y_Q4_K_AMPERE; + nwarps = NWARPS_Q4_K_AMPERE; + } else if (compute_capability >= VER_4VEC) { + mmq_x = MMQ_X_Q4_K_PASCAL; + mmq_y = MMQ_Y_Q4_K_PASCAL; + nwarps = NWARPS_Q4_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:34: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI4_K) + mmq_y / QI4_K), + cgh); + sycl::local_accessor<int, 1> tile_x_sc_q4_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); + sycl::local_accessor<int, 1> tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_K<need_check>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + get_pointer(tile_x_ql_q4_K_acc_ct1), + get_pointer(tile_x_dm_q4_K_acc_ct1), + get_pointer(tile_x_sc_q4_K_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:35: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI4_K) + mmq_y / QI4_K), + cgh); + sycl::local_accessor<int, 1> tile_x_sc_q4_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); + sycl::local_accessor<int, 1> tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_K<need_check>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + get_pointer(tile_x_ql_q4_K_acc_ct1), + get_pointer(tile_x_dm_q4_K_acc_ct1), + get_pointer(tile_x_sc_q4_K_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const int compute_capability = ggml_sycl_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= VER_GEN13) { + mmq_x = MMQ_X_Q5_K_RDNA2; + mmq_y = MMQ_Y_Q5_K_RDNA2; + nwarps = NWARPS_Q5_K_RDNA2; + } else if (compute_capability >= VER_GEN12) { + mmq_x = MMQ_X_Q5_K_RDNA1; + mmq_y = MMQ_Y_Q5_K_RDNA1; + nwarps = NWARPS_Q5_K_RDNA1; + } else if (compute_capability >= VER_GEN9) { + mmq_x = MMQ_X_Q5_K_AMPERE; + mmq_y = MMQ_Y_Q5_K_AMPERE; + nwarps = NWARPS_Q5_K_AMPERE; + } else if (compute_capability >= VER_4VEC) { + mmq_x = MMQ_X_Q5_K_PASCAL; + mmq_y = MMQ_Y_Q5_K_PASCAL; + nwarps = NWARPS_Q5_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:36: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI5_K) + mmq_y / QI5_K), + cgh); + sycl::local_accessor<int, 1> tile_x_sc_q5_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); + sycl::local_accessor<int, 1> tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_K<need_check>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + get_pointer(tile_x_ql_q5_K_acc_ct1), + get_pointer(tile_x_dm_q5_K_acc_ct1), + get_pointer(tile_x_sc_q5_K_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:37: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI5_K) + mmq_y / QI5_K), + cgh); + sycl::local_accessor<int, 1> tile_x_sc_q5_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); + sycl::local_accessor<int, 1> tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_K<need_check>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + get_pointer(tile_x_ql_q5_K_acc_ct1), + get_pointer(tile_x_dm_q5_K_acc_ct1), + get_pointer(tile_x_sc_q5_K_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const int compute_capability = ggml_sycl_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= VER_GEN13) { + mmq_x = MMQ_X_Q6_K_RDNA2; + mmq_y = MMQ_Y_Q6_K_RDNA2; + nwarps = NWARPS_Q6_K_RDNA2; + } else if (compute_capability >= VER_GEN12) { + mmq_x = MMQ_X_Q6_K_RDNA1; + mmq_y = MMQ_Y_Q6_K_RDNA1; + nwarps = NWARPS_Q6_K_RDNA1; + } else if (compute_capability >= VER_GEN9) { + mmq_x = MMQ_X_Q6_K_AMPERE; + mmq_y = MMQ_Y_Q6_K_AMPERE; + nwarps = NWARPS_Q6_K_AMPERE; + } else if (compute_capability >= VER_4VEC) { + mmq_x = MMQ_X_Q6_K_PASCAL; + mmq_y = MMQ_Y_Q6_K_PASCAL; + nwarps = NWARPS_Q6_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:38: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<int, 1> tile_x_ql_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI6_K) + mmq_y / QI6_K), + cgh); + sycl::local_accessor<int, 1> tile_x_sc_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); + sycl::local_accessor<int, 1> tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q6_K<need_check>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + get_pointer(tile_x_ql_acc_ct1), + get_pointer(tile_x_dm_acc_ct1), + get_pointer(tile_x_sc_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:39: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<int, 1> tile_x_ql_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI6_K) + mmq_y / QI6_K), + cgh); + sycl::local_accessor<int, 1> tile_x_sc_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); + sycl::local_accessor<int, 1> tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q6_K<need_check>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + get_pointer(tile_x_ql_acc_ct1), + get_pointer(tile_x_dm_acc_ct1), + get_pointer(tile_x_sc_acc_ct1), + get_pointer(tile_y_qs_acc_ct1), + get_pointer(tile_y_ds_acc_ct1)); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_sycl_op_mul_mat_q( + ggml_backend_sycl_context & ctx, + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream) try { + + const int64_t ne00 = src0->ne[0]; + + const int64_t ne10 = src1->ne[0]; + GGML_ASSERT(ne10 % QK8_1 == 0); + + const int64_t ne0 = dst->ne[0]; + + const int64_t row_diff = row_high - row_low; + + int device_id; + SYCL_CHECK( + CHECK_TRY_ERROR(device_id = get_current_device_id())); + + // the main device has a larger memory buffer to hold the results from all GPUs + // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into + const int64_t nrows_dst = device_id == ctx.device ? ne0 : row_diff; + + switch (src0->type) { + case GGML_TYPE_Q4_0: + ggml_mul_mat_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q4_1: + ggml_mul_mat_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q5_0: + ggml_mul_mat_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q5_1: + ggml_mul_mat_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q8_0: + ggml_mul_mat_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q2_K: + ggml_mul_mat_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q3_K: + ggml_mul_mat_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q4_K: + ggml_mul_mat_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q5_K: + ggml_mul_mat_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q6_K: + ggml_mul_mat_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + default: + GGML_ASSERT(false); + break; + } + + (void) src1; + (void) dst; + (void) src1_ddf_i; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} diff --git a/ggml/src/ggml-sycl/mmq.hpp b/ggml/src/ggml-sycl/mmq.hpp new file mode 100644 index 00000000..3f5297aa --- /dev/null +++ b/ggml/src/ggml-sycl/mmq.hpp @@ -0,0 +1,33 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_MMQ_HPP +#define GGML_SYCL_MMQ_HPP + +#include "common.hpp" + +void ggml_sycl_op_mul_mat_q( + ggml_backend_sycl_context & ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, + ggml_tensor* dst, + const char* src0_dd_i, + const float* src1_ddf_i, + const char* src1_ddq_i, + float* dst_dd_i, + const int64_t row_low, + const int64_t row_high, + const int64_t src1_ncols, + const int64_t src1_padded_row_size, + const dpct::queue_ptr& stream); + +#endif // GGML_SYCL_MMQ_HPP diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp new file mode 100644 index 00000000..3fbc4dd6 --- /dev/null +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -0,0 +1,1027 @@ +#include "mmvq.hpp" +#include "vecdotq.hpp" + + +template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl> +static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template <int qk, int qi, typename block_q_t, int vdr> +static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template <int qk, int qi, typename block_q_t, int vdr> +static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid, ksigns64); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template <int qk, int qi, typename block_q_t, int vdr> +static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template <int qk, int qi, typename block_q_t, int vdr> +static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid, ksigns64); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template <int qk, int qi, typename block_q_t, int vdr> +static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template <int qk, int qi, typename block_q_t, int vdr> +static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_gpu); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template <int qk, int qi, typename block_q_t, int vdr> +static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq1_m_q8_1(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template <int qk, int qi, typename block_q_t, int vdr> +static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq4_nl_q8_1(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + + +template <int qk, int qi, typename block_q_t, int vdr> +static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq4_xs_q8_1(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_0 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, + VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_1 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, + VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK5_0 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, + VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK5_1 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, + VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK8_0 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, + VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q<QK_K, QI2_K, block_q2_K, + VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q<QK_K, QI3_K, block_q3_K, + VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q<QK_K, QI4_K, block_q4_K, + VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q<QK_K, QI5_K, block_q5_K, + VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q<QK_K, QI6_K, block_q6_K, + VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + + +static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0]; + auto ksigns64_ptr_ct1 = &ksigns64[0]; + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0]; + auto ksigns64_ptr_ct1 = &ksigns64[0]; + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0]; + auto ksigns64_ptr_ct1 = &ksigns64[0]; + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + auto iq3s_grid_ptr_ct1 = &iq3s_grid[0]; + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0]; + auto ksigns64_ptr_ct1 = &ksigns64[0]; + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_NL == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 1>( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +void ggml_sycl_op_mul_mat_vec_q( + ggml_backend_sycl_context & ctx, + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_col_size, + const dpct::queue_ptr &stream) { + + const int64_t ne10 = src1->ne[0]; + GGML_ASSERT(ne10 % QK8_1 == 0); + + const int64_t ne00 = src0->ne[0]; + const int64_t row_diff = row_high - row_low; + + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + const size_t q8_1_ts = sizeof(block_q8_1); + const size_t q8_1_bs = QK8_1; + // the main device has a larger memory buffer to hold the results from all GPUs + // nrows_dst == nrows of the matrix that the kernel writes into + const int64_t nrows_dst = id == ctx.device ? ne00 : row_diff; + for (int i = 0; i < src1_ncols; i++) + { + const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs; + const char* src1_ddq_i_bs = src1_ddq_i + src1_ddq_i_offset; + float* dst_dd_i_bs = dst_dd_i + i * dst->ne[0]; + switch (src0->type) { + case GGML_TYPE_Q4_0: + mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_1: + mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_0: + mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_1: + mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_Q8_0: + mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_Q2_K: + mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_Q3_K: + mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_K: + mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_K: + mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_Q6_K: + mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ1_S: + mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ1_M: + mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ2_XXS: + mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ2_XS: + mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ2_S: + mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ3_XXS: + mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ3_S: + mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ4_NL: + mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ4_XS: + mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + default: + GGML_ASSERT(false); + break; + } + } + (void) src1; + (void) dst; + (void) src1_ddf_i; +} diff --git a/ggml/src/ggml-sycl/mmvq.hpp b/ggml/src/ggml-sycl/mmvq.hpp new file mode 100644 index 00000000..049b43d4 --- /dev/null +++ b/ggml/src/ggml-sycl/mmvq.hpp @@ -0,0 +1,27 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_MMVQ_HPP +#define GGML_SYCL_MMVQ_HPP + +#include "common.hpp" + + +void ggml_sycl_op_mul_mat_vec_q( + ggml_backend_sycl_context & ctx, + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream); + +#endif // GGML_SYCL_MMVQ_HPP diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp new file mode 100644 index 00000000..cccf87d0 --- /dev/null +++ b/ggml/src/ggml-sycl/norm.cpp @@ -0,0 +1,374 @@ +#include "norm.hpp" + +static void norm_f32(const float* x, float* dst, const int ncols, const float eps, + const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + const int tid = item_ct1.get_local_id(2); + + const int nthreads = item_ct1.get_local_range(2); + const int nwarps = nthreads / WARP_SIZE; + assert(nwarps % WARP_SIZE == 0); + sycl::float2 mean_var = sycl::float2(0.f, 0.f); + + for (int col = tid; col < ncols; col += block_size) { + const float xi = x[row * ncols + col]; + mean_var.x() += xi; + mean_var.y() += xi * xi; + } + + // sum up partial sums + mean_var = warp_reduce_sum(mean_var, item_ct1); + if (block_size > WARP_SIZE) { + + int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; + int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = mean_var; + } + /* + DPCT1118:0: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + item_ct1.barrier(sycl::access::fence_space::local_space); + mean_var = 0.f; + int nreduce = nwarps / WARP_SIZE; + for (size_t i = 0; i < nreduce; i += 1) + { + mean_var += s_sum[lane_id + i * WARP_SIZE]; + } + mean_var = warp_reduce_sum(mean_var, item_ct1); + } + + const float mean = mean_var.x() / ncols; + const float var = mean_var.y() / ncols - mean * mean; + const float inv_std = sycl::rsqrt(var + eps); + + for (int col = tid; col < ncols; col += block_size) { + dst[row * ncols + col] = (x[row * ncols + col] - mean) * inv_std; + } +} + +static void group_norm_f32(const float* x, float* dst, const int group_size, const int ne_elements, const float eps, + const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) { + int start = item_ct1.get_group(2) * group_size; + int end = start + group_size; + const int nthreads = item_ct1.get_local_range(2); + const int nwarps = nthreads / WARP_SIZE; + assert(nwarps % WARP_SIZE == 0); + start += item_ct1.get_local_id(2); + int nreduce = nwarps / WARP_SIZE; + + if (end >= ne_elements) { + end = ne_elements; + } + + float tmp = 0.0f; // partial sum for thread in warp + + for (int j = start; j < end; j += block_size) { + tmp += x[j]; + } + + tmp = warp_reduce_sum(tmp, item_ct1); + if (block_size > WARP_SIZE) { + + int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; + int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + /* + DPCT1118:1: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:54: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + tmp = 0.f; + for (size_t i = 0; i < nreduce; i += 1) + { + tmp += s_sum[lane_id + i * WARP_SIZE]; + } + tmp = warp_reduce_sum(tmp, item_ct1); + } + + float mean = tmp / group_size; + tmp = 0.0f; + + for (int j = start; j < end; j += block_size) { + float xi = x[j] - mean; + dst[j] = xi; + tmp += xi * xi; + } + + tmp = warp_reduce_sum(tmp, item_ct1); + if (block_size > WARP_SIZE) { + + int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; + int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + /* + DPCT1118:2: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:55: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + tmp = 0.f; + for (size_t i = 0; i < nreduce; i += 1) + { + tmp += s_sum[lane_id + i * WARP_SIZE]; + } + tmp = warp_reduce_sum(tmp, item_ct1); + } + + float variance = tmp / group_size; + float scale = sycl::rsqrt(variance + eps); + for (int j = start; j < end; j += block_size) { + dst[j] *= scale; + } +} + +static void rms_norm_f32(const float* x, float* dst, const int ncols, const float eps, + const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + const int tid = item_ct1.get_local_id(2); + const int nthreads = item_ct1.get_local_range(2); + const int nwarps = nthreads / WARP_SIZE; + assert(nwarps % WARP_SIZE == 0); + float tmp = 0.0f; // partial sum for thread in warp + + for (int col = tid; col < ncols; col += block_size) { + const float xi = x[row * ncols + col]; + tmp += xi * xi; + } + + // sum up partial sums + tmp = warp_reduce_sum(tmp, item_ct1); + if (block_size > WARP_SIZE) { + + int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; + int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + /* + DPCT1118:3: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + item_ct1.barrier(sycl::access::fence_space::local_space); + int nreduce = nwarps / WARP_SIZE; + tmp = 0.f; + for (size_t i = 0; i < nreduce; i += 1) + { + tmp += s_sum[lane_id + i * WARP_SIZE]; + } + tmp = warp_reduce_sum(tmp, item_ct1); + } + + const float mean = tmp / ncols; + const float scale = sycl::rsqrt(mean + eps); + + for (int col = tid; col < ncols; col += block_size) { + dst[row * ncols + col] = scale * x[row * ncols + col]; + } +} + +static void norm_f32_sycl(const float* x, float* dst, const int ncols, + const int nrows, const float eps, + queue_ptr stream, int device) { + GGML_ASSERT(ncols % WARP_SIZE == 0); + if (ncols < 1024) { + const sycl::range<3> block_dims(1, 1, WARP_SIZE); + stream->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, + block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + norm_f32(x, dst, ncols, eps, item_ct1, + nullptr, WARP_SIZE); + }); + }); + } + else { + const int work_group_size = ggml_sycl_info().max_work_group_sizes[device]; + const sycl::range<3> block_dims(1, 1, work_group_size); + /* + DPCT1049:17: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler& cgh) { + sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1( + sycl::range<1>(work_group_size / WARP_SIZE), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, + block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + norm_f32(x, dst, ncols, eps, item_ct1, + get_pointer(s_sum_acc_ct1), work_group_size); + }); + }); + } +} + +static void group_norm_f32_sycl(const float* x, float* dst, + const int num_groups, const int group_size, + const int ne_elements, queue_ptr stream, int device) { + static const float eps = 1e-6f; + if (group_size < 1024) { + const sycl::range<3> block_dims(1, 1, WARP_SIZE); + stream->submit([&](sycl::handler& cgh) { + const float eps_ct4 = eps; + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, + block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + group_norm_f32( + x, dst, group_size, ne_elements, eps_ct4, item_ct1, + nullptr, WARP_SIZE); + }); + }); + } + else { + const int work_group_size = ggml_sycl_info().max_work_group_sizes[device]; + const sycl::range<3> block_dims(1, 1, work_group_size); + /* + DPCT1049:18: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + + stream->submit([&](sycl::handler& cgh) { + sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE), + cgh); + + const float eps_ct4 = eps; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, + block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + group_norm_f32(x, dst, group_size, ne_elements, + eps_ct4, item_ct1, + get_pointer(s_sum_acc_ct1), work_group_size); + }); + }); + } +} + +static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, + const int nrows, const float eps, + queue_ptr stream, int device) { + GGML_ASSERT(ncols % WARP_SIZE == 0); + // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE); + if (ncols < 1024) { + const sycl::range<3> block_dims(1, 1, WARP_SIZE); + stream->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, + block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + rms_norm_f32(x, dst, ncols, eps, item_ct1, + nullptr, WARP_SIZE); + }); + }); + } + else { + const int work_group_size = ggml_sycl_info().max_work_group_sizes[device]; + const sycl::range<3> block_dims(1, 1, work_group_size); + /* + DPCT1049:19: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler& cgh) { + sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE), + cgh); + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, + block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + rms_norm_f32(x, dst, ncols, eps, item_ct1, + get_pointer(s_sum_acc_ct1), work_group_size); + }); + }); + } +} + +void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst, const float* src0_dd, + const float* src1_dd, float* dst_dd, + const queue_ptr& main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device); + + (void)src1; + (void)dst; + (void)src1_dd; +} + +void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst, + const float* src0_dd, const float* src1_dd, + float* dst_dd, + const queue_ptr& main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + int num_groups = dst->op_params[0]; + int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); + group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream, ctx.device); + + (void)src1; + (void)dst; + (void)src1_dd; +} + +void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst, + const float* src0_dd, const float* src1_dd, + float* dst_dd, + const queue_ptr& main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device); + + (void)src1; + (void)dst; + (void)src1_dd; +} diff --git a/ggml/src/ggml-sycl/norm.hpp b/ggml/src/ggml-sycl/norm.hpp new file mode 100644 index 00000000..a9ad9156 --- /dev/null +++ b/ggml/src/ggml-sycl/norm.hpp @@ -0,0 +1,35 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_NORM_HPP +#define GGML_SYCL_NORM_HPP + +#include "common.hpp" + +void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst, const float* src0_dd, + const float* src1_dd, float* dst_dd, + const queue_ptr& main_stream); + +void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst, + const float* src0_dd, const float* src1_dd, + float* dst_dd, + const queue_ptr& main_stream); + +void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst, + const float* src0_dd, const float* src1_dd, + float* dst_dd, + const queue_ptr& main_stream); + +#endif // GGML_SYCL_NORM_HPP diff --git a/ggml/src/ggml-sycl/presets.hpp b/ggml/src/ggml-sycl/presets.hpp new file mode 100644 index 00000000..15ddcac1 --- /dev/null +++ b/ggml/src/ggml-sycl/presets.hpp @@ -0,0 +1,66 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_PRESETS_HPP +#define GGML_SYCL_PRESETS_HPP + +#define GGML_SYCL_MAX_STREAMS 8 +#define GGML_SYCL_MAX_BUFFERS 256 + +#define WARP_SIZE GGML_SYCL_WARP_SIZE +#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses + +#define SYCL_GELU_BLOCK_SIZE 256 +#define SYCL_SILU_BLOCK_SIZE 256 +#define SYCL_TANH_BLOCK_SIZE 256 +#define SYCL_RELU_BLOCK_SIZE 256 +#define SYCL_HARDSIGMOID_BLOCK_SIZE 256 +#define SYCL_HARDSWISH_BLOCK_SIZE 256 +#define SYCL_SQR_BLOCK_SIZE 256 +#define SYCL_CPY_BLOCK_SIZE 32 +#define SYCL_SCALE_BLOCK_SIZE 256 +#define SYCL_CLAMP_BLOCK_SIZE 256 +#define SYCL_ROPE_BLOCK_SIZE 256 +#define SYCL_ALIBI_BLOCK_SIZE 32 +#define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32 +#define SYCL_QUANTIZE_BLOCK_SIZE 256 +#define SYCL_DEQUANTIZE_BLOCK_SIZE 256 +#define SYCL_GET_ROWS_BLOCK_SIZE 256 +#define SYCL_UPSCALE_BLOCK_SIZE 256 +#define SYCL_CONCAT_BLOCK_SIZE 256 +#define SYCL_PAD_BLOCK_SIZE 256 +#define SYCL_ACC_BLOCK_SIZE 256 +#define SYCL_IM2COL_BLOCK_SIZE 256 +#define SYCL_POOL2D_BLOCK_SIZE 256 + +// dmmv = dequantize_mul_mat_vec +#ifndef GGML_SYCL_DMMV_X +#define GGML_SYCL_DMMV_X 32 +#endif +#ifndef GGML_SYCL_MMV_Y +#define GGML_SYCL_MMV_Y 1 +#endif + +#ifndef K_QUANTS_PER_ITERATION +#define K_QUANTS_PER_ITERATION 2 +#else +static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); +#endif + +#ifndef GGML_SYCL_PEER_MAX_BATCH_SIZE +#define GGML_SYCL_PEER_MAX_BATCH_SIZE 128 +#endif // GGML_SYCL_PEER_MAX_BATCH_SIZE + +#define MUL_MAT_SRC1_COL_STRIDE 128 + +#define QK_WARP_SIZE 32 +#endif // GGML_SYCL_PRESETS_HPP diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp new file mode 100644 index 00000000..6f507941 --- /dev/null +++ b/ggml/src/ggml-sycl/rope.cpp @@ -0,0 +1,275 @@ +#include "rope.hpp" + +struct rope_corr_dims { + float v[2]; +}; + +static float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / sycl::max(0.001f, high - low); + return 1.0f - sycl::min(1.0f, sycl::max(0.0f, y)); +} + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static void rope_yarn( + float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * sycl::log(1.0f / freq_scale); + } + *cos_theta = sycl::cos(theta) * mscale; + *sin_theta = sycl::sin(theta) * mscale; +} + +template<typename T, bool has_ff> +static void rope_norm( + const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, + float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, + const sycl::nd_item<3> &item_ct1) { + const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1)); + + if (i0 >= ne0) { + return; + } + + const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i0 >= n_dims) { + const int i = row*ne0 + i0; + + dst[i + 0] = x[i + 0]; + dst[i + 1] = x[i + 1]; + + return; + } + + const int i = row*ne0 + i0; + const int i2 = row/p_delta_rows; + + const float theta_base = pos[i2] * sycl::pow(theta_scale, i0 / 2.0f); + + const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; + + float cos_theta; + float sin_theta; + + rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + 1]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + 1] = x0*sin_theta + x1*cos_theta; +} + +template<typename T, bool has_ff> +static void rope_neox( + const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, + float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, + const sycl::nd_item<3> &item_ct1) { + const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1)); + + if (i0 >= ne0) { + return; + } + + const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i0 >= n_dims) { + const int i = row*ne0 + i0; + + dst[i + 0] = x[i + 0]; + dst[i + 1] = x[i + 1]; + + return; + } + + const int i = row*ne0 + i0/2; + const int i2 = row/p_delta_rows; + + const float theta_base = pos[i2] * sycl::pow(theta_scale, i0 / 2.0f); + + const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; + + float cos_theta; + float sin_theta; + + rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + n_dims/2]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta; +} + +template <typename T> +static void rope_norm_sycl( + const T *x, T *dst, int ne0, int n_dims, int nr, const int32_t *pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) { + GGML_ASSERT(ne0 % 2 == 0); + const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1); + const int num_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE); + const sycl::range<3> block_nums(1, num_blocks_x, nr); + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + if (freq_factors == nullptr) { + /* + DPCT1049:40: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_norm<T, false>(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, + ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, + item_ct1); + }); + } else { + /* + DPCT1049:41: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_norm<T, true>(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, + ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, + item_ct1); + }); + } +} + +template <typename T> +static void rope_neox_sycl( + const T *x, T *dst, int ne0, int n_dims, int nr, const int32_t *pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) { + GGML_ASSERT(ne0 % 2 == 0); + const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1); + const int num_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE); + const sycl::range<3> block_nums(1, num_blocks_x, nr); + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + if (freq_factors == nullptr) { + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_neox<T, false>(x, dst, ne0, n_dims, pos, freq_scale, + p_delta_rows, ext_factor, attn_factor, + corr_dims, theta_scale, freq_factors, + item_ct1); + }); + } else { + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_neox<T, true>(x, dst, ne0, n_dims, pos, freq_scale, + p_delta_rows, ext_factor, attn_factor, + corr_dims, theta_scale, freq_factors, + item_ct1); + }); + } +} + +void ggml_sycl_op_rope( + ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream) { + const ggml_tensor * src2 = dst->src[2]; + + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); + GGML_ASSERT(src0->type == dst->type); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t nr = ggml_nrows(src0); + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + //const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; + + // RoPE alteration for extended context + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; + + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + + const bool is_neox = mode & 2; + + const int32_t * pos = (const int32_t *) src1_dd; + + const float * freq_factors = nullptr; + if (src2 != nullptr) { + freq_factors = (const float *) src2->data; + } + + rope_corr_dims corr_dims; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v); + + // compute + if (is_neox) { + if (src0->type == GGML_TYPE_F32) { + rope_neox_sycl( + (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, freq_factors, main_stream + ); + } else if (src0->type == GGML_TYPE_F16) { + rope_neox_sycl( + (const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, freq_factors, main_stream + ); + } else { + GGML_ASSERT(false); + } + } else { + if (src0->type == GGML_TYPE_F32) { + rope_norm_sycl( + (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, freq_factors, main_stream + ); + } else if (src0->type == GGML_TYPE_F16) { + rope_norm_sycl( + (const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, freq_factors, main_stream + ); + } else { + GGML_ASSERT(false); + } + } + + (void) src1; + (void) dst; + (void) src1_dd; +} diff --git a/ggml/src/ggml-sycl/rope.hpp b/ggml/src/ggml-sycl/rope.hpp new file mode 100644 index 00000000..00354c31 --- /dev/null +++ b/ggml/src/ggml-sycl/rope.hpp @@ -0,0 +1,22 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_ROPE_HPP +#define GGML_SYCL_ROPE_HPP + +#include "common.hpp" + +void ggml_sycl_op_rope( + ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream); + +#endif // GGML_SYCL_ROPE_HPP diff --git a/ggml/src/ggml-sycl/softmax.cpp b/ggml/src/ggml-sycl/softmax.cpp new file mode 100644 index 00000000..17a542e4 --- /dev/null +++ b/ggml/src/ggml-sycl/softmax.cpp @@ -0,0 +1,251 @@ +#include "norm.hpp" + +template <bool vals_smem, int ncols_template, int block_size_template> +static void soft_max_f32(const float * x, const float * mask, float * dst, const int ncols_par, + const int nrows_y, const float scale, const float max_bias, const float m0, + const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) { + const int ncols = ncols_template == 0 ? ncols_par : ncols_template; + + const int tid = item_ct1.get_local_id(2); + const int rowx = item_ct1.get_group(2); + const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension + + const int block_size = block_size_template == 0 ? item_ct1.get_local_range(2) : block_size_template; + + const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; + const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; + const int nthreads = block_size; + const int nwarps = nthreads / WARP_SIZE; + int nreduce = nwarps / WARP_SIZE; + float slope = 1.0f; + + // ALiBi + if (max_bias > 0.0f) { + const uint32_t h = rowx/nrows_y; // head index + + const float base = h < n_head_log2 ? m0 : m1; + const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; + + slope = sycl::pow(base, float(exp)); + } + + float *vals = vals_smem ? buf + std::max(nwarps, WARP_SIZE) : dst + rowx * ncols; + float max_val = -INFINITY; + + for (int col0 = 0; col0 < ncols; col0 += block_size) { + const int col = col0 + tid; + + if (ncols_template == 0 && col >= ncols) { + break; + } + + const int ix = rowx*ncols + col; + const int iy = rowy*ncols + col; + + const float val = x[ix]*scale + (mask ? slope*mask[iy] : 0.0f); + + vals[col] = val; + max_val = sycl::max(max_val, val); + } + + // find the max value in the block + max_val = warp_reduce_max(max_val, item_ct1); + if (block_size > WARP_SIZE) { + if (warp_id == 0) { + buf[lane_id] = -INFINITY; + for (size_t i = 1; i < nreduce; i += 1) + buf[lane_id + i * WARP_SIZE] = -INFINITY; + } + item_ct1.barrier(sycl::access::fence_space::local_space); + + if (lane_id == 0) { + buf[warp_id] = max_val; + } + item_ct1.barrier(sycl::access::fence_space::local_space); + max_val = buf[lane_id]; + for (size_t i = 1; i < nreduce; i += 1) + { + max_val = std::max(max_val, buf[lane_id + i * WARP_SIZE]); + } + max_val = warp_reduce_max(max_val, item_ct1); + } + + float tmp = 0.f; +#pragma unroll + for (int col0 = 0; col0 < ncols; col0 += block_size) { + const int col = col0 + tid; + if (ncols_template == 0 && col >= ncols) { + break; + } + + const float val = sycl::native::exp(vals[col] - max_val); + tmp += val; + vals[col] = val; + } + + // find the sum of exps in the block + tmp = warp_reduce_sum(tmp, item_ct1); + if (block_size > WARP_SIZE) { + item_ct1.barrier(sycl::access::fence_space::local_space); + if (warp_id == 0) { + buf[lane_id] = 0.f; + for (size_t i = 1; i < nreduce; i += 1) + buf[lane_id + i * WARP_SIZE] = 0.f; + } + item_ct1.barrier(sycl::access::fence_space::local_space); + + if (lane_id == 0) { + buf[warp_id] = tmp; + } + item_ct1.barrier(sycl::access::fence_space::local_space); + + tmp = buf[lane_id]; + for (size_t i = 1; i < nreduce; i += 1) + { + tmp += buf[lane_id + i * WARP_SIZE]; + } + tmp = warp_reduce_sum(tmp, item_ct1); + } + + const float inv_sum = 1.f / tmp; + +#pragma unroll + for (int col0 = 0; col0 < ncols; col0 += block_size) { + const int col = col0 + tid; + + if (ncols_template == 0 && col >= ncols) { + return; + } + + const int idst = rowx*ncols + col; + dst[idst] = vals[col] * inv_sum; + } +} + +template <bool vals_smem, int ncols_template, int block_size_template> +static void soft_max_f32_submitter(const float * x, const float * mask, float * dst, const int ncols_par, + const int nrows_y, const float scale, const float max_bias, const float m0, + const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims, + const size_t n_local_scratch, queue_ptr stream) { + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor<float, 1> local_buf_acc(n_local_scratch, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par, + nrows_y, scale, max_bias, m0, + m1, n_head_log2, item_ct1, + get_pointer(local_buf_acc)); + }); + }); +} + +static void soft_max_f32_sycl(const float * x, const float * mask, + float * dst, const int ncols_x, const int nrows_x, + const int nrows_y, const float scale, const float max_bias, + queue_ptr stream, int device) { + int nth = WARP_SIZE; + int max_block_size = ggml_sycl_info().max_work_group_sizes[device]; + while (nth < ncols_x && nth < max_block_size) nth *= 2; + if (nth>max_block_size) nth = max_block_size; + + const sycl::range<3> block_dims(1, 1, nth); + const sycl::range<3> block_nums(1, 1, nrows_x); + const size_t n_val_tmp = nth / WARP_SIZE; + const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + n_val_tmp); + + const uint32_t n_head_kv = nrows_x/nrows_y; + const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); + + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + + const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>(); + if (n_local_scratch*sizeof(float) < local_mem_size) { + if (ncols_x > max_block_size) { + soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale, + max_bias, m0, m1, n_head_log2, block_nums, + block_dims, n_local_scratch, stream); + return; + } + switch (ncols_x) { + case 32: + soft_max_f32_submitter<true, 32, 32>(x, mask, dst, ncols_x, nrows_y, scale, + max_bias, m0, m1, n_head_log2, block_nums, + block_dims, n_local_scratch, stream); + break; + case 64: + soft_max_f32_submitter<true, 64, 64>(x, mask, dst, ncols_x, nrows_y, scale, + max_bias, m0, m1, n_head_log2, block_nums, + block_dims, n_local_scratch, stream); + break; + case 128: + soft_max_f32_submitter<true, 128, 128>(x, mask, dst, ncols_x, nrows_y, scale, + max_bias, m0, m1, n_head_log2, block_nums, + block_dims, n_local_scratch, stream); + break; + case 256: + soft_max_f32_submitter<true, 256, 256>(x, mask, dst, ncols_x, nrows_y, scale, + max_bias, m0, m1, n_head_log2, block_nums, + block_dims, n_local_scratch, stream); + break; + case 512: + soft_max_f32_submitter<true, 512, 512>(x, mask, dst, ncols_x, nrows_y, scale, + max_bias, m0, m1, n_head_log2, block_nums, + block_dims, n_local_scratch, stream); + break; + case 1024: + soft_max_f32_submitter<true, 1024, 1024>(x, mask, dst, ncols_x, nrows_y, scale, + max_bias, m0, m1, n_head_log2, block_nums, + block_dims, n_local_scratch, stream); + break; + case 2048: + soft_max_f32_submitter<true, 2048, 1024>(x, mask, dst, ncols_x, nrows_y, scale, + max_bias, m0, m1, n_head_log2, block_nums, + block_dims, n_local_scratch, stream); + break; + case 4096: + soft_max_f32_submitter<true, 4096, 1024>(x, mask, dst, ncols_x, nrows_y, scale, + max_bias, m0, m1, n_head_log2, block_nums, + block_dims, n_local_scratch, stream); + break; + default: + soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale, + max_bias, m0, m1, n_head_log2, block_nums, + block_dims, n_local_scratch, stream); + break; + } + } else { + soft_max_f32_submitter<false, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale, + max_bias, m0, m1, n_head_log2, block_nums, + block_dims, WARP_SIZE, stream); + } +} + +void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + +#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 support") +#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021") + GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows_x = ggml_nrows(src0); + const int64_t nrows_y = src0->ne[1]; + + float scale = 1.0f; + float max_bias = 0.0f; + + memcpy(&scale, dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, dst->op_params + 1, sizeof(float)); + + soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, + nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device); +} diff --git a/ggml/src/ggml-sycl/softmax.hpp b/ggml/src/ggml-sycl/softmax.hpp new file mode 100644 index 00000000..bdb8f712 --- /dev/null +++ b/ggml/src/ggml-sycl/softmax.hpp @@ -0,0 +1,24 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_SOFTMAX_HPP +#define GGML_SYCL_SOFTMAX_HPP + +#include "common.hpp" + +void ggml_sycl_op_soft_max(ggml_backend_sycl_context &ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const queue_ptr &main_stream); + +#endif // GGML_SYCL_SOFTMAX_HPP diff --git a/ggml/src/ggml-sycl/vecdotq.hpp b/ggml/src/ggml-sycl/vecdotq.hpp new file mode 100644 index 00000000..d2dccade --- /dev/null +++ b/ggml/src/ggml-sycl/vecdotq.hpp @@ -0,0 +1,1140 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_VECDOTQ_HPP +#define GGML_SYCL_VECDOTQ_HPP + +#include "dpct/helper.hpp" + +typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); + +static __dpct_inline__ int get_int_from_int8(const int8_t* x8, const int& i32) { + const uint16_t* x16 = + (const uint16_t*)(x8 + sizeof(int) * i32); // assume at least 2 byte + // alignment + + int x32 = 0; + x32 |= x16[0] << 0; + x32 |= x16[1] << 16; + + return x32; +} + +static __dpct_inline__ int get_int_from_uint8( + const uint8_t* x8, + const int& i32) { + const uint16_t* x16 = + (const uint16_t*)(x8 + sizeof(int) * i32); // assume at least 2 byte + // alignment + + int x32 = 0; + x32 |= x16[0] << 0; + x32 |= x16[1] << 16; + + return x32; +} + +static __dpct_inline__ int get_int_from_int8_aligned( + const int8_t* x8, + const int& i32) { + return *( + (const int*)(x8 + sizeof(int) * i32)); // assume at least 4 byte alignment +} + +static __dpct_inline__ int get_int_from_uint8_aligned( + const uint8_t* x8, + const int& i32) { + return *( + (const int*)(x8 + sizeof(int) * i32)); // assume at least 4 byte alignment +} + +static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4, + const uint8_t *values, + int &val1, int &val2) { + + uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32; + aux32 = q4 & 0x0f0f0f0f; + uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8); + uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8); + val1 = v1 | (v2 << 16); + aux32 = (q4 >> 4) & 0x0f0f0f0f; + v1 = values[q8[0]] | (values[q8[1]] << 8); + v2 = values[q8[2]] | (values[q8[3]] << 8); + val2 = v1 | (v2 << 16); +} + +#define VDR_Q2_K_Q8_1_MMVQ 1 + +// contiguous v/x values +static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq( + const int &v, const int *__restrict__ u, const uint8_t *__restrict__ scales, + const sycl::half2 &dm2, const float *__restrict__ d8) { + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR2_K; ++i) { + const int sc = scales[2*i]; + + const int vi = (v >> (2*i)) & 0x03030303; + + sumf_d += + d8[i] * (dpct::dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product + + // fill int with 4x m + int m = sc >> 4; + m |= m << 8; + m |= m << 16; + sumf_m += d8[i] * + dpct::dp4a( + m, u[i], + 0); // multiply constant q2_K part with sum of q8_1 values + } + + const sycl::float2 dm2f = + dm2.convert<float, sycl::rounding_mode::automatic>(); + + return dm2f.x() * sumf_d - dm2f.y() * sumf_m; +} + + +#define VDR_Q3_K_Q8_1_MMVQ 1 + +// contiguous v/x values +static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq( + const int &vl, const int &vh, const int *__restrict__ u, + const uint8_t *__restrict__ scales, const int &scale_offset, + const float &d3, const float *__restrict__ d8) { + + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR3_K; ++i) { + const int isc = scale_offset + 2*i; + + const int isc_low = isc % (QK_K/32); + const int sc_shift_low = 4 * (isc / (QK_K/32)); + const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF; + + const int isc_high = isc % (QK_K/64); + const int sc_shift_high = 2 * (isc / (QK_K/64)); + const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4; + + const int sc = (sc_low | sc_high) - 32; + + const int vil = (vl >> (2*i)) & 0x03030303; + + const int vih = ((vh >> i) << 2) & 0x04040404; + + const int vi = + dpct::vectorized_binary<sycl::char4>(vil, vih, dpct::sub_sat()); + + sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d3 * sumf; +} + +#define VDR_Q4_K_Q8_1_MMVQ 2 + +// contiguous v/x values +static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq( + const int *__restrict__ v, const int *__restrict__ u, + const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, + const sycl::half2 &dm4, const float *__restrict__ d8) { + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR4_K; ++i) { + const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F; + const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F; + + const int dot1 = + dpct::dp4a(v1i, u[2 * i + 1], + dpct::dp4a(v0i, u[2 * i + 0], 0)); // SIMD dot product + const int dot2 = + dpct::dp4a(0x01010101, u[2 * i + 1], + dpct::dp4a(0x01010101, u[2 * i + 0], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values + } + + const sycl::float2 dm4f = + dm4.convert<float, sycl::rounding_mode::automatic>(); + + return dm4f.x() * sumf_d - dm4f.y() * sumf_m; +} + + +#define VDR_Q5_K_Q8_1_MMVQ 2 + +// contiguous v/x values +static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq( + const int *__restrict__ vl, const int *__restrict__ vh, + const int *__restrict__ u, const uint8_t *__restrict__ sc, + const uint8_t *__restrict__ m, const sycl::half2 &dm5, + const float *__restrict__ d8) { + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K; ++i) { + const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F; + const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F; + + const int vh0i = ((vh[0] >> i) << 4) & 0x10101010; + const int vh1i = ((vh[1] >> i) << 4) & 0x10101010; + + const int v0i = vl0i | vh0i; + const int v1i = vl1i | vh1i; + + const int dot1 = + dpct::dp4a(v0i, u[2 * i + 0], + dpct::dp4a(v1i, u[2 * i + 1], 0)); // SIMD dot product + const int dot2 = + dpct::dp4a(0x01010101, u[2 * i + 0], + dpct::dp4a(0x01010101, u[2 * i + 1], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); + + } + + const sycl::float2 dm5f = + dm5.convert<float, sycl::rounding_mode::automatic>(); + + return dm5f.x() * sumf_d - dm5f.y() * sumf_m; +} + + +#define VDR_Q6_K_Q8_1_MMVQ 1 + +// contiguous v/x values +static __dpct_inline__ float +vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh, + const int *__restrict__ u, + const int8_t *__restrict__ scales, const float &d, + const float *__restrict__ d8) { + + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + const int sc = scales[4*i]; + + const int vil = (vl >> (4*i)) & 0x0F0F0F0F; + + const int vih = ((vh >> (4*i)) << 4) & 0x30303030; + + const int vi = dpct::vectorized_binary<sycl::char4>( + (vil | vih), 0x20202020, dpct::sub_sat()); // vi = (vil | vih) - 32 + + sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d*sumf; +} + +// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called +// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q + +#define VDR_Q4_0_Q8_1_MMVQ 2 +#define VDR_Q4_0_Q8_1_MMQ 4 + +template <int vdr> +static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u, + const float &d4, + const sycl::half2 &ds8) { + int sumi = 0; +#pragma unroll + for (int i = 0; i < vdr; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi); + sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi); + } + + const sycl::float2 ds8f = + ds8.convert<float, sycl::rounding_mode::automatic>(); + + // second part effectively subtracts 8 from each quant value + return d4 * (sumi * ds8f.x() - (8 * vdr / QI4_0) * ds8f.y()); +} + +#define VDR_Q4_1_Q8_1_MMVQ 2 +#define VDR_Q4_1_Q8_1_MMQ 4 + +template <int vdr> +static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u, + const sycl::half2 &dm4, + const sycl::half2 &ds8) { + + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi); + sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi); + } + +#ifdef GGML_SYCL_F16 + const sycl::float2 tmp = + (dm4 * ds8).convert<float, sycl::rounding_mode::automatic>(); + const float d4d8 = tmp.x(); + const float m4s8 = tmp.y(); +#else + const sycl::float2 dm4f = + dm4.convert<float, sycl::rounding_mode::automatic>(); + const sycl::float2 ds8f = + ds8.convert<float, sycl::rounding_mode::automatic>(); + const float d4d8 = dm4f.x() * ds8f.x(); + const float m4s8 = dm4f.y() * ds8f.y(); +#endif // GGML_SYCL_F16 + + // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it + return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); +} + +#define VDR_Q5_0_Q8_1_MMVQ 2 +#define VDR_Q5_0_Q8_1_MMQ 4 + +template <int vdr> +static __dpct_inline__ float +vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u, + const float &d5, const sycl::half2 &ds8) { + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 + vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 + vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 + vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 + sumi = dpct::dp4a(vi0, u[2 * i + 0], + sumi); // SIMD dot product of quantized values + + int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 + vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 + sumi = dpct::dp4a(vi1, u[2 * i + 1], + sumi); // SIMD dot product of quantized values + } + + const sycl::float2 ds8f = + ds8.convert<float, sycl::rounding_mode::automatic>(); + + // second part effectively subtracts 16 from each quant value + return d5 * (sumi * ds8f.x() - (16 * vdr / QI5_0) * ds8f.y()); +} + +#define VDR_Q5_1_Q8_1_MMVQ 2 +#define VDR_Q5_1_Q8_1_MMQ 4 + +template <int vdr> +static __dpct_inline__ float +vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u, + const sycl::half2 &dm5, const sycl::half2 &ds8) { + + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 + vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 + vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 + vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 + sumi = dpct::dp4a(vi0, u[2 * i + 0], + sumi); // SIMD dot product of quantized values + + int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 + vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 + sumi = dpct::dp4a(vi1, u[2 * i + 1], + sumi); // SIMD dot product of quantized values + } + +#ifdef GGML_SYCL_F16 + const sycl::float2 tmp = + (dm5 * ds8).convert<float, sycl::rounding_mode::automatic>(); + const float d5d8 = tmp.x(); + const float m5s8 = tmp.y(); + + +#else + const sycl::float2 dm5f = + dm5.convert<float, sycl::rounding_mode::automatic>(); + const sycl::float2 ds8f = + ds8.convert<float, sycl::rounding_mode::automatic>(); + const float d5d8 = dm5f.x() * ds8f.x(); + const float m5s8 = dm5f.y() * ds8f.y(); +#endif // GGML_SYCL_F16 + + // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it + return sumi*d5d8 + m5s8 / (QI5_1 / vdr); +} + +#define VDR_Q8_0_Q8_1_MMVQ 2 +#define VDR_Q8_0_Q8_1_MMQ 8 + +template <int vdr> +static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u, + const float &d8_0, + const float &d8_1) { + + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + // SIMD dot product of quantized values + sumi = dpct::dp4a(v[i], u[i], sumi); + } + + return d8_0*d8_1 * sumi; +} + +template <int vdr> +static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u, + const sycl::half2 &dm8, + const sycl::half2 &ds8) { + + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + // SIMD dot product of quantized values + sumi = dpct::dp4a(v[i], u[i], sumi); + } + +#ifdef GGML_SYCL_F16 + const sycl::float2 tmp = + (dm8 * ds8).convert<float, sycl::rounding_mode::automatic>(); + const float d8d8 = tmp.x(); + const float m8s8 = tmp.y(); +#else + const sycl::float2 dm8f = + dm8.convert<float, sycl::rounding_mode::automatic>(); + const sycl::float2 ds8f = + ds8.convert<float, sycl::rounding_mode::automatic>(); + const float d8d8 = dm8f.x() * ds8f.x(); + const float m8s8 = dm8f.y() * ds8f.y(); +#endif // GGML_SYCL_F16 + + // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it + return sumi*d8d8 + m8s8 / (QI8_1 / vdr); +} + +static __dpct_inline__ float +vec_dot_q4_0_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; + + int v[VDR_Q4_0_Q8_1_MMVQ]; + int u[2*VDR_Q4_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_uint8(bq4_0->qs, iqs + i); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0); + } + + return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds); +} + +static __dpct_inline__ float +vec_dot_q4_1_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; + + int v[VDR_Q4_1_Q8_1_MMVQ]; + int u[2*VDR_Q4_1_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1); + } + + return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds); +} + +static __dpct_inline__ float +vec_dot_q5_0_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; + + int vl[VDR_Q5_0_Q8_1_MMVQ]; + int vh[VDR_Q5_0_Q8_1_MMVQ]; + int u[2*VDR_Q5_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) { + vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i); + vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0); + } + + return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds); +} + +static __dpct_inline__ float +vec_dot_q5_1_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; + + int vl[VDR_Q5_1_Q8_1_MMVQ]; + int vh[VDR_Q5_1_Q8_1_MMVQ]; + int u[2*VDR_Q5_1_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) { + vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i); + vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1); + } + + return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds); +} + +static __dpct_inline__ float +vec_dot_q8_0_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; + + int v[VDR_Q8_0_Q8_1_MMVQ]; + int u[VDR_Q8_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_int8(bq8_0->qs, iqs + i); + u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + } + + return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, + bq8_1->ds[0]); +} + +static __dpct_inline__ float +vec_dot_q2_K_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q2_K * bq2_K = (const block_q2_K *) vbq; + + const int bq8_offset = QR2_K * (iqs / QI8_1); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + const uint8_t * scales = bq2_K->scales + scale_offset; + + const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs); + int u[QR2_K]; + float d8[QR2_K]; + +#pragma unroll + for (int i = 0; i < QR2_K; ++ i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + d8[i] = bq8_1[bq8_offset + i].ds[0]; + } + + return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); +} + +static __dpct_inline__ float +vec_dot_q3_K_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q3_K * bq3_K = (const block_q3_K *) vbq; + + const int bq8_offset = QR3_K * (iqs / (QI3_K/2)); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + const float d = bq3_K->d; + + const int vl = get_int_from_uint8(bq3_K->qs, iqs); + + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset; + + int u[QR3_K]; + float d8[QR3_K]; + +#pragma unroll + for (int i = 0; i < QR3_K; ++i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + d8[i] = bq8_1[bq8_offset + i].ds[0]; + } + + return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); +} + +static __dpct_inline__ float +vec_dot_q4_K_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + +#ifndef GGML_QKK_64 + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + int v[2]; + int u[2*QR4_K]; + float d8[QR4_K]; + + // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6 + const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2)); + + // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12 + // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44 + // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76 + // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108 + + const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); + v[0] = q4[0]; + v[1] = q4[4]; + + const uint16_t * scales = (const uint16_t *)bq4_K->scales; + uint16_t aux[2]; + const int j = bq8_offset/2; + if (j < 2) { + aux[0] = scales[j+0] & 0x3f3f; + aux[1] = scales[j+2] & 0x3f3f; + } else { + aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); + aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; + + for (int i = 0; i < QR4_K; ++i) { + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + d8[i] = bq8i->ds[0]; + + const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); + u[2*i+0] = q8[0]; + u[2*i+1] = q8[4]; + } + + return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8); + +#else + +#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + + uint16_t aux16[2]; + const uint8_t * s = (const uint8_t *)aux16; + + const uint16_t * a = (const uint16_t *)bq4_K->scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + + const float dall = bq4_K->dm[0]; + const float dmin = bq4_K->dm[1]; + + const float d8_1 = bq8_1[0].ds[0]; + const float d8_2 = bq8_1[1].ds[1]; + + const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); + const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); + const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); + const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); + + const int * q4 = (const int *)bq4_K->qs + (iqs/2); + const int v1 = q4[0]; + const int v2 = q4[4]; + + const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0)); + const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0)); + const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0)); + const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0)); + + sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]); + sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]); + + return dall * sumf_d - dmin * sumf_m; + +#else + bad_arch(); +#endif // __SYCL_ARCH__ >= VER_4VEC + +#endif +} + +static __dpct_inline__ float +vec_dot_q5_K_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + +#ifndef GGML_QKK_64 + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + int vl[2]; + int vh[2]; + int u[2*QR5_K]; + float d8[QR5_K]; + + const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2)); + const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); + const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4)); + + vl[0] = ql[0]; + vl[1] = ql[4]; + + vh[0] = qh[0] >> bq8_offset; + vh[1] = qh[4] >> bq8_offset; + + const uint16_t * scales = (const uint16_t *)bq5_K->scales; + uint16_t aux[2]; + const int j = bq8_offset/2; + if (j < 2) { + aux[0] = scales[j+0] & 0x3f3f; + aux[1] = scales[j+2] & 0x3f3f; + } else { + aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); + aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; + +#pragma unroll + for (int i = 0; i < QR5_K; ++i) { + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + d8[i] = bq8i->ds[0]; + + const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); + u[2*i+0] = q8[0]; + u[2*i+1] = q8[4]; + } + + return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8); + +#else + +#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + const int8_t * s = bq5_K->scales; + + const float d = bq5_K->d; + + const float d8_1 = bq8_1[0].ds[0]; + const float d8_2 = bq8_1[1].ds[1]; + + const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); + const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); + const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); + const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); + + const int * ql = (const int *)bq5_K->qs + (iqs/2); + const int vl1 = ql[0]; + const int vl2 = ql[4]; + + const int step = 4 * (iqs/2); // 0, 4, 8, 12 + const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6 + const int in = step%8; // 0, 4, 0, 4 + const int vh = (*((const int *)(bq5_K->qh + in))) >> im; + + const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f); + const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f); + const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f); + const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f); + + const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1]) + + d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]); + + return d * sumf_d; + +#else + bad_arch(); +#endif // __SYCL_ARCH__ >= VER_4VEC + +#endif +} + +static __dpct_inline__ float +vec_dot_q6_K_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q6_K * bq6_K = (const block_q6_K *) vbq; + + const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4); + const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8); + const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4)); + + const int vl = get_int_from_uint8(bq6_K->ql, iqs); + const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift; + + const int8_t * scales = bq6_K->scales + scale_offset; + + int u[QR6_K]; + float d8[QR6_K]; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); + d8[i] = bq8_1[bq8_offset + 2 * i].ds[0]; + } + + return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); +} + + +static __dpct_inline__ float +vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs, + const uint8_t *kmask_iq2xs) { +#if QK_K == 256 + const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq; + + const int ib32 = iqs; + const uint16_t * q2 = bq2->qs + 4*ib32; + const uint8_t * aux8 = (const uint8_t *)q2; + const int8_t * q8 = bq8_1[ib32].qs; + uint32_t aux32 = q2[2] | (q2[3] << 16); + int sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); + const uint8_t signs = ksigns_iq2xs[aux32 & 127]; + for (int j = 0; j < 8; ++j) { + sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + aux32 >>= 7; + } + const float d = (float)bq2->d * (0.5f + aux32) * bq8_1[ib32].ds[0] * 0.25f; + return d * sumi; +#else + assert(false); + return 0.f; +#endif +} + +static __dpct_inline__ float +vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const uint64_t *iq2xs_grid, const uint64_t *ksigns64) { +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics +#if QK_K == 256 + const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq; + + const int ib32 = iqs; + const uint16_t * q2 = bq2->qs + 4*ib32; + const int8_t * q8 = bq8_1[ib32].qs; + const uint8_t ls1 = bq2->scales[ib32] & 0xf; + const uint8_t ls2 = bq2->scales[ib32] >> 4; + int sumi1 = 0; + for (int l = 0; l < 2; ++l) { + const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511)); + const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9)); + const int grid_l = dpct::vectorized_binary<sycl::uchar4>( + grid[0] ^ signs[0], signs[0], std::minus<>()); + const int grid_h = dpct::vectorized_binary<sycl::uchar4>( + grid[1] ^ signs[1], signs[1], std::minus<>()); + sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1); + sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1); + q8 += 8; + } + int sumi2 = 0; + for (int l = 2; l < 4; ++l) { + const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511)); + const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9)); + const int grid_l = dpct::vectorized_binary<sycl::uchar4>( + grid[0] ^ signs[0], signs[0], std::minus<>()); + const int grid_h = dpct::vectorized_binary<sycl::uchar4>( + grid[1] ^ signs[1], signs[1], std::minus<>()); + sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2); + sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2); + q8 += 8; + } + const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f; + return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); +#else + assert(false); + return 0.f; +#endif +#else + assert(false); + return 0.f; +#endif +} + +static __dpct_inline__ float +vec_dot_iq2_s_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { +#if QK_K == 256 + const block_iq2_s * bq2 = (const block_iq2_s *) vbq; + + const int ib32 = iqs; + const int8_t * q8 = bq8_1[ib32].qs; + const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32; + const uint8_t ls1 = bq2->scales[ib32] & 0xf; + const uint8_t ls2 = bq2->scales[ib32] >> 4; + int sumi1 = 0; + for (int l = 0; l < 2; ++l) { + const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300))); + const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>( + ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, + std::equal_to<>()); + const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>( + ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, + std::equal_to<>()); + const int grid_l = dpct::vectorized_binary<sycl::uchar4>( + grid[0] ^ signs0, signs0, std::minus<>()); + const int grid_h = dpct::vectorized_binary<sycl::uchar4>( + grid[1] ^ signs1, signs1, std::minus<>()); + sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1); + sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1); + q8 += 8; + } + int sumi2 = 0; + for (int l = 2; l < 4; ++l) { + const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300))); + const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>( + ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, + std::equal_to<>()); + const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>( + ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, + std::equal_to<>()); + const int grid_l = dpct::vectorized_binary<sycl::uchar4>( + grid[0] ^ signs0, signs0, std::minus<>()); + const int grid_h = dpct::vectorized_binary<sycl::uchar4>( + grid[1] ^ signs1, signs1, std::minus<>()); + sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2); + sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2); + q8 += 8; + } + const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f; + return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); +#else + assert(false); +#endif +} + +static __dpct_inline__ float +vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) { +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics +#if QK_K == 256 + const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq; + + const int ib32 = iqs; + const uint8_t * q3 = bq2->qs + 8*ib32; + const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32; + const int8_t * q8 = bq8_1[ib32].qs; + uint32_t aux32 = gas[0] | (gas[1] << 16); + int sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0]; + const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1]; + const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127)); + const int grid_l = dpct::vectorized_binary<sycl::uchar4>( + grid1[0] ^ signs[0], signs[0], std::minus<>()); + const int grid_h = dpct::vectorized_binary<sycl::uchar4>( + grid2[0] ^ signs[1], signs[1], std::minus<>()); + sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi); + sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi); + q8 += 8; + aux32 >>= 7; + } + const float d = (float)bq2->d * (0.5f + aux32) * bq8_1[ib32].ds[0] * 0.5f; + return d * sumi; +#else + assert(false); + return 0.f; +#endif +#else + assert(false); + return 0.f; +#endif +} + +static __dpct_inline__ float +vec_dot_iq3_s_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const uint32_t *iq3s_grid) { +#if QK_K == 256 + const block_iq3_s * bq2 = (const block_iq3_s *) vbq; + + const int ib32 = iqs; + const uint8_t * qs = bq2->qs + 8*ib32; + const int8_t * q8 = bq8_1[ib32].qs; + int sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)); + const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)); + uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>( + ((bq2->signs[4 * ib32 + l] & 0xf) * 0x01010101) & 0x08040201, + 0x08040201, std::equal_to<>()); + uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>( + ((bq2->signs[4 * ib32 + l] >> 4) * 0x01010101) & 0x08040201, + 0x08040201, std::equal_to<>()); + const int grid_l = dpct::vectorized_binary<sycl::uchar4>( + grid1[0] ^ signs0, signs0, std::minus<>()); + const int grid_h = dpct::vectorized_binary<sycl::uchar4>( + grid2[0] ^ signs1, signs1, std::minus<>()); + sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi); + sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi); + q8 += 8; + } + const float d = + (float)bq2->d * + (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) * + bq8_1[ib32].ds[0]; + return d * sumi; +#else + assert(false); +#endif +} + +static __dpct_inline__ float +vec_dot_iq1_s_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const uint32_t *iq1s_grid_gpu) { +#if QK_K == 256 + const block_iq1_s * bq1 = (const block_iq1_s *) vbq; + + const int ib32 = iqs; + int sumi = 0; + const int * q8 = (const int *)bq8_1[ib32].qs; + for (int l = 0; l < 4; ++l) { + const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8))); + int grid0 = grid[0] & 0x0f0f0f0f; + int grid1 = (grid[0] >> 4) & 0x0f0f0f0f; + sumi = dpct::dp4a(q8[2 * l + 1], grid1, + dpct::dp4a(q8[2 * l + 0], grid0, sumi)); + } + + const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA; + const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1); + const float d = d1q * bq8_1[ib32].ds[0]; + const float m = d1q * bq8_1[ib32].ds[1]; + return d * sumi + m * delta; +#else + assert(false); +#endif +} + +static __dpct_inline__ float +vec_dot_iq1_m_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { +#if QK_K == 256 + const block_iq1_m * bq1 = (const block_iq1_m *) vbq; + + const int ib32 = iqs; + int sumi[2] = {0, 0}; + float sumf[2] = {0.f, 0.f}; + + const int * q8 = (const int *)bq8_1[ib32].qs; + for (int l = 0; l < 4; ++l) { + const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8))); + int grid0 = grid[0] & 0x0f0f0f0f; + int grid1 = (grid[0] >> 4) & 0x0f0f0f0f; + sumi[l / 2] = dpct::dp4a(q8[2 * l + 1], grid1, + dpct::dp4a(q8[2 * l + 0], grid0, sumi[l / 2])); + const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA; + const int sumy = dpct::dp4a(q8[2 * l + 1], 0x01010101, + dpct::dp4a(q8[2 * l + 0], 0x01010101, 0)); + sumf[l/2] += delta*sumy; + } + + iq1m_scale_t scale; + const uint16_t * sc = (const uint16_t *)bq1->scales; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + const float d = (float)scale.f16 * bq8_1[ib32].ds[0]; + return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1)); +#else + assert(false); +#endif +} + + +static __dpct_inline__ float +vec_dot_iq4_nl_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_iq4_nl * bq = (const block_iq4_nl *) vbq; + + const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs; + const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs; + + const uint8_t * values = (const uint8_t *)kvalues_iq4nl; + + int v1, v2; + int sumi1 = 0, sumi2 = 0; + for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) { + const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16); + get_int_from_table_16(aux, values, v1, v2); + sumi1 = dpct::dp4a(v1, q8[l + 0], sumi1); + sumi2 = dpct::dp4a(v2, q8[l + 4], sumi2); + } + + const float d = (float)bq->d * bq8_1->ds[0]; + return d * (sumi1 + sumi2); +} + + +static __dpct_inline__ float +vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + +#if QK_K == 256 + const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq; + const uint8_t * values = (const uint8_t *)kvalues_iq4nl; + + // iqs is 0...7 + const int ib32 = iqs; + const int32_t * q8 = (const int *)bq8_1[ib32].qs; + const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32; + const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4); + const float d = (float)bq4->d * (ls - 32) * bq8_1[ib32].ds[0]; + int v1, v2; + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < 4; ++j) { + get_int_from_table_16(q4[j], values, v1, v2); + sumi1 = dpct::dp4a(v1, q8[j + 0], sumi1); + sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2); + } + return d * (sumi1 + sumi2); +#else + assert(false); +#endif +} + +#endif // GGML_SYCL_VECDOTQ_HPP |