diff options
author | Kawrakow <48489457+ikawrakow@users.noreply.github.com> | 2024-07-27 07:55:01 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-07-27 07:55:01 +0200 |
commit | 154e0d75fccf1784fe9ff6fd76a630b66563da3d (patch) | |
tree | 81ce6dbb5b1900c1aa78a879f0593c694cab9d27 /ggml-sycl/dpct/helper.hpp | |
parent | 0684c3e9c70d49323b4fc517128cbe222cab7f96 (diff) |
Merge mainline llama.cpp (#3)
* Merging mainline - WIP
* Merging mainline - WIP
AVX2 and CUDA appear to work.
CUDA performance seems slightly (~1-2%) lower as it is so often
the case with llama.cpp/ggml after some "improvements" have been made.
* Merging mainline - fix Metal
* Remove check
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml-sycl/dpct/helper.hpp')
-rw-r--r-- | ggml-sycl/dpct/helper.hpp | 2936 |
1 files changed, 0 insertions, 2936 deletions
diff --git a/ggml-sycl/dpct/helper.hpp b/ggml-sycl/dpct/helper.hpp deleted file mode 100644 index 1ff29721..00000000 --- a/ggml-sycl/dpct/helper.hpp +++ /dev/null @@ -1,2936 +0,0 @@ -// -// MIT license -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: MIT -// - -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// - -#ifndef GGML_SYCL_DPCT_HELPER_HPP -#define GGML_SYCL_DPCT_HELPER_HPP - -#include <sycl/sycl.hpp> -#include <sycl/half_type.hpp> -#include <oneapi/mkl.hpp> -#include <map> - -#include "ggml.h" - -#if defined(__linux__) -#include <sys/mman.h> -#elif defined(_WIN64) -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include <windows.h> -#else -#error "Only support Windows and Linux." -#endif - -#if defined(__linux__) -#include <unistd.h> -#include <sys/syscall.h> -#endif -#if defined(_WIN64) -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include <windows.h> -#endif - -#define DPCT_COMPATIBILITY_TEMP (900) - -#if defined(_MSC_VER) -#define __dpct_align__(n) __declspec(align(n)) -#define __dpct_inline__ __forceinline -#else -#define __dpct_align__(n) __attribute__((aligned(n))) -#define __dpct_inline__ __inline__ __attribute__((always_inline)) -#endif - -#if defined(_MSC_VER) -#define __dpct_noinline__ __declspec(noinline) -#else -#define __dpct_noinline__ __attribute__((noinline)) -#endif - -inline std::string get_device_type_name(const sycl::device &Device) { - auto DeviceType = Device.get_info<sycl::info::device::device_type>(); - switch (DeviceType) { - case sycl::info::device_type::cpu: - return "cpu"; - case sycl::info::device_type::gpu: - return "gpu"; - case sycl::info::device_type::host: - return "host"; - case sycl::info::device_type::accelerator: - return "acc"; - default: - return "unknown"; - } -} - -inline std::string get_device_backend_and_type(const sycl::device &device) { - std::stringstream device_type; - sycl::backend backend = device.get_backend(); - device_type << backend << ":" << get_device_type_name(device); - return device_type.str(); -} - -namespace dpct -{ - typedef sycl::queue *queue_ptr; - typedef sycl::event *event_ptr; - typedef char *device_ptr; - typedef uint8_t byte_t; - typedef sycl::buffer<byte_t> buffer_t; - - /// SYCL default exception handler - inline auto exception_handler = [](sycl::exception_list exceptions) - { - for (std::exception_ptr const &e : exceptions) - { - try - { - std::rethrow_exception(e); - } - catch (sycl::exception const &e) - { - std::cerr << "Caught asynchronous SYCL exception:" << std::endl - << e.what() << std::endl - << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - } - } - }; - - enum error_code - { - success = 0, - default_error = 999 - }; - - enum memcpy_direction - { - host_to_host, - host_to_device, - device_to_host, - device_to_device, - automatic - }; - - enum memory_region - { - global = 0, // device global memory - constant, // device constant memory - local, // device local memory - shared, // memory which can be accessed by host and device - }; - - enum class library_data_t : unsigned char - { - real_float = 0, - complex_float, - real_double, - complex_double, - real_half, - complex_half, - real_bfloat16, - complex_bfloat16, - real_int4, - complex_int4, - real_uint4, - complex_uint4, - real_int8, - complex_int8, - real_uint8, - complex_uint8, - real_int16, - complex_int16, - real_uint16, - complex_uint16, - real_int32, - complex_int32, - real_uint32, - complex_uint32, - real_int64, - complex_int64, - real_uint64, - complex_uint64, - real_int8_4, - real_int8_32, - real_uint8_4, - library_data_t_size - }; - - template <typename T> - struct DataType - { - using T2 = T; - }; - template <typename T> - struct DataType<sycl::vec<T, 2>> - { - using T2 = std::complex<T>; - }; - - static void destroy_event(event_ptr event) - { - delete event; - } - - static inline unsigned int get_tid() - { -#if defined(__linux__) - return syscall(SYS_gettid); -#elif defined(_WIN64) - return GetCurrentThreadId(); -#else -#error "Only support Windows and Linux." -#endif - } - - namespace detail - { - static void get_version(const sycl::device &dev, int &major, int &minor) - { - // Version string has the following format: - // a. OpenCL<space><major.minor><space><vendor-specific-information> - // b. <major.minor> - // c. <AmdGcnArchName> e.g gfx1030 - std::string ver; - ver = dev.get_info<sycl::info::device::version>(); - std::string::size_type i = 0; - while (i < ver.size()) { - if (isdigit(ver[i])) - break; - i++; - } - major = std::stoi(&(ver[i])); - while (i < ver.size()) { - if (ver[i] == '.') - break; - i++; - } - if (i < ver.size()) { - // a. and b. - i++; - minor = std::stoi(&(ver[i])); - } else { - // c. - minor = 0; - } - } - - template <typename tag, typename T> - class generic_error_type - { - public: - generic_error_type() = default; - generic_error_type(T value) : value{value} {} - operator T() const { return value; } - - private: - T value; - }; - - } // namespace detail - - /// Pitched 2D/3D memory data. - class pitched_data - { - public: - pitched_data() : pitched_data(nullptr, 0, 0, 0) {} - pitched_data(void *data, size_t pitch, size_t x, size_t y) - : _data(data), _pitch(pitch), _x(x), _y(y) {} - - void *get_data_ptr() { return _data; } - void set_data_ptr(void *data) { _data = data; } - - size_t get_pitch() { return _pitch; } - void set_pitch(size_t pitch) { _pitch = pitch; } - - size_t get_x() { return _x; } - void set_x(size_t x) { _x = x; }; - - size_t get_y() { return _y; } - void set_y(size_t y) { _y = y; } - - private: - void *_data; - size_t _pitch, _x, _y; - }; - - class device_info - { - public: - // get interface - const char *get_name() const { return _name; } - char *get_name() { return _name; } - template <typename WorkItemSizesTy = sycl::range<3>, - std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> || - std::is_same_v<WorkItemSizesTy, int *>, - int> = 0> - auto get_max_work_item_sizes() const - { - if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>) - return sycl::range<3>(_max_work_item_sizes_i[0], - _max_work_item_sizes_i[1], - _max_work_item_sizes_i[2]); - else - { - return _max_work_item_sizes_i; - } - } - template <typename WorkItemSizesTy = sycl::range<3>, - std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> || - std::is_same_v<WorkItemSizesTy, int *>, - int> = 0> - auto get_max_work_item_sizes() - { - if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>) - return sycl::range<3>(_max_work_item_sizes_i[0], - _max_work_item_sizes_i[1], - _max_work_item_sizes_i[2]); - else - { - return _max_work_item_sizes_i; - } - } - bool get_host_unified_memory() const { return _host_unified_memory; } - int get_major_version() const { return _major; } - int get_minor_version() const { return _minor; } - int get_integrated() const { return _integrated; } - int get_max_clock_frequency() const { return _frequency; } - int get_max_compute_units() const { return _max_compute_units; } - int get_max_work_group_size() const { return _max_work_group_size; } - int get_max_sub_group_size() const { return _max_sub_group_size; } - int get_max_work_items_per_compute_unit() const - { - return _max_work_items_per_compute_unit; - } - int get_max_register_size_per_work_group() const - { - return _max_register_size_per_work_group; - } - template <typename NDRangeSizeTy = size_t *, - std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> || - std::is_same_v<NDRangeSizeTy, int *>, - int> = 0> - auto get_max_nd_range_size() const - { - if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>) - return _max_nd_range_size; - else - return _max_nd_range_size_i; - } - template <typename NDRangeSizeTy = size_t *, - std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> || - std::is_same_v<NDRangeSizeTy, int *>, - int> = 0> - auto get_max_nd_range_size() - { - if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>) - return _max_nd_range_size; - else - return _max_nd_range_size_i; - } - size_t get_global_mem_size() const { return _global_mem_size; } - size_t get_local_mem_size() const { return _local_mem_size; } - size_t get_max_mem_alloc_size() const { return _max_mem_alloc_size; } - /// Returns the maximum clock rate of device's global memory in kHz. If - /// compiler does not support this API then returns default value 3200000 kHz. - unsigned int get_memory_clock_rate() const { return _memory_clock_rate; } - /// Returns the maximum bus width between device and memory in bits. If - /// compiler does not support this API then returns default value 64 bits. - unsigned int get_memory_bus_width() const { return _memory_bus_width; } - uint32_t get_device_id() const { return _device_id; } - std::array<unsigned char, 16> get_uuid() const { return _uuid; } - /// Returns global memory cache size in bytes. - unsigned int get_global_mem_cache_size() const - { - return _global_mem_cache_size; - } - - // set interface - void set_name(const char *name) - { - size_t length = strlen(name); - if (length < 256) - { - std::memcpy(_name, name, length + 1); - } - else - { - std::memcpy(_name, name, 255); - _name[255] = '\0'; - } - } - void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes) - { - for (int i = 0; i < 3; ++i) - _max_work_item_sizes_i[i] = max_work_item_sizes[i]; - } - [[deprecated]] void - set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes) - { - for (int i = 0; i < 3; ++i) - { - _max_work_item_sizes_i[i] = max_work_item_sizes[i]; - } - } - void set_host_unified_memory(bool host_unified_memory) - { - _host_unified_memory = host_unified_memory; - } - void set_major_version(int major) { _major = major; } - void set_minor_version(int minor) { _minor = minor; } - void set_integrated(int integrated) { _integrated = integrated; } - void set_max_clock_frequency(int frequency) { _frequency = frequency; } - void set_max_compute_units(int max_compute_units) - { - _max_compute_units = max_compute_units; - } - void set_global_mem_size(size_t global_mem_size) - { - _global_mem_size = global_mem_size; - } - void set_local_mem_size(size_t local_mem_size) - { - _local_mem_size = local_mem_size; - } - void set_max_mem_alloc_size(size_t max_mem_alloc_size) - { - _max_mem_alloc_size = max_mem_alloc_size; - } - void set_max_work_group_size(int max_work_group_size) - { - _max_work_group_size = max_work_group_size; - } - void set_max_sub_group_size(int max_sub_group_size) - { - _max_sub_group_size = max_sub_group_size; - } - void - set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit) - { - _max_work_items_per_compute_unit = max_work_items_per_compute_unit; - } - void set_max_nd_range_size(int max_nd_range_size[]) - { - for (int i = 0; i < 3; i++) - { - _max_nd_range_size[i] = max_nd_range_size[i]; - _max_nd_range_size_i[i] = max_nd_range_size[i]; - } - } - void set_memory_clock_rate(unsigned int memory_clock_rate) - { - _memory_clock_rate = memory_clock_rate; - } - void set_memory_bus_width(unsigned int memory_bus_width) - { - _memory_bus_width = memory_bus_width; - } - void - set_max_register_size_per_work_group(int max_register_size_per_work_group) - { - _max_register_size_per_work_group = max_register_size_per_work_group; - } - void set_device_id(uint32_t device_id) - { - _device_id = device_id; - } - void set_uuid(std::array<unsigned char, 16> uuid) - { - _uuid = std::move(uuid); - } - void set_global_mem_cache_size(unsigned int global_mem_cache_size) - { - _global_mem_cache_size = global_mem_cache_size; - } - - private: - char _name[256]; - int _max_work_item_sizes_i[3]; - bool _host_unified_memory = false; - int _major; - int _minor; - int _integrated = 0; - int _frequency; - // Set estimated value 3200000 kHz as default value. - unsigned int _memory_clock_rate = 3200000; - // Set estimated value 64 bits as default value. - unsigned int _memory_bus_width = 64; - unsigned int _global_mem_cache_size; - int _max_compute_units; - int _max_work_group_size; - int _max_sub_group_size; - int _max_work_items_per_compute_unit; - int _max_register_size_per_work_group; - size_t _global_mem_size; - size_t _local_mem_size; - size_t _max_mem_alloc_size; - size_t _max_nd_range_size[3]; - int _max_nd_range_size_i[3]; - uint32_t _device_id; - std::array<unsigned char, 16> _uuid; - }; - - static int get_major_version(const sycl::device &dev) - { - int major, minor; - detail::get_version(dev, major, minor); - return major; - } - - static int get_minor_version(const sycl::device &dev) - { - int major, minor; - detail::get_version(dev, major, minor); - return minor; - } - - static void get_device_info(device_info &out, const sycl::device &dev) - { - device_info prop; - prop.set_name(dev.get_info<sycl::info::device::name>().c_str()); - - int major, minor; - detail::get_version(dev, major, minor); - prop.set_major_version(major); - prop.set_minor_version(minor); - - prop.set_max_work_item_sizes( -#if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION < 20220902) - // oneAPI DPC++ compiler older than 2022/09/02, where max_work_item_sizes - // is an enum class element - dev.get_info<sycl::info::device::max_work_item_sizes>()); -#else - // SYCL 2020-conformant code, max_work_item_sizes is a struct templated by - // an int - dev.get_info<sycl::info::device::max_work_item_sizes<3>>()); -#endif - prop.set_host_unified_memory(dev.has(sycl::aspect::usm_host_allocations)); - - prop.set_max_clock_frequency( - dev.get_info<sycl::info::device::max_clock_frequency>() * 1000); - - prop.set_max_compute_units( - dev.get_info<sycl::info::device::max_compute_units>()); - prop.set_max_work_group_size( - dev.get_info<sycl::info::device::max_work_group_size>()); - prop.set_global_mem_size(dev.get_info<sycl::info::device::global_mem_size>()); - prop.set_local_mem_size(dev.get_info<sycl::info::device::local_mem_size>()); - prop.set_max_mem_alloc_size(dev.get_info<sycl::info::device::max_mem_alloc_size>()); - -#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6) - if (dev.has(sycl::aspect::ext_intel_memory_clock_rate)) - { - unsigned int tmp = - dev.get_info<sycl::ext::intel::info::device::memory_clock_rate>(); - if (tmp != 0) - prop.set_memory_clock_rate(1000 * tmp); - } - if (dev.has(sycl::aspect::ext_intel_memory_bus_width)) - { - prop.set_memory_bus_width( - dev.get_info<sycl::ext::intel::info::device::memory_bus_width>()); - } - if (dev.has(sycl::aspect::ext_intel_device_id)) - { - prop.set_device_id( - dev.get_info<sycl::ext::intel::info::device::device_id>()); - } - if (dev.has(sycl::aspect::ext_intel_device_info_uuid)) - { - prop.set_uuid(dev.get_info<sycl::ext::intel::info::device::uuid>()); - } -#elif defined(_MSC_VER) && !defined(__clang__) -#pragma message("get_device_info: querying memory_clock_rate and \ - memory_bus_width are not supported by the compiler used. \ - Use 3200000 kHz as memory_clock_rate default value. \ - Use 64 bits as memory_bus_width default value.") -#else -#warning "get_device_info: querying memory_clock_rate and \ - memory_bus_width are not supported by the compiler used. \ - Use 3200000 kHz as memory_clock_rate default value. \ - Use 64 bits as memory_bus_width default value." -#endif - - size_t max_sub_group_size = 1; - std::vector<size_t> sub_group_sizes = - dev.get_info<sycl::info::device::sub_group_sizes>(); - - for (const auto &sub_group_size : sub_group_sizes) - { - if (max_sub_group_size < sub_group_size) - max_sub_group_size = sub_group_size; - } - - prop.set_max_sub_group_size(max_sub_group_size); - - prop.set_max_work_items_per_compute_unit( - dev.get_info<sycl::info::device::max_work_group_size>()); - int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; - prop.set_max_nd_range_size(max_nd_range_size); - - // Estimates max register size per work group, feel free to update the value - // according to device properties. - prop.set_max_register_size_per_work_group(65536); - - prop.set_global_mem_cache_size( - dev.get_info<sycl::info::device::global_mem_cache_size>()); - out = prop; - } - - /// dpct device extension - class device_ext : public sycl::device { - typedef std::mutex mutex_type; - - public: - device_ext() : sycl::device() {} - ~device_ext() { - std::lock_guard<mutex_type> lock(m_mutex); - clear_queues(); - } - device_ext(const sycl::device &base) : sycl::device(base) { - std::lock_guard<mutex_type> lock(m_mutex); - init_queues(); - } - - int is_native_atomic_supported() { return 0; } - int get_major_version() const { return dpct::get_major_version(*this); } - - int get_minor_version() const { return dpct::get_minor_version(*this); } - - int get_max_compute_units() const { - return get_device_info().get_max_compute_units(); - } - - /// Return the maximum clock frequency of this device in KHz. - int get_max_clock_frequency() const { - return get_device_info().get_max_clock_frequency(); - } - - int get_integrated() const { return get_device_info().get_integrated(); } - - int get_max_sub_group_size() const { - return get_device_info().get_max_sub_group_size(); - } - - int get_max_register_size_per_work_group() const { - return get_device_info().get_max_register_size_per_work_group(); - } - - int get_max_work_group_size() const { - return get_device_info().get_max_work_group_size(); - } - - int get_mem_base_addr_align() const { - return get_info<sycl::info::device::mem_base_addr_align>(); - } - - size_t get_global_mem_size() const { - return get_device_info().get_global_mem_size(); - } - - size_t get_max_mem_alloc_size() const { - return get_device_info().get_max_mem_alloc_size(); - } - - /// Get the number of bytes of free and total memory on the SYCL device. - /// \param [out] free_memory The number of bytes of free memory on the - /// SYCL device. \param [out] total_memory The number of bytes of total - /// memory on the SYCL device. - void get_memory_info(size_t &free_memory, size_t &total_memory) { - total_memory = get_device_info().get_global_mem_size(); - const char *warning_info = - "get_memory_info: [warning] ext_intel_free_memory is not " - "supported (export/set ZES_ENABLE_SYSMAN=1 to support), " - "use total memory as free memory"; -#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105) - if (!has(sycl::aspect::ext_intel_free_memory)) { - std::cerr << warning_info << std::endl; - free_memory = total_memory; - } else { - free_memory = get_info<sycl::ext::intel::info::device::free_memory>(); - } -#else - std::cerr << warning_info << std::endl; - free_memory = total_memory; -#if defined(_MSC_VER) && !defined(__clang__) -#pragma message("Querying the number of bytes of free memory is not supported") -#else -#warning "Querying the number of bytes of free memory is not supported" -#endif -#endif - } - - void get_device_info(device_info &out) const { - dpct::get_device_info(out, *this); - } - - device_info get_device_info() const { - device_info prop; - dpct::get_device_info(prop, *this); - return prop; - } - - void reset() { - std::lock_guard<mutex_type> lock(m_mutex); - clear_queues(); - init_queues(); - } - - sycl::queue &in_order_queue() { return _q_in_order; } - - sycl::queue &out_of_order_queue() { return _q_out_of_order; } - - sycl::queue &default_queue() { return in_order_queue(); } - - void queues_wait_and_throw() { - std::unique_lock<mutex_type> lock(m_mutex); - lock.unlock(); - for (auto &q : _queues) { - q.wait_and_throw(); - } - // Guard the destruct of current_queues to make sure the ref count is - // safe. - lock.lock(); - } - - sycl::queue create_queue(bool enable_exception_handler = false) { - return create_in_order_queue(enable_exception_handler); - } - - sycl::queue create_queue(sycl::device device, - bool enable_exception_handler = false) { - return create_in_order_queue(device, enable_exception_handler); - } - - sycl::queue create_in_order_queue(bool enable_exception_handler = false) { - std::lock_guard<mutex_type> lock(m_mutex); - return create_queue_impl(enable_exception_handler, - sycl::property::queue::in_order()); - } - - sycl::queue create_in_order_queue(sycl::device device, - bool enable_exception_handler = false) { - std::lock_guard<mutex_type> lock(m_mutex); - return create_queue_impl(device, enable_exception_handler, - sycl::property::queue::in_order()); - } - - sycl::queue create_out_of_order_queue( - bool enable_exception_handler = false) { - std::lock_guard<mutex_type> lock(m_mutex); - return create_queue_impl(enable_exception_handler); - } - - void destroy_queue(sycl::queue queue) { - std::lock_guard<mutex_type> lock(m_mutex); - _queues.clear(); - } - void set_saved_queue(sycl::queue q) { - std::lock_guard<mutex_type> lock(m_mutex); - _saved_queue = q; - } - sycl::queue get_saved_queue() const { - std::lock_guard<mutex_type> lock(m_mutex); - return _saved_queue; - } - - private: - void clear_queues() { _queues.clear(); } - - void init_queues() { - _q_in_order = - create_queue_impl(true, sycl::property::queue::in_order()); - _q_out_of_order = create_queue_impl(true); - _saved_queue = default_queue(); - } - - /// Caller should acquire resource \p m_mutex before calling this - /// function. - template <class... Properties> - sycl::queue create_queue_impl(bool enable_exception_handler, - Properties... properties) { - sycl::async_handler eh = {}; - if (enable_exception_handler) { - eh = exception_handler; - } - auto q = sycl::queue(*this, eh, - sycl::property_list( -#ifdef DPCT_PROFILING_ENABLED - sycl::property::queue::enable_profiling(), -#endif - properties...)); - _queues.push_back(q); - - return _queues.back(); - } - - template <class... Properties> - sycl::queue create_queue_impl(sycl::device device, - bool enable_exception_handler, - Properties... properties) { - sycl::async_handler eh = {}; - if (enable_exception_handler) { - eh = exception_handler; - } - _queues.push_back( - sycl::queue(device, eh, - sycl::property_list( -#ifdef DPCT_PROFILING_ENABLED - sycl::property::queue::enable_profiling(), -#endif - properties...))); - - return _queues.back(); - } - - void get_version(int &major, int &minor) const { - detail::get_version(*this, major, minor); - } - sycl::queue _q_in_order, _q_out_of_order; - sycl::queue _saved_queue; - std::vector<sycl::queue> _queues; - mutable mutex_type m_mutex; - }; - - - /// device manager - class dev_mgr - { - public: - device_ext ¤t_device() - { - unsigned int dev_id = current_device_id(); - check_id(dev_id); - return *_devs[dev_id]; - } - device_ext &cpu_device() const - { - std::lock_guard<std::recursive_mutex> lock(m_mutex); - if (_cpu_device == -1) - { - throw std::runtime_error("no valid cpu device"); - } - else - { - return *_devs[_cpu_device]; - } - } - device_ext &get_device(unsigned int id) const - { - std::lock_guard<std::recursive_mutex> lock(m_mutex); - check_id(id); - return *_devs[id]; - } - unsigned int current_device_id() const - { - std::lock_guard<std::recursive_mutex> lock(m_mutex); - auto it = _thread2dev_map.find(get_tid()); - if (it != _thread2dev_map.end()) - return it->second; - return DEFAULT_DEVICE_ID; - } - - /// Select device with a device ID. - /// \param [in] id The id of the device which can - /// be obtained through get_device_id(const sycl::device). - void select_device(unsigned int id) - { - std::lock_guard<std::recursive_mutex> lock(m_mutex); - check_id(id); - _thread2dev_map[get_tid()] = id; - } - unsigned int device_count() { return _devs.size(); } - - unsigned int get_device_id(const sycl::device &dev) - { - unsigned int id = 0; - for (auto dev_item : _devs) - { - if (*dev_item == dev) - { - break; - } - id++; - } - return id; - } - - template <class DeviceSelector> - std::enable_if_t< - std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>> - select_device(const DeviceSelector &selector = sycl::gpu_selector_v) - { - sycl::device selected_device = sycl::device(selector); - unsigned int selected_device_id = get_device_id(selected_device); - select_device(selected_device_id); - } - - /// Returns the instance of device manager singleton. - static dev_mgr &instance() - { - static dev_mgr d_m; - return d_m; - } - dev_mgr(const dev_mgr &) = delete; - dev_mgr &operator=(const dev_mgr &) = delete; - dev_mgr(dev_mgr &&) = delete; - dev_mgr &operator=(dev_mgr &&) = delete; - - private: - mutable std::recursive_mutex m_mutex; - static bool compare_dev(sycl::device &device1, sycl::device &device2) - { - sycl::backend backend1 = device1.get_backend(); - sycl::backend backend2 = device2.get_backend(); - // levelzero backends always come first - if(backend1 == sycl::backend::ext_oneapi_level_zero && backend2 != sycl::backend::ext_oneapi_level_zero) return true; - if(backend1 != sycl::backend::ext_oneapi_level_zero && backend2 == sycl::backend::ext_oneapi_level_zero) return false; - dpct::device_info prop1; - dpct::get_device_info(prop1, device1); - dpct::device_info prop2; - dpct::get_device_info(prop2, device2); - return prop1.get_max_compute_units() > prop2.get_max_compute_units(); - } - static int convert_backend_index(std::string & backend) { - if (backend == "ext_oneapi_level_zero:gpu") return 0; - if (backend == "opencl:gpu") return 1; - if (backend == "ext_oneapi_cuda:gpu") return 2; - if (backend == "ext_oneapi_hip:gpu") return 3; - if (backend == "opencl:cpu") return 4; - if (backend == "opencl:acc") return 5; - printf("convert_backend_index: can't handle backend=%s\n", backend.c_str()); - GGML_ASSERT(false); - } - static bool compare_backend(std::string &backend1, std::string &backend2) { - return convert_backend_index(backend1) < convert_backend_index(backend2); - } - dev_mgr() - { - sycl::device default_device = - sycl::device(sycl::default_selector_v); - _devs.push_back(std::make_shared<device_ext>(default_device)); - - std::vector<sycl::device> sycl_all_devs; - // Collect other devices except for the default device. - if (default_device.is_cpu()) - _cpu_device = 0; - - auto Platforms = sycl::platform::get_platforms(); - // Keep track of the number of devices per backend - std::map<sycl::backend, size_t> DeviceNums; - std::map<std::string, std::vector<sycl::device>> backend_devices; - - while (!Platforms.empty()) { - auto Platform = Platforms.back(); - Platforms.pop_back(); - auto devices = Platform.get_devices(); - std::string backend_type = get_device_backend_and_type(devices[0]); - for (const auto &device : devices) { - backend_devices[backend_type].push_back(device); - } - } - - std::vector<std::string> keys; - for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) { - keys.push_back(it->first); - } - std::sort(keys.begin(), keys.end(), compare_backend); - - for (auto &key : keys) { - std::vector<sycl::device> devs = backend_devices[key]; - std::sort(devs.begin(), devs.end(), compare_dev); - for (const auto &dev : devs) { - sycl_all_devs.push_back(dev); - } - } - - for (auto &dev : sycl_all_devs) - { - if (dev == default_device) - { - continue; - } - _devs.push_back(std::make_shared<device_ext>(dev)); - if (_cpu_device == -1 && dev.is_cpu()) - { - _cpu_device = _devs.size() - 1; - } - } - } - void check_id(unsigned int id) const - { - if (id >= _devs.size()) - { - throw std::runtime_error("invalid device id"); - } - } - std::vector<std::shared_ptr<device_ext>> _devs; - /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current - /// thread id in _thread2dev_map, which means default device should be used - /// for the current thread. - const unsigned int DEFAULT_DEVICE_ID = 0; - /// thread-id to device-id map. - std::map<unsigned int, unsigned int> _thread2dev_map; - int _cpu_device = -1; - }; - - static inline sycl::queue &get_default_queue() - { - return dev_mgr::instance().current_device().default_queue(); - } - - namespace detail - { - enum class pointer_access_attribute - { - host_only = 0, - device_only, - host_device, - end - }; - - static pointer_access_attribute get_pointer_attribute(sycl::queue &q, - const void *ptr) - { - switch (sycl::get_pointer_type(ptr, q.get_context())) - { - case sycl::usm::alloc::unknown: - return pointer_access_attribute::host_only; - case sycl::usm::alloc::device: - return pointer_access_attribute::device_only; - case sycl::usm::alloc::shared: - case sycl::usm::alloc::host: - return pointer_access_attribute::host_device; - } - } - - template <typename ArgT> - inline constexpr std::uint64_t get_type_combination_id(ArgT Val) - { - static_assert((unsigned char)library_data_t::library_data_t_size <= - std::numeric_limits<unsigned char>::max() && - "library_data_t size exceeds limit."); - static_assert(std::is_same_v<ArgT, library_data_t>, "Unsupported ArgT"); - return (std::uint64_t)Val; - } - - template <typename FirstT, typename... RestT> - inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal, - RestT... RestVal) - { - static_assert((std::uint8_t)library_data_t::library_data_t_size <= - std::numeric_limits<unsigned char>::max() && - "library_data_t size exceeds limit."); - static_assert(sizeof...(RestT) <= 8 && "Too many parameters"); - static_assert(std::is_same_v<FirstT, library_data_t>, "Unsupported FirstT"); - return get_type_combination_id(RestVal...) << 8 | ((std::uint64_t)FirstVal); - } - - class mem_mgr - { - mem_mgr() - { - // Reserved address space, no real memory allocation happens here. -#if defined(__linux__) - mapped_address_space = - (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); -#elif defined(_WIN64) - mapped_address_space = (byte_t *)VirtualAlloc( - NULL, // NULL specified as the base address parameter - mapped_region_size, // Size of allocation - MEM_RESERVE, // Allocate reserved pages - PAGE_NOACCESS); // Protection = no access -#else -#error "Only support Windows and Linux." -#endif - next_free = mapped_address_space; - }; - - public: - using buffer_id_t = int; - - struct allocation - { - buffer_t buffer; - byte_t *alloc_ptr; - size_t size; - }; - - ~mem_mgr() - { -#if defined(__linux__) - munmap(mapped_address_space, mapped_region_size); -#elif defined(_WIN64) - VirtualFree(mapped_address_space, 0, MEM_RELEASE); -#else -#error "Only support Windows and Linux." -#endif - }; - - mem_mgr(const mem_mgr &) = delete; - mem_mgr &operator=(const mem_mgr &) = delete; - mem_mgr(mem_mgr &&) = delete; - mem_mgr &operator=(mem_mgr &&) = delete; - - /// Allocate - void *mem_alloc(size_t size) - { - if (!size) - return nullptr; - std::lock_guard<std::mutex> lock(m_mutex); - if (next_free + size > mapped_address_space + mapped_region_size) - { - throw std::runtime_error("dpct_malloc: out of memory for virtual memory pool"); - } - // Allocation - sycl::range<1> r(size); - buffer_t buf(r); - allocation A{buf, next_free, size}; - // Map allocation to device pointer - void *result = next_free; - m_map.emplace(next_free + size, A); - // Update pointer to the next free space. - next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1); - - return result; - } - - /// Deallocate - void mem_free(const void *ptr) - { - if (!ptr) - return; - std::lock_guard<std::mutex> lock(m_mutex); - auto it = get_map_iterator(ptr); - m_map.erase(it); - } - - /// map: device pointer -> allocation(buffer, alloc_ptr, size) - allocation translate_ptr(const void *ptr) - { - std::lock_guard<std::mutex> lock(m_mutex); - auto it = get_map_iterator(ptr); - return it->second; - } - - /// Check if the pointer represents device pointer or not. - bool is_device_ptr(const void *ptr) const - { - std::lock_guard<std::mutex> lock(m_mutex); - return (mapped_address_space <= ptr) && - (ptr < mapped_address_space + mapped_region_size); - } - - /// Returns the instance of memory manager singleton. - static mem_mgr &instance() - { - static mem_mgr m; - return m; - } - - private: - std::map<byte_t *, allocation> m_map; - mutable std::mutex m_mutex; - byte_t *mapped_address_space; - byte_t *next_free; - const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024; - const size_t alignment = 256; - /// This padding may be defined to some positive value to debug - /// out of bound accesses. - const size_t extra_padding = 0; - - std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr) - { - auto it = m_map.upper_bound((byte_t *)ptr); - if (it == m_map.end()) - { - // Not a virtual pointer. - throw std::runtime_error("can not get buffer from non-virtual pointer"); - } - const allocation &alloc = it->second; - if (ptr < alloc.alloc_ptr) - { - // Out of bound. - // This may happen if there's a gap between allocations due to alignment - // or extra padding and pointer points to this gap. - throw std::runtime_error("invalid virtual pointer"); - } - return it; - } - }; - - template <class T, memory_region Memory, size_t Dimension> - class accessor; - template <memory_region Memory, class T = byte_t> - class memory_traits - { - public: - static constexpr sycl::access::target target = - sycl::access::target::device; - static constexpr sycl::access_mode mode = - (Memory == constant) ? sycl::access_mode::read - : sycl::access_mode::read_write; - static constexpr size_t type_size = sizeof(T); - using element_t = - typename std::conditional<Memory == constant, const T, T>::type; - using value_t = typename std::remove_cv<T>::type; - template <size_t Dimension = 1> - using accessor_t = typename std::conditional< - Memory == local, sycl::local_accessor<value_t, Dimension>, - sycl::accessor<T, Dimension, mode, target>>::type; - using pointer_t = T *; - }; - - static inline void *dpct_malloc(size_t size, sycl::queue &q) - { - return sycl::malloc_device(size, q.get_device(), q.get_context()); - } - -#define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F)) - static inline void *dpct_malloc(size_t &pitch, size_t x, size_t y, size_t z, - sycl::queue &q) - { - pitch = PITCH_DEFAULT_ALIGN(x); - return dpct_malloc(pitch * y * z, q); - } - - /** - * @brief Sets \p value to the first \p size elements starting from \p dev_ptr in \p q. - * @tparam valueT The type of the element to be set. - * @param [in] q The queue in which the operation is done. - * @param [in] dev_ptr Pointer to the virtual device memory address. - * @param [in] value The value to be set. - * @param [in] size Number of elements to be set to the value. - * @return An event representing the memset operation. - */ - template <typename valueT> - static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr, - valueT value, size_t size) - { - return q.fill(dev_ptr, value, size); - } - - /** - * @brief Sets \p value to the 3D memory region pointed by \p data in \p q. - * @tparam valueT The type of the element to be set. - * @param [in] q The queue in which the operation is done. - * @param [in] data Pointer to the pitched device memory region. - * @param [in] value The value to be set. - * @param [in] size 3D memory region by number of elements. - * @return An event list representing the memset operations. - */ - template <typename valueT> - static inline std::vector<sycl::event> - dpct_memset(sycl::queue &q, pitched_data data, valueT value, - sycl::range<3> size) - { - std::vector<sycl::event> event_list; - size_t slice = data.get_pitch() * data.get_y(); - unsigned char *data_surface = (unsigned char *)data.get_data_ptr(); - for (size_t z = 0; z < size.get(2); ++z) - { - unsigned char *data_ptr = data_surface; - for (size_t y = 0; y < size.get(1); ++y) - { - event_list.push_back(dpct_memset(q, data_ptr, value, size.get(0))); - data_ptr += data.get_pitch(); - } - data_surface += slice; - } - return event_list; - } - - /** - * @brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p q. - * @tparam valueT The type of the element to be set. - * @param [in] q The queue in which the operation is done. - * @param [in] ptr Pointer to the virtual device memory. - * @param [in] pitch The pitch size by number of elements, including padding. - * @param [in] val The value to be set. - * @param [in] x The width of memory region by number of elements. - * @param [in] y The height of memory region by number of elements. - * @return An event list representing the memset operations. - */ - template <typename valueT> - static inline std::vector<sycl::event> - dpct_memset(sycl::queue &q, void *ptr, size_t pitch, valueT val, size_t x, - size_t y) - { - return dpct_memset(q, pitched_data(ptr, pitch, x, 1), val, - sycl::range<3>(x, y, 1)); - } - - static memcpy_direction deduce_memcpy_direction(sycl::queue &q, void *to_ptr, - const void *from_ptr, - memcpy_direction dir) - { - switch (dir) - { - case memcpy_direction::host_to_host: - case memcpy_direction::host_to_device: - case memcpy_direction::device_to_host: - case memcpy_direction::device_to_device: - return dir; - case memcpy_direction::automatic: - { - // table[to_attribute][from_attribute] - static const memcpy_direction - direction_table[static_cast<unsigned>(pointer_access_attribute::end)] - [static_cast<unsigned>(pointer_access_attribute::end)] = - {{memcpy_direction::host_to_host, - memcpy_direction::device_to_host, - memcpy_direction::host_to_host}, - {memcpy_direction::host_to_device, - memcpy_direction::device_to_device, - memcpy_direction::device_to_device}, - {memcpy_direction::host_to_host, - memcpy_direction::device_to_device, - memcpy_direction::device_to_device}}; - return direction_table[static_cast<unsigned>(get_pointer_attribute( - q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))]; - } - default: - throw std::runtime_error("dpct_memcpy: invalid direction value"); - } - } - - static sycl::event - dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size, - memcpy_direction direction, - const std::vector<sycl::event> &dep_events = {}) - { - if (!size) - return sycl::event{}; - return q.memcpy(to_ptr, from_ptr, size, dep_events); - GGML_UNUSED(direction); - } - - // Get actual copy range and make sure it will not exceed range. - static inline size_t get_copy_range(sycl::range<3> size, size_t slice, - size_t pitch) - { - return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0); - } - - static inline size_t get_offset(sycl::id<3> id, size_t slice, - size_t pitch) - { - return slice * id.get(2) + pitch * id.get(1) + id.get(0); - } - - /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr - /// and \p from_range to another specified by \p to_ptr and \p to_range. - static inline std::vector<sycl::event> - dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, - sycl::range<3> to_range, sycl::range<3> from_range, - sycl::id<3> to_id, sycl::id<3> from_id, - sycl::range<3> size, memcpy_direction direction, - const std::vector<sycl::event> &dep_events = {}) - { - // RAII for host pointer - class host_buffer - { - void *_buf; - size_t _size; - sycl::queue &_q; - const std::vector<sycl::event> &_deps; // free operation depends - - public: - host_buffer(size_t size, sycl::queue &q, - const std::vector<sycl::event> &deps) - : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {} - void *get_ptr() const { return _buf; } - size_t get_size() const { return _size; } - ~host_buffer() - { - if (_buf) - { - _q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(_deps); - cgh.host_task([buf = _buf] { std::free(buf); }); }); - } - } - }; - std::vector<sycl::event> event_list; - - size_t to_slice = to_range.get(1) * to_range.get(0), - from_slice = from_range.get(1) * from_range.get(0); - unsigned char *to_surface = - (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0)); - const unsigned char *from_surface = - (const unsigned char *)from_ptr + - get_offset(from_id, from_slice, from_range.get(0)); - - if (to_slice == from_slice && to_slice == size.get(1) * size.get(0)) - { - return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2), - direction, dep_events)}; - } - direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction); - size_t size_slice = size.get(1) * size.get(0); - switch (direction) - { - case host_to_host: - for (size_t z = 0; z < size.get(2); ++z) - { - unsigned char *to_ptr = to_surface; - const unsigned char *from_ptr = from_surface; - if (to_range.get(0) == from_range.get(0) && - to_range.get(0) == size.get(0)) - { - event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice, - direction, dep_events)); - } - else - { - for (size_t y = 0; y < size.get(1); ++y) - { - event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0), - direction, dep_events)); - to_ptr += to_range.get(0); - from_ptr += from_range.get(0); - } - } - to_surface += to_slice; - from_surface += from_slice; - } - break; - case host_to_device: - { - host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q, - event_list); - std::vector<sycl::event> host_events; - if (to_slice == size_slice) - { - // Copy host data to a temp host buffer with the shape of target. - host_events = - dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range, - sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, - host_to_host, dep_events); - } - else - { - // Copy host data to a temp host buffer with the shape of target. - host_events = dpct_memcpy( - q, buf.get_ptr(), from_surface, to_range, from_range, - sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host, - // If has padding data, not sure whether it is useless. So fill temp - // buffer with it. - std::vector<sycl::event>{ - dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(), - device_to_host, dep_events)}); - } - // Copy from temp host buffer to device with only one submit. - event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(), - buf.get_size(), host_to_device, - host_events)); - break; - } - case device_to_host: - { - host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q, - event_list); - // Copy from host temp buffer to host target with reshaping. - event_list = dpct_memcpy( - q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0), - sycl::id<3>(0, 0, 0), size, host_to_host, - // Copy from device to temp host buffer with only one submit. - std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface, - buf.get_size(), - device_to_host, dep_events)}); - break; - } - case device_to_device: - event_list.push_back(q.submit([&](sycl::handler &cgh){ - cgh.depends_on(dep_events); - cgh.parallel_for<class dpct_memcpy_3d_detail>( - size, - [=](sycl::id<3> id) { - to_surface[get_offset(id, to_slice, to_range.get(0))] = - from_surface[get_offset(id, from_slice, from_range.get(0))]; - }); })); - break; - default: - throw std::runtime_error("dpct_memcpy: invalid direction value"); - } - return event_list; - } - - /// memcpy 2D/3D matrix specified by pitched_data. - static inline std::vector<sycl::event> - dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id, - pitched_data from, sycl::id<3> from_id, sycl::range<3> size, - memcpy_direction direction = automatic) - { - return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(), - sycl::range<3>(to.get_pitch(), to.get_y(), 1), - sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id, - size, direction); - } - - /// memcpy 2D matrix with pitch. - static inline std::vector<sycl::event> - dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, - size_t to_pitch, size_t from_pitch, size_t x, size_t y, - memcpy_direction direction = automatic) - { - return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1), - sycl::range<3>(from_pitch, y, 1), - sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), - sycl::range<3>(x, y, 1), direction); - } - - namespace deprecated - { - - template <typename T, sycl::usm::alloc AllocKind> - class usm_allocator - { - private: - using Alloc = sycl::usm_allocator<T, AllocKind>; - Alloc _impl; - - public: - using value_type = typename std::allocator_traits<Alloc>::value_type; - using pointer = typename std::allocator_traits<Alloc>::pointer; - using const_pointer = typename std::allocator_traits<Alloc>::const_pointer; - using void_pointer = typename std::allocator_traits<Alloc>::void_pointer; - using const_void_pointer = - typename std::allocator_traits<Alloc>::const_void_pointer; - using reference = typename std::allocator_traits<Alloc>::value_type &; - using const_reference = - const typename std::allocator_traits<Alloc>::value_type &; - using difference_type = - typename std::allocator_traits<Alloc>::difference_type; - using size_type = typename std::allocator_traits<Alloc>::size_type; - using propagate_on_container_copy_assignment = typename std::allocator_traits< - Alloc>::propagate_on_container_copy_assignment; - using propagate_on_container_move_assignment = typename std::allocator_traits< - Alloc>::propagate_on_container_move_assignment; - using propagate_on_container_swap = - typename std::allocator_traits<Alloc>::propagate_on_container_swap; - using is_always_equal = - typename std::allocator_traits<Alloc>::is_always_equal; - - template <typename U> - struct rebind - { - typedef usm_allocator<U, AllocKind> other; - }; - - usm_allocator() : _impl(dpct::get_default_queue()) {} - ~usm_allocator() {} - usm_allocator(const usm_allocator &other) : _impl(other._impl) {} - usm_allocator(usm_allocator &&other) : _impl(std::move(other._impl)) {} - pointer address(reference r) { return &r; } - const_pointer address(const_reference r) { return &r; } - pointer allocate(size_type cnt, const_void_pointer hint = nullptr) - { - return std::allocator_traits<Alloc>::allocate(_impl, cnt, hint); - } - void deallocate(pointer p, size_type cnt) - { - std::allocator_traits<Alloc>::deallocate(_impl, p, cnt); - } - size_type max_size() const - { - return std::allocator_traits<Alloc>::max_size(_impl); - } - bool operator==(const usm_allocator &other) const { return _impl == other._impl; } - bool operator!=(const usm_allocator &other) const { return _impl != other._impl; } - }; - - } // namespace deprecated - - inline void dpct_free(void *ptr, - const sycl::queue &q) - { - if (ptr) - { - sycl::free(ptr, q.get_context()); - } - } - - template <typename T> - inline auto get_memory(const void *x) - { - T *new_x = reinterpret_cast<T *>(const_cast<void *>(x)); - return new_x; - } - - template <typename T> - inline typename DataType<T>::T2 get_value(const T *s, sycl::queue &q) - { - using Ty = typename DataType<T>::T2; - Ty s_h; - if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only) - detail::dpct_memcpy(q, (void *)&s_h, (const void *)s, sizeof(T), device_to_host) - .wait(); - else - s_h = *reinterpret_cast<const Ty *>(s); - return s_h; - } - - } // namespace detail - - template <typename T> - inline auto get_value(const T *s, sycl::queue &q) - { - return detail::get_value(s, q); - } - - namespace detail - { - template <class Ta, class Tb, class Tc, class Ts> - inline void gemm_impl(sycl::queue &q, oneapi::mkl::transpose a_trans, - oneapi::mkl::transpose b_trans, int m, int n, int k, - const void *alpha, const void *a, int lda, const void *b, - int ldb, const void *beta, void *c, int ldc) - { - Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q); - Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q); - auto data_a = get_memory<const Ta>(a); - auto data_b = get_memory<const Tb>(b); - auto data_c = get_memory<Tc>(c); - oneapi::mkl::blas::column_major::gemm( - q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, - data_b, ldb, beta_value, data_c, ldc); - } - - template <typename VecT, class BinaryOperation, class = void> - class vectorized_binary - { - public: - inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) - { - VecT v4; - for (size_t i = 0; i < v4.size(); ++i) - { - v4[i] = binary_op(a[i], b[i]); - } - return v4; - } - }; - - template <typename VecT, class BinaryOperation> - class vectorized_binary< - VecT, BinaryOperation, - std::void_t<std::invoke_result_t<BinaryOperation, VecT, VecT>>> - { - public: - inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) - { - return binary_op(a, b).template as<VecT>(); - } - }; - - template <class Ta, class Tb, class Tc, class Ts> - inline void gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans, - oneapi::mkl::transpose b_trans, int m, int n, int k, - const void *alpha, const void **a, int lda, - const void **b, int ldb, const void *beta, void **c, - int ldc, int batch_size) - { - struct matrix_info_t - { - oneapi::mkl::transpose transpose_info[2]; - Ts value_info[2]; - std::int64_t size_info[3]; - std::int64_t ld_info[3]; - std::int64_t groupsize_info; - }; - - Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q); - Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q); - - matrix_info_t *matrix_info = - (matrix_info_t *)std::malloc(sizeof(matrix_info_t)); - matrix_info->transpose_info[0] = a_trans; - matrix_info->transpose_info[1] = b_trans; - matrix_info->value_info[0] = alpha_value; - matrix_info->value_info[1] = beta_value; - matrix_info->size_info[0] = m; - matrix_info->size_info[1] = n; - matrix_info->size_info[2] = k; - matrix_info->ld_info[0] = lda; - matrix_info->ld_info[1] = ldb; - matrix_info->ld_info[2] = ldc; - matrix_info->groupsize_info = batch_size; - - sycl::event e = oneapi::mkl::blas::column_major::gemm_batch( - q, matrix_info->transpose_info, matrix_info->transpose_info + 1, - matrix_info->size_info, matrix_info->size_info + 1, - matrix_info->size_info + 2, matrix_info->value_info, - reinterpret_cast<const Ta **>(a), matrix_info->ld_info, - reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1, - matrix_info->value_info + 1, reinterpret_cast<Tc **>(c), - matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info)); - - q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(e); - cgh.host_task([=] { std::free(matrix_info); }); }); - } - - template <class Ta, class Tb, class Tc, class Ts> - inline void - gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans, - oneapi::mkl::transpose b_trans, int m, int n, - int k, const void *alpha, const void *a, int lda, - long long int stride_a, const void *b, int ldb, - long long int stride_b, const void *beta, void *c, - int ldc, long long int stride_c, int batch_size) - { - Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q); - Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q); - auto data_a = get_memory<const Ta>(a); - auto data_b = get_memory<const Tb>(b); - auto data_c = get_memory<Tc>(c); - oneapi::mkl::blas::column_major::gemm_batch( - q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, - stride_a, data_b, ldb, stride_b, beta_value, - data_c, ldc, stride_c, batch_size); - } - - } // namespace detail - - template <typename VecT, class BinaryOperation> - inline unsigned vectorized_binary(unsigned a, unsigned b, - const BinaryOperation binary_op) - { - sycl::vec<unsigned, 1> v0{a}, v1{b}; - auto v2 = v0.as<VecT>(); - auto v3 = v1.as<VecT>(); - auto v4 = - detail::vectorized_binary<VecT, BinaryOperation>()(v2, v3, binary_op); - v0 = v4.template as<sycl::vec<unsigned, 1>>(); - return v0; - } - - static void async_dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size, - memcpy_direction direction = automatic, - sycl::queue &q = dpct::get_default_queue()) - { - detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction); - } - - static inline unsigned int select_device(unsigned int id) - { - dev_mgr::instance().select_device(id); - return id; - } - - template <typename T> - T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask, - unsigned int logical_sub_group_size = 32) - { - unsigned int id = g.get_local_linear_id(); - unsigned int start_index = - id / logical_sub_group_size * logical_sub_group_size; - unsigned int target_offset = (id % logical_sub_group_size) ^ mask; - return sycl::select_from_group(g, x, - target_offset < logical_sub_group_size - ? start_index + target_offset - : id); - } - - template <typename T> - sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val) - { - return sycl::vec<T, 1>(val) - .template as<sycl::vec< - std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>() - .template convert<T>(); - } - - template <typename T1, typename T2> - using dot_product_acc_t = - std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>, - uint32_t, int32_t>; - - template <typename T1, typename T2, typename T3> - inline auto dp4a(T1 a, T2 b, T3 c) - { - dot_product_acc_t<T1, T2> res = c; - auto va = extract_and_sign_or_zero_extend4(a); - auto vb = extract_and_sign_or_zero_extend4(b); - res += va[0] * vb[0]; - res += va[1] * vb[1]; - res += va[2] * vb[2]; - res += va[3] * vb[3]; - return res; - } - - struct sub_sat - { - template <typename T> - auto operator()(const T x, const T y) const - { - return sycl::sub_sat(x, y); - } - }; - - template <typename S, typename T> - inline T vectorized_min(T a, T b) - { - sycl::vec<T, 1> v0{a}, v1{b}; - auto v2 = v0.template as<S>(); - auto v3 = v1.template as<S>(); - auto v4 = sycl::min(v2, v3); - v0 = v4.template as<sycl::vec<T, 1>>(); - return v0; - } - - inline float pow(const float a, const int b) { return sycl::pown(a, b); } - inline double pow(const double a, const int b) { return sycl::pown(a, b); } - inline float pow(const float a, const float b) { return sycl::pow(a, b); } - inline double pow(const double a, const double b) { return sycl::pow(a, b); } - template <typename T, typename U> - inline typename std::enable_if_t<std::is_floating_point_v<T>, T> - pow(const T a, const U b) - { - return sycl::pow(a, static_cast<T>(b)); - } - template <typename T, typename U> - inline typename std::enable_if_t<!std::is_floating_point_v<T>, double> - pow(const T a, const U b) - { - return sycl::pow(static_cast<double>(a), static_cast<double>(b)); - } - - inline double min(const double a, const float b) - { - return sycl::fmin(a, static_cast<double>(b)); - } - inline double min(const float a, const double b) - { - return sycl::fmin(static_cast<double>(a), b); - } - inline float min(const float a, const float b) { return sycl::fmin(a, b); } - inline double min(const double a, const double b) { return sycl::fmin(a, b); } - inline std::uint32_t min(const std::uint32_t a, const std::int32_t b) - { - return sycl::min(a, static_cast<std::uint32_t>(b)); - } - inline std::uint32_t min(const std::int32_t a, const std::uint32_t b) - { - return sycl::min(static_cast<std::uint32_t>(a), b); - } - inline std::int32_t min(const std::int32_t a, const std::int32_t b) - { - return sycl::min(a, b); - } - inline std::uint32_t min(const std::uint32_t a, const std::uint32_t b) - { - return sycl::min(a, b); - } - inline std::uint64_t min(const std::uint64_t a, const std::int64_t b) - { - return sycl::min(a, static_cast<std::uint64_t>(b)); - } - inline std::uint64_t min(const std::int64_t a, const std::uint64_t b) - { - return sycl::min(static_cast<std::uint64_t>(a), b); - } - inline std::int64_t min(const std::int64_t a, const std::int64_t b) - { - return sycl::min(a, b); - } - inline std::uint64_t min(const std::uint64_t a, const std::uint64_t b) - { - return sycl::min(a, b); - } - inline std::uint64_t min(const std::uint64_t a, const std::int32_t b) - { - return sycl::min(a, static_cast<std::uint64_t>(b)); - } - inline std::uint64_t min(const std::int32_t a, const std::uint64_t b) - { - return sycl::min(static_cast<std::uint64_t>(a), b); - } - inline std::uint64_t min(const std::uint64_t a, const std::uint32_t b) - { - return sycl::min(a, static_cast<std::uint64_t>(b)); - } - inline std::uint64_t min(const std::uint32_t a, const std::uint64_t b) - { - return sycl::min(static_cast<std::uint64_t>(a), b); - } - // max function overloads. - // For floating-point types, `float` or `double` arguments are acceptable. - // For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or - // `std::int64_t` type arguments are acceptable. - inline double max(const double a, const float b) - { - return sycl::fmax(a, static_cast<double>(b)); - } - inline double max(const float a, const double b) - { - return sycl::fmax(static_cast<double>(a), b); - } - inline float max(const float a, const float b) { return sycl::fmax(a, b); } - inline double max(const double a, const double b) { return sycl::fmax(a, b); } - inline std::uint32_t max(const std::uint32_t a, const std::int32_t b) - { - return sycl::max(a, static_cast<std::uint32_t>(b)); - } - inline std::uint32_t max(const std::int32_t a, const std::uint32_t b) - { - return sycl::max(static_cast<std::uint32_t>(a), b); - } - inline std::int32_t max(const std::int32_t a, const std::int32_t b) - { - return sycl::max(a, b); - } - inline std::uint32_t max(const std::uint32_t a, const std::uint32_t b) - { - return sycl::max(a, b); - } - inline std::uint64_t max(const std::uint64_t a, const std::int64_t b) - { - return sycl::max(a, static_cast<std::uint64_t>(b)); - } - inline std::uint64_t max(const std::int64_t a, const std::uint64_t b) - { - return sycl::max(static_cast<std::uint64_t>(a), b); - } - inline std::int64_t max(const std::int64_t a, const std::int64_t b) - { - return sycl::max(a, b); - } - inline std::uint64_t max(const std::uint64_t a, const std::uint64_t b) - { - return sycl::max(a, b); - } - inline std::uint64_t max(const std::uint64_t a, const std::int32_t b) - { - return sycl::max(a, static_cast<std::uint64_t>(b)); - } - inline std::uint64_t max(const std::int32_t a, const std::uint64_t b) - { - return sycl::max(static_cast<std::uint64_t>(a), b); - } - inline std::uint64_t max(const std::uint64_t a, const std::uint32_t b) - { - return sycl::max(a, static_cast<std::uint64_t>(b)); - } - inline std::uint64_t max(const std::uint32_t a, const std::uint64_t b) - { - return sycl::max(static_cast<std::uint64_t>(a), b); - } - - inline void - has_capability_or_fail(const sycl::device &dev, - const std::initializer_list<sycl::aspect> &props) - { - for (const auto &it : props) - { - if (dev.has(it)) - continue; - switch (it) - { - case sycl::aspect::fp64: - throw std::runtime_error("'double' is not supported in '" + - dev.get_info<sycl::info::device::name>() + - "' device"); - break; - case sycl::aspect::fp16: - throw std::runtime_error("'half' is not supported in '" + - dev.get_info<sycl::info::device::name>() + - "' device"); - break; - default: -#define __SYCL_ASPECT(ASPECT, ID) \ - case sycl::aspect::ASPECT: \ - return #ASPECT; -#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID) -#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE) - auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string - { - switch (AspectNum) - { -#include <sycl/info/aspects.def> -#include <sycl/info/aspects_deprecated.def> - default: - return "unknown aspect"; - } - }; -#undef __SYCL_ASPECT_DEPRECATED_ALIAS -#undef __SYCL_ASPECT_DEPRECATED -#undef __SYCL_ASPECT - throw std::runtime_error( - "'" + getAspectNameStr(it) + "' is not supported in '" + - dev.get_info<sycl::info::device::name>() + "' device"); - } - break; - } - } - - static inline unsigned int get_current_device_id() - { - return dev_mgr::instance().current_device_id(); - } - - static inline device_ext &get_current_device() - { - return dev_mgr::instance().current_device(); - } - - static inline sycl::queue &get_in_order_queue() - { - return dev_mgr::instance().current_device().in_order_queue(); - } - - static sycl::event - dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size, - memcpy_direction direction, - const std::vector<sycl::event> &dep_events = {}) - { - if (!size) - return sycl::event{}; - return q.memcpy(to_ptr, from_ptr, size, dep_events); - GGML_UNUSED(direction); - } - - // Get actual copy range and make sure it will not exceed range. - static inline size_t get_copy_range(sycl::range<3> size, size_t slice, - size_t pitch) - { - return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0); - } - - static inline size_t get_offset(sycl::id<3> id, size_t slice, - size_t pitch) - { - return slice * id.get(2) + pitch * id.get(1) + id.get(0); - } - - /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr - /// and \p from_range to another specified by \p to_ptr and \p to_range. - static inline std::vector<sycl::event> - dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, - sycl::range<3> to_range, sycl::range<3> from_range, - sycl::id<3> to_id, sycl::id<3> from_id, - sycl::range<3> size, memcpy_direction direction, - const std::vector<sycl::event> &dep_events = {}) - { - // RAII for host pointer - class host_buffer - { - void *_buf; - size_t _size; - sycl::queue &_q; - const std::vector<sycl::event> &_deps; // free operation depends - - public: - host_buffer(size_t size, sycl::queue &q, - const std::vector<sycl::event> &deps) - : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {} - void *get_ptr() const { return _buf; } - size_t get_size() const { return _size; } - ~host_buffer() - { - if (_buf) - { - _q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(_deps); - cgh.host_task([buf = _buf] { std::free(buf); }); }); - } - } - }; - std::vector<sycl::event> event_list; - - size_t to_slice = to_range.get(1) * to_range.get(0), - from_slice = from_range.get(1) * from_range.get(0); - unsigned char *to_surface = - (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0)); - const unsigned char *from_surface = - (const unsigned char *)from_ptr + - get_offset(from_id, from_slice, from_range.get(0)); - - if (to_slice == from_slice && to_slice == size.get(1) * size.get(0)) - { - return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2), - direction, dep_events)}; - } - direction = detail::deduce_memcpy_direction(q, to_ptr, from_ptr, direction); - size_t size_slice = size.get(1) * size.get(0); - switch (direction) - { - case host_to_host: - for (size_t z = 0; z < size.get(2); ++z) - { - unsigned char *to_ptr = to_surface; - const unsigned char *from_ptr = from_surface; - if (to_range.get(0) == from_range.get(0) && - to_range.get(0) == size.get(0)) - { - event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice, - direction, dep_events)); - } - else - { - for (size_t y = 0; y < size.get(1); ++y) - { - event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0), - direction, dep_events)); - to_ptr += to_range.get(0); - from_ptr += from_range.get(0); - } - } - to_surface += to_slice; - from_surface += from_slice; - } - break; - case host_to_device: - { - host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q, - event_list); - std::vector<sycl::event> host_events; - if (to_slice == size_slice) - { - // Copy host data to a temp host buffer with the shape of target. - host_events = - dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range, - sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, - host_to_host, dep_events); - } - else - { - // Copy host data to a temp host buffer with the shape of target. - host_events = dpct_memcpy( - q, buf.get_ptr(), from_surface, to_range, from_range, - sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host, - // If has padding data, not sure whether it is useless. So fill temp - // buffer with it. - std::vector<sycl::event>{ - dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(), - device_to_host, dep_events)}); - } - // Copy from temp host buffer to device with only one submit. - event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(), - buf.get_size(), host_to_device, - host_events)); - break; - } - case device_to_host: - { - host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q, - event_list); - // Copy from host temp buffer to host target with reshaping. - event_list = dpct_memcpy( - q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0), - sycl::id<3>(0, 0, 0), size, host_to_host, - // Copy from device to temp host buffer with only one submit. - std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface, - buf.get_size(), - device_to_host, dep_events)}); - break; - } - case device_to_device: - event_list.push_back(q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - cgh.parallel_for<class dpct_memcpy_3d_detail>( - size, - [=](sycl::id<3> id) { - to_surface[get_offset(id, to_slice, to_range.get(0))] = - from_surface[get_offset(id, from_slice, from_range.get(0))]; - }); })); - break; - default: - throw std::runtime_error("dpct_memcpy: invalid direction value"); - } - return event_list; - } - - /// memcpy 2D/3D matrix specified by pitched_data. - static inline std::vector<sycl::event> - dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id, - pitched_data from, sycl::id<3> from_id, sycl::range<3> size, - memcpy_direction direction = automatic) - { - return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(), - sycl::range<3>(to.get_pitch(), to.get_y(), 1), - sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id, - size, direction); - } - - /// memcpy 2D matrix with pitch. - static inline std::vector<sycl::event> - dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, - size_t to_pitch, size_t from_pitch, size_t x, size_t y, - memcpy_direction direction = automatic) - { - return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1), - sycl::range<3>(from_pitch, y, 1), - sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), - sycl::range<3>(x, y, 1), direction); - } - - inline void gemm(sycl::queue &q, oneapi::mkl::transpose a_trans, - oneapi::mkl::transpose b_trans, int m, int n, int k, - const void *alpha, const void *a, library_data_t a_type, - int lda, const void *b, library_data_t b_type, int ldb, - const void *beta, void *c, library_data_t c_type, int ldc, - library_data_t scaling_type) - { - if (scaling_type == library_data_t::real_float && - c_type == library_data_t::complex_float) - { - scaling_type = library_data_t::complex_float; - } - else if (scaling_type == library_data_t::real_double && - c_type == library_data_t::complex_double) - { - scaling_type = library_data_t::complex_double; - } - - std::uint64_t key = - detail::get_type_combination_id(a_type, b_type, c_type, scaling_type); - switch (key) - { - case detail::get_type_combination_id( - library_data_t::real_float, library_data_t::real_float, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_impl<float, float, float, float>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); - break; - } - case detail::get_type_combination_id( - library_data_t::real_double, library_data_t::real_double, - library_data_t::real_double, library_data_t::real_double): - { - detail::gemm_impl<double, double, double, double>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); - break; - } - case detail::get_type_combination_id( - library_data_t::complex_float, library_data_t::complex_float, - library_data_t::complex_float, library_data_t::complex_float): - { - detail::gemm_impl<std::complex<float>, std::complex<float>, - std::complex<float>, std::complex<float>>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); - break; - } - case detail::get_type_combination_id( - library_data_t::complex_double, library_data_t::complex_double, - library_data_t::complex_double, library_data_t::complex_double): - { - detail::gemm_impl<std::complex<double>, std::complex<double>, - std::complex<double>, std::complex<double>>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); - break; - } - case detail::get_type_combination_id( - library_data_t::real_half, library_data_t::real_half, - library_data_t::real_half, library_data_t::real_half): - { - detail::gemm_impl<sycl::half, sycl::half, sycl::half, - sycl::half>(q, a_trans, b_trans, m, n, k, alpha, a, - lda, b, ldb, beta, c, ldc); - break; - } -#ifdef __INTEL_MKL__ - case detail::get_type_combination_id( - library_data_t::real_bfloat16, library_data_t::real_bfloat16, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, - float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, - ldb, beta, c, ldc); - break; - } - case detail::get_type_combination_id( - library_data_t::real_half, library_data_t::real_half, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_impl<sycl::half, sycl::half, float, float>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); - break; - } - case detail::get_type_combination_id( - library_data_t::real_half, library_data_t::real_half, - library_data_t::real_half, library_data_t::real_float): - { - float alpha_value = - dpct::get_value(reinterpret_cast<const float *>(alpha), q); - float beta_value = - dpct::get_value(reinterpret_cast<const float *>(beta), q); - sycl::half alpha_half(alpha_value); - sycl::half beta_half(beta_value); - detail::gemm_impl<sycl::half, sycl::half, sycl::half, - sycl::half>(q, a_trans, b_trans, m, n, k, &alpha_half, - a, lda, b, ldb, &beta_half, c, ldc); - break; - } - case detail::get_type_combination_id( - library_data_t::real_int8, library_data_t::real_int8, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_impl<std::int8_t, std::int8_t, float, float>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); - break; - } - case detail::get_type_combination_id( - library_data_t::real_bfloat16, library_data_t::real_bfloat16, - library_data_t::real_bfloat16, library_data_t::real_float): - { - detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, - oneapi::mkl::bfloat16, float>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); - break; - } - case detail::get_type_combination_id( - library_data_t::real_int8, library_data_t::real_int8, - library_data_t::real_int32, library_data_t::real_int32): - { - float alpha_float = - dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q); - float beta_float = - dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q); - detail::gemm_impl<std::int8_t, std::int8_t, std::int32_t, float>( - q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc); - break; - } -#endif // __INTEL_MKL__ - default: - throw std::runtime_error("the combination of data type is unsupported"); - } - } // gemm() - - /// Computes a batch of matrix-matrix product with general matrices. - /// \param [in] q The queue where the routine should be executed. - /// \param [in] a_trans Specifies the operation applied to A. - /// \param [in] b_trans Specifies the operation applied to B. - /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C. - /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C. - /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B). - /// \param [in] alpha Scaling factor for the matrix-matrix product. - /// \param [in] a Input matrix A. - /// \param [in] a_type Data type of the matrix A. - /// \param [in] lda Leading dimension of A. - /// \param [in] b Input matrix B. - /// \param [in] b_type Data type of the matrix B. - /// \param [in] ldb Leading dimension of B. - /// \param [in] beta Scaling factor for matrix C. - /// \param [in, out] c Input/Output matrix C. - /// \param [in] c_type Data type of the matrix C. - /// \param [in] ldc Leading dimension of C. - /// \param [in] batch_size Specifies the number of matrix multiply operations to perform. - /// \param [in] scaling_type Data type of the scaling factors. - inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans, - oneapi::mkl::transpose b_trans, int m, int n, int k, - const void *alpha, const void *a[], - library_data_t a_type, int lda, const void *b[], - library_data_t b_type, int ldb, const void *beta, - void *c[], library_data_t c_type, int ldc, - int batch_size, library_data_t scaling_type) - { - if (scaling_type == library_data_t::real_float && - c_type == library_data_t::complex_float) - { - scaling_type = library_data_t::complex_float; - } - else if (scaling_type == library_data_t::real_double && - c_type == library_data_t::complex_double) - { - scaling_type = library_data_t::complex_double; - } - - std::uint64_t key = - detail::get_type_combination_id(a_type, b_type, c_type, scaling_type); - switch (key) - { - case detail::get_type_combination_id( - library_data_t::real_float, library_data_t::real_float, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_batch_impl<float, float, float, float>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::real_double, library_data_t::real_double, - library_data_t::real_double, library_data_t::real_double): - { - detail::gemm_batch_impl<double, double, double, double>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::complex_float, library_data_t::complex_float, - library_data_t::complex_float, library_data_t::complex_float): - { - detail::gemm_batch_impl<std::complex<float>, std::complex<float>, - std::complex<float>, std::complex<float>>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::complex_double, library_data_t::complex_double, - library_data_t::complex_double, library_data_t::complex_double): - { - detail::gemm_batch_impl<std::complex<double>, std::complex<double>, - std::complex<double>, std::complex<double>>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::real_half, library_data_t::real_half, - library_data_t::real_half, library_data_t::real_half): - { - detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, - sycl::half>(q, a_trans, b_trans, m, n, k, alpha, - a, lda, b, ldb, beta, c, ldc, - batch_size); - break; - } -#ifdef __INTEL_MKL__ - case detail::get_type_combination_id( - library_data_t::real_bfloat16, library_data_t::real_bfloat16, - library_data_t::real_bfloat16, library_data_t::real_float): - { - detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, - oneapi::mkl::bfloat16, float>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::real_bfloat16, library_data_t::real_bfloat16, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, - float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, - b, ldb, beta, c, ldc, batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::real_int8, library_data_t::real_int8, - library_data_t::real_int32, library_data_t::real_int32): - { - float alpha_float = - dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q); - float beta_float = - dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q); - detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t, - float>(q, a_trans, b_trans, m, n, k, &alpha_float, - a, lda, b, ldb, &beta_float, c, ldc, - batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::real_int8, library_data_t::real_int8, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::real_half, library_data_t::real_half, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_batch_impl<sycl::half, sycl::half, float, float>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - batch_size); - break; - } -#endif - case detail::get_type_combination_id( - library_data_t::real_half, library_data_t::real_half, - library_data_t::real_half, library_data_t::real_float): - { - float alpha_value = - dpct::get_value(reinterpret_cast<const float *>(alpha), q); - float beta_value = - dpct::get_value(reinterpret_cast<const float *>(beta), q); - sycl::half alpha_half(alpha_value); - sycl::half beta_half(beta_value); - detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>( - q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc, - batch_size); - break; - } - default: - throw std::runtime_error("the combination of data type is unsupported"); - } - } - - /// Computes a batch of matrix-matrix product with general matrices. - /// \param [in] q The queue where the routine should be executed. - /// \param [in] a_trans Specifies the operation applied to A. - /// \param [in] b_trans Specifies the operation applied to B. - /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C. - /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C. - /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B). - /// \param [in] alpha Scaling factor for the matrix-matrix product. - /// \param [in] a Input matrix A. - /// \param [in] a_type Data type of the matrix A. - /// \param [in] lda Leading dimension of A. - /// \param [in] stride_a Stride between the different A matrices. - /// \param [in] b Input matrix B. - /// \param [in] b_type Data type of the matrix B. - /// \param [in] ldb Leading dimension of B. - /// \param [in] stride_b Stride between the different B matrices. - /// \param [in] beta Scaling factor for matrix C. - /// \param [in, out] c Input/Output matrix C. - /// \param [in] c_type Data type of the matrix C. - /// \param [in] ldc Leading dimension of C. - /// \param [in] stride_c Stride between the different C matrices. - /// \param [in] batch_size Specifies the number of matrix multiply operations to perform. - /// \param [in] scaling_type Data type of the scaling factors. - inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans, - oneapi::mkl::transpose b_trans, int m, int n, int k, - const void *alpha, const void *a, library_data_t a_type, - int lda, long long int stride_a, const void *b, - library_data_t b_type, int ldb, long long int stride_b, - const void *beta, void *c, library_data_t c_type, - int ldc, long long int stride_c, int batch_size, - library_data_t scaling_type) - { - if (scaling_type == library_data_t::real_float && - c_type == library_data_t::complex_float) - { - scaling_type = library_data_t::complex_float; - } - else if (scaling_type == library_data_t::real_double && - c_type == library_data_t::complex_double) - { - scaling_type = library_data_t::complex_double; - } - - std::uint64_t key = - detail::get_type_combination_id(a_type, b_type, c_type, scaling_type); - switch (key) - { - case detail::get_type_combination_id( - library_data_t::real_float, library_data_t::real_float, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_batch_impl<float, float, float, float>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, - beta, c, ldc, stride_c, batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::real_double, library_data_t::real_double, - library_data_t::real_double, library_data_t::real_double): - { - detail::gemm_batch_impl<double, double, double, double>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, - beta, c, ldc, stride_c, batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::complex_float, library_data_t::complex_float, - library_data_t::complex_float, library_data_t::complex_float): - { - detail::gemm_batch_impl<std::complex<float>, std::complex<float>, - std::complex<float>, std::complex<float>>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, - beta, c, ldc, stride_c, batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::complex_double, library_data_t::complex_double, - library_data_t::complex_double, library_data_t::complex_double): - { - detail::gemm_batch_impl<std::complex<double>, std::complex<double>, - std::complex<double>, std::complex<double>>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, - beta, c, ldc, stride_c, batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::real_half, library_data_t::real_half, - library_data_t::real_half, library_data_t::real_half): - { - detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, - sycl::half>(q, a_trans, b_trans, m, n, k, alpha, - a, lda, stride_a, b, ldb, stride_b, - beta, c, ldc, stride_c, batch_size); - break; - } -#ifdef __INTEL_MKL__ - case detail::get_type_combination_id( - library_data_t::real_bfloat16, library_data_t::real_bfloat16, - library_data_t::real_bfloat16, library_data_t::real_float): - { - detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, - oneapi::mkl::bfloat16, float>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, - beta, c, ldc, stride_c, batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::real_bfloat16, library_data_t::real_bfloat16, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, - float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, - stride_a, b, ldb, stride_b, beta, c, ldc, - stride_c, batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::real_int8, library_data_t::real_int8, - library_data_t::real_int32, library_data_t::real_int32): - { - detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t, - std::int32_t>(q, a_trans, b_trans, m, n, k, alpha, - a, lda, stride_a, b, ldb, stride_b, - beta, c, ldc, stride_c, batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::real_int8, library_data_t::real_int8, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, - beta, c, ldc, stride_c, batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::real_half, library_data_t::real_half, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_batch_impl<sycl::half, sycl::half, float, float>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, - beta, c, ldc, stride_c, batch_size); - break; - } -#endif - case detail::get_type_combination_id( - library_data_t::real_half, library_data_t::real_half, - library_data_t::real_half, library_data_t::real_float): - { - float alpha_value = - dpct::get_value(reinterpret_cast<const float *>(alpha), q); - float beta_value = - dpct::get_value(reinterpret_cast<const float *>(beta), q); - sycl::half alpha_half(alpha_value); - sycl::half beta_half(beta_value); - detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>( - q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, stride_a, b, ldb, stride_b, - &beta_half, c, ldc, stride_c, batch_size); - break; - } - default: - throw std::runtime_error("the combination of data type is unsupported"); - } - } - - static inline void - async_dpct_memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr, - size_t from_pitch, size_t x, size_t y, - memcpy_direction direction = automatic, - sycl::queue &q = get_default_queue()) - { - detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y, - direction); - } - - using err0 = detail::generic_error_type<struct err0_tag, int>; - using err1 = detail::generic_error_type<struct err1_tag, int>; - - static inline void dpct_free(void *ptr, sycl::queue &q = get_default_queue()) { - detail::dpct_free(ptr, q); - } - - /// dpct accessor used as device function parameter. - template <class T, memory_region Memory, size_t Dimension> class accessor; - template <class T, memory_region Memory> class accessor<T, Memory, 3> { - public: - using memory_t = detail::memory_traits<Memory, T>; - using element_t = typename memory_t::element_t; - using pointer_t = typename memory_t::pointer_t; - using accessor_t = typename memory_t::template accessor_t<3>; - accessor(pointer_t data, const sycl::range<3> &in_range) - : _data(data), _range(in_range) {} - template <memory_region M = Memory> - accessor(typename std::enable_if<M != local, const accessor_t>::type &acc) - : accessor(acc, acc.get_range()) {} - accessor(const accessor_t &acc, const sycl::range<3> &in_range) - : accessor(acc.get_pointer(), in_range) {} - accessor<T, Memory, 2> operator[](size_t index) const { - sycl::range<2> sub(_range.get(1), _range.get(2)); - return accessor<T, Memory, 2>(_data + index * sub.size(), sub); - } - - pointer_t get_ptr() const { return _data; } - - private: - pointer_t _data; - sycl::range<3> _range; - }; - template <class T, memory_region Memory> class accessor<T, Memory, 2> { - public: - using memory_t = detail::memory_traits<Memory, T>; - using element_t = typename memory_t::element_t; - using pointer_t = typename memory_t::pointer_t; - using accessor_t = typename memory_t::template accessor_t<2>; - accessor(pointer_t data, const sycl::range<2> &in_range) - : _data(data), _range(in_range) {} - template <memory_region M = Memory> - accessor(typename std::enable_if<M != local, const accessor_t>::type &acc) - : accessor(acc, acc.get_range()) {} - accessor(const accessor_t &acc, const sycl::range<2> &in_range) - : accessor(acc.get_pointer(), in_range) {} - - pointer_t operator[](size_t index) const { - return _data + _range.get(1) * index; - } - - pointer_t get_ptr() const { return _data; } - - private: - pointer_t _data; - sycl::range<2> _range; - }; - - namespace detail { - /// Device variable with address space of shared, global or constant. - template <class T, memory_region Memory, size_t Dimension> class device_memory { - public: - using accessor_t = - typename detail::memory_traits<Memory, - T>::template accessor_t<Dimension>; - using value_t = typename detail::memory_traits<Memory, T>::value_t; - using dpct_accessor_t = dpct::accessor<T, Memory, Dimension>; - - device_memory() : device_memory(sycl::range<Dimension>(1)) {} - - /// Constructor of 1-D array with initializer list - device_memory(const sycl::range<Dimension> &in_range, - std::initializer_list<value_t> &&init_list) - : device_memory(in_range) { - assert(init_list.size() <= in_range.size()); - _host_ptr = (value_t *)std::malloc(_size); - std::memset(_host_ptr, 0, _size); - std::memcpy(_host_ptr, init_list.begin(), init_list.size() * sizeof(T)); - } - - /// Constructor of 2-D array with initializer list - template <size_t D = Dimension> - device_memory( - const typename std::enable_if<D == 2, sycl::range<2>>::type &in_range, - std::initializer_list<std::initializer_list<value_t>> &&init_list) - : device_memory(in_range) { - assert(init_list.size() <= in_range[0]); - _host_ptr = (value_t *)std::malloc(_size); - std::memset(_host_ptr, 0, _size); - auto tmp_data = _host_ptr; - for (auto sub_list : init_list) { - assert(sub_list.size() <= in_range[1]); - std::memcpy(tmp_data, sub_list.begin(), - sub_list.size() * sizeof(T)); - tmp_data += in_range[1]; - } - } - - /// Constructor with range - device_memory(const sycl::range<Dimension> &range_in) - : _size(range_in.size() * sizeof(T)), _range(range_in), - _reference(false), _host_ptr(nullptr), _device_ptr(nullptr) { - static_assert( - (Memory == global) || (Memory == constant) || (Memory == shared), - "device memory region should be global, constant or shared"); - // Make sure that singleton class mem_mgr and dev_mgr will destruct - // later than this. - detail::mem_mgr::instance(); - dev_mgr::instance(); - } - - /// Constructor with range - template <class... Args> - device_memory(Args... Arguments) - : device_memory(sycl::range<Dimension>(Arguments...)) {} - - ~device_memory() { - if (_device_ptr && !_reference) - dpct::dpct_free(_device_ptr); - if (_host_ptr) - std::free(_host_ptr); - } - - /// Allocate memory with default queue, and init memory if has initial - /// value. - void init() { init(dpct::get_default_queue()); } - /// Allocate memory with specified queue, and init memory if has initial - /// value. - void init(sycl::queue &q) { - if (_device_ptr) - return; - if (!_size) - return; - allocate_device(q); - if (_host_ptr) - detail::dpct_memcpy(q, _device_ptr, _host_ptr, _size, - host_to_device); - } - - /// The variable is assigned to a device pointer. - void assign(value_t *src, size_t size) { - this->~device_memory(); - new (this) device_memory(src, size); - } - - /// Get memory pointer of the memory object, which is virtual pointer when - /// usm is not used, and device pointer when usm is used. - value_t *get_ptr() { return get_ptr(get_default_queue()); } - /// Get memory pointer of the memory object, which is virtual pointer when - /// usm is not used, and device pointer when usm is used. - value_t *get_ptr(sycl::queue &q) { - init(q); - return _device_ptr; - } - - /// Get the device memory object size in bytes. - size_t get_size() { return _size; } - - template <size_t D = Dimension> - typename std::enable_if<D == 1, T>::type &operator[](size_t index) { - init(); - return _device_ptr[index]; - } - - /// Get dpct::accessor with dimension info for the device memory object - /// when usm is used and dimension is greater than 1. - template <size_t D = Dimension> - typename std::enable_if<D != 1, dpct_accessor_t>::type - get_access([[maybe_unused]] sycl::handler &cgh) { - return dpct_accessor_t((T *)_device_ptr, _range); - } - - private: - device_memory(value_t *memory_ptr, size_t size) - : _size(size), _range(size / sizeof(T)), _reference(true), - _device_ptr(memory_ptr) {} - - void allocate_device(sycl::queue &q) { - #ifndef DPCT_USM_LEVEL_NONE - if (Memory == shared) { - _device_ptr = (value_t *)sycl::malloc_shared(_size, q.get_device(), - q.get_context()); - return; - } - #ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY - if (Memory == constant) { - _device_ptr = (value_t *)sycl::malloc_device( - _size, q.get_device(), q.get_context(), - sycl::ext::oneapi::property::usm::device_read_only()); - return; - } - #endif - #endif - _device_ptr = (value_t *)detail::dpct_malloc(_size, q); - } - - size_t _size; - sycl::range<Dimension> _range; - bool _reference; - value_t *_host_ptr; - value_t *_device_ptr; - }; - template <class T, memory_region Memory> - class device_memory<T, Memory, 0> : public device_memory<T, Memory, 1> { - public: - using base = device_memory<T, Memory, 1>; - using value_t = typename base::value_t; - using accessor_t = - typename detail::memory_traits<Memory, T>::template accessor_t<0>; - - /// Constructor with initial value. - device_memory(const value_t &val) : base(sycl::range<1>(1), {val}) {} - - /// Default constructor - device_memory() : base(1) {} - }; - } // namespace detail - - template <class T, size_t Dimension> - using global_memory = detail::device_memory<T, global, Dimension>; - template <class T, size_t Dimension> - using constant_memory = detail::device_memory<T, constant, Dimension>; - template <class T, size_t Dimension> - using shared_memory = detail::device_memory<T, shared, Dimension>; - - - template <typename T, - sycl::access::address_space addressSpace = - sycl::access::address_space::global_space, - sycl::memory_order memoryOrder = sycl::memory_order::relaxed, - sycl::memory_scope memoryScope = sycl::memory_scope::device> - inline T atomic_fetch_add(T *addr, T operand) { - auto atm = - sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]); - return atm.fetch_add(operand); - } - - template <sycl::access::address_space addressSpace = - sycl::access::address_space::global_space, - sycl::memory_order memoryOrder = sycl::memory_order::relaxed, - sycl::memory_scope memoryScope = sycl::memory_scope::device, - typename T1, typename T2> - inline T1 atomic_fetch_add(T1 *addr, T2 operand) { - auto atm = - sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]); - return atm.fetch_add(operand); - } - - template <typename T, sycl::access::address_space addressSpace = - sycl::access::address_space::global_space> - inline T atomic_fetch_add(T *addr, T operand, - sycl::memory_order memoryOrder) { - switch (memoryOrder) { - case sycl::memory_order::relaxed: - return atomic_fetch_add<T, addressSpace, sycl::memory_order::relaxed, - sycl::memory_scope::device>(addr, operand); - case sycl::memory_order::acq_rel: - return atomic_fetch_add<T, addressSpace, sycl::memory_order::acq_rel, - sycl::memory_scope::device>(addr, operand); - case sycl::memory_order::seq_cst: - return atomic_fetch_add<T, addressSpace, sycl::memory_order::seq_cst, - sycl::memory_scope::device>(addr, operand); - default: - assert(false && "Invalid memory_order for atomics. Valid memory_order for " - "atomics are: sycl::memory_order::relaxed, " - "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!"); - } - } - - template <sycl::access::address_space addressSpace = - sycl::access::address_space::global_space, - typename T1, typename T2> - inline T1 atomic_fetch_add(T1 *addr, T2 operand, - sycl::memory_order memoryOrder) { - atomic_fetch_add<T1, addressSpace>(addr, operand, memoryOrder); - } - -} // COPY from DPCT head files - -#endif // GGML_SYCL_DPCT_HELPER_HPP |