summaryrefslogtreecommitdiff
path: root/ggml-cuda/common.cuh
diff options
context:
space:
mode:
Diffstat (limited to 'ggml-cuda/common.cuh')
-rw-r--r--ggml-cuda/common.cuh550
1 files changed, 550 insertions, 0 deletions
diff --git a/ggml-cuda/common.cuh b/ggml-cuda/common.cuh
new file mode 100644
index 00000000..33c8ed1d
--- /dev/null
+++ b/ggml-cuda/common.cuh
@@ -0,0 +1,550 @@
+#pragma once
+
+#include "../ggml.h"
+#include "../ggml-cuda.h"
+#include <memory>
+
+#if defined(GGML_USE_HIPBLAS)
+#define GGML_COMMON_DECL_HIP
+#define GGML_COMMON_IMPL_HIP
+#else
+#define GGML_COMMON_DECL_CUDA
+#define GGML_COMMON_IMPL_CUDA
+#endif
+#include "../ggml-common.h"
+
+#include <cstdio>
+#include <array>
+#include <cassert>
+#include <cfloat>
+#include <string>
+
+#if defined(GGML_USE_HIPBLAS)
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#ifdef __HIP_PLATFORM_AMD__
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+#endif // __HIP_PLATFORM_AMD__
+#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N HIPBLAS_OP_N
+#define CUBLAS_OP_T HIPBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH 0
+#define CUDA_R_16F HIPBLAS_R_16F
+#define CUDA_R_32F HIPBLAS_R_32F
+#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
+#define cublasCreate hipblasCreate
+#define cublasDestroy hipblasDestroy
+#define cublasGemmEx hipblasGemmEx
+#define cublasGemmBatchedEx hipblasGemmBatchedEx
+#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
+#define cublasHandle_t hipblasHandle_t
+#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+#define cublasSetStream hipblasSetStream
+#define cublasSgemm hipblasSgemm
+#define cublasStatus_t hipblasStatus_t
+#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
+#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
+#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
+#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaError_t hipError_t
+#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
+#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDisableTiming hipEventDisableTiming
+#define cudaEventRecord hipEventRecord
+#define cudaEventSynchronize hipEventSynchronize
+#define cudaEvent_t hipEvent_t
+#define cudaEventDestroy hipEventDestroy
+#define cudaFree hipFree
+#define cudaFreeHost hipHostFree
+#define cudaGetDevice hipGetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#define cudaHostRegister hipHostRegister
+#define cudaHostRegisterPortable hipHostRegisterPortable
+#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
+#define cudaHostUnregister hipHostUnregister
+#define cudaLaunchHostFunc hipLaunchHostFunc
+#ifdef GGML_HIP_UMA
+#define cudaMalloc hipMallocManaged
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
+#else
+#define cudaMalloc hipMalloc
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#endif
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
+#define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyKind hipMemcpyKind
+#define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
+#define cudaMemGetInfo hipMemGetInfo
+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+#define cudaSetDevice hipSetDevice
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamDestroy hipStreamDestroy
+#define cudaStreamFireAndForget hipStreamFireAndForget
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamPerThread hipStreamPerThread
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
+#define __trap abort
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
+#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
+#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
+#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
+#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
+#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
+#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
+#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
+#else
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include <cublas_v2.h>
+#include <cuda_fp16.h>
+
+#if CUDART_VERSION < 11020
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
+#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
+#define CUBLAS_COMPUTE_16F CUDA_R_16F
+#define CUBLAS_COMPUTE_32F CUDA_R_32F
+#define cublasComputeType_t cudaDataType_t
+#endif // CUDART_VERSION < 11020
+
+#endif // defined(GGML_USE_HIPBLAS)
+
+#define STRINGIZE_IMPL(...) #__VA_ARGS__
+#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
+
+#define WARP_SIZE 32
+#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
+
+#define CC_PASCAL 600
+#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define CC_VOLTA 700
+#define CC_OFFSET_AMD 1000000
+#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
+#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
+#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
+
+// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
+// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
+// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
+// - 7B quantum model: +100-200 MB
+// - 13B quantum model: +200-400 MB
+//
+//#define GGML_CUDA_FORCE_MMQ
+
+// TODO: improve this to be correct for more hardware
+// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
+#if !defined(GGML_CUDA_FORCE_MMQ)
+#define CUDA_USE_TENSOR_CORES
+#endif
+
+#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
+#define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available
+
+#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+#define GGML_CUDA_MAX_STREAMS 8
+
+[[noreturn]]
+void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
+
+#define CUDA_CHECK_GEN(err, success, error_fn) \
+ do { \
+ auto err_ = (err); \
+ if (err_ != (success)) { \
+ ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
+ } \
+ } while (0)
+
+#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
+
+#if CUDART_VERSION >= 12000
+ static const char * cublas_get_error_str(const cublasStatus_t err) {
+ return cublasGetStatusString(err);
+ }
+#else
+ static const char * cublas_get_error_str(const cublasStatus_t err) {
+ switch (err) {
+ case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
+ case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
+ case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
+ case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
+ case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
+ case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
+ case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
+ case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
+ case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
+ default: return "unknown error";
+ }
+ }
+#endif // CUDART_VERSION >= 12000
+
+#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
+
+#if !defined(GGML_USE_HIPBLAS)
+static const char * cu_get_error_str(CUresult err) {
+ const char * err_str;
+ cuGetErrorString(err, &err_str);
+ return err_str;
+}
+#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
+#endif
+
+#if CUDART_VERSION >= 11100
+#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
+#else
+#define GGML_CUDA_ASSUME(x)
+#endif // CUDART_VERSION >= 11100
+
+#ifdef GGML_CUDA_F16
+typedef half dfloat; // dequantize float
+typedef half2 dfloat2;
+#else
+typedef float dfloat; // dequantize float
+typedef float2 dfloat2;
+#endif //GGML_CUDA_F16
+
+[[noreturn]]
+static __device__ void no_device_code(
+ const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
+ file_name, line, function_name, arch);
+ GGML_UNUSED(arch_list);
+#else
+ printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
+ file_name, line, function_name, arch, arch_list);
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ __trap();
+
+ GGML_UNUSED(no_device_code); // suppress unused function warning
+}
+
+#ifdef __CUDA_ARCH__
+#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
+#else
+#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
+#endif // __CUDA_ARCH__
+
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+ }
+ return x;
+}
+
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+ a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+ }
+ return a;
+}
+
+#ifdef GGML_CUDA_F16
+static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+ }
+ return a;
+#else
+ GGML_UNUSED(a);
+ NO_DEVICE_CODE;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+}
+#endif // GGML_CUDA_F16
+
+static __device__ __forceinline__ float warp_reduce_max(float x) {
+#pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+ }
+ return x;
+}
+
+//static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
+//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
+//#pragma unroll
+// for (int mask = 16; mask > 0; mask >>= 1) {
+// x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+// }
+// return x;
+//#else
+// GGML_UNUSED(x);
+// NO_DEVICE_CODE;
+//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
+//}
+
+
+#if defined(GGML_USE_HIPBLAS)
+#define __CUDA_ARCH__ 1300
+
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
+ defined(__gfx1150__) || defined(__gfx1151__)
+#define RDNA3
+#endif
+
+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
+ defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
+#define RDNA2
+#endif
+
+#ifndef __has_builtin
+ #define __has_builtin(x) 0
+#endif
+
+typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
+static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+ const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+ const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
+ const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+ return reinterpret_cast<const int &>(c);
+#else
+ int8x4_t c;
+ int16_t tmp;
+#pragma unroll
+ for (int i = 0; i < 4; i++) {
+ tmp = va[i] - vb[i];
+ if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+ if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+ c[i] = tmp;
+ }
+ return reinterpret_cast<int &>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
+}
+
+static __device__ __forceinline__ int __vsub4(const int a, const int b) {
+ return __vsubss4(a, b);
+}
+
+static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
+ const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
+ const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
+ unsigned int c;
+ uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+ for (int i = 0; i < 4; ++i) {
+ vc[i] = va[i] == vb[i] ? 0xff : 0x00;
+ }
+ return c;
+}
+
+static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+ c = __builtin_amdgcn_sdot4(a, b, c, false);
+#elif defined(RDNA3)
+ c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+#elif defined(__gfx1010__) || defined(__gfx900__)
+ int tmp1;
+ int tmp2;
+ asm("\n \
+ v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+ v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+ v_add3_u32 %0, %1, %2, %0 \n \
+ v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+ v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+ v_add3_u32 %0, %1, %2, %0 \n \
+ "
+ : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+ : "v"(a), "v"(b)
+ );
+#else
+ const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+ const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+ c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+ return c;
+}
+#endif // defined(GGML_USE_HIPBLAS)
+
+// TODO: move to ggml-common.h
+static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
+typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
+
+
+//////////////////////
+
+struct ggml_cuda_device_info {
+ int device_count;
+
+ struct cuda_device_info {
+ int cc; // compute capability
+ size_t smpb; // max. shared memory per block
+ bool vmm; // virtual memory support
+ size_t vmm_granularity; // granularity of virtual memory
+ size_t total_vram;
+ };
+
+ cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
+
+ std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
+};
+
+const ggml_cuda_device_info & ggml_cuda_info();
+
+void ggml_cuda_set_device(int device);
+int ggml_cuda_get_device();
+
+struct ggml_cuda_pool {
+ virtual ~ggml_cuda_pool() = default;
+
+ virtual void * alloc(size_t size, size_t * actual_size) = 0;
+ virtual void free(void * ptr, size_t size) = 0;
+};
+
+template<typename T>
+struct ggml_cuda_pool_alloc {
+ ggml_cuda_pool * pool = nullptr;
+ T * ptr = nullptr;
+ size_t actual_size = 0;
+
+ ggml_cuda_pool_alloc() = default;
+
+ explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) {
+ }
+
+ ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) {
+ alloc(size);
+ }
+
+ ~ggml_cuda_pool_alloc() {
+ if (ptr != nullptr) {
+ pool->free(ptr, actual_size);
+ }
+ }
+
+ // size is in number of elements
+ T * alloc(size_t size) {
+ GGML_ASSERT(pool != nullptr);
+ GGML_ASSERT(ptr == nullptr);
+ ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
+ return ptr;
+ }
+
+ T * alloc(ggml_cuda_pool & pool, size_t size) {
+ this->pool = &pool;
+ return alloc(size);
+ }
+
+ T * get() {
+ return ptr;
+ }
+
+ ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete;
+ ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete;
+ ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete;
+ ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;
+};
+
+
+// backend interface
+
+struct ggml_tensor_extra_gpu {
+ void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+ cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
+};
+
+struct ggml_backend_cuda_context {
+ int device;
+ std::string name;
+ cudaEvent_t copy_event = nullptr;
+
+ cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
+ cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+ explicit ggml_backend_cuda_context(int device) :
+ device(device),
+ name(GGML_CUDA_NAME + std::to_string(device)) {
+ }
+
+ ~ggml_backend_cuda_context() {
+ if (copy_event != nullptr) {
+ CUDA_CHECK(cudaEventDestroy(copy_event));
+ }
+ for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
+ for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
+ if (streams[i][j] != nullptr) {
+ CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
+ }
+ }
+ if (cublas_handles[i] != nullptr) {
+ CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
+ }
+ }
+ }
+
+ cudaStream_t stream(int device, int stream) {
+ if (streams[device][stream] == nullptr) {
+ ggml_cuda_set_device(device);
+ CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking));
+ }
+ return streams[device][stream];
+ }
+
+ cudaStream_t stream() {
+ return stream(device, 0);
+ }
+
+ cublasHandle_t cublas_handle(int device) {
+ if (cublas_handles[device] == nullptr) {
+ ggml_cuda_set_device(device);
+ CUBLAS_CHECK(cublasCreate(&cublas_handles[device]));
+ CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH));
+ }
+ return cublas_handles[device];
+ }
+
+ cublasHandle_t cublas_handle() {
+ return cublas_handle(device);
+ }
+
+ // pool
+ std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];
+
+ static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);
+
+ ggml_cuda_pool & pool(int device) {
+ if (pools[device] == nullptr) {
+ pools[device] = new_pool_for_device(device);
+ }
+ return *pools[device];
+ }
+
+ ggml_cuda_pool & pool() {
+ return pool(device);
+ }
+};