Diffstat (limited to 'ggml-cuda/common.cuh')
-rw-r--r-- | ggml-cuda/common.cuh | 871 |
1 file changed, 0 insertions, 871 deletions
diff --git a/ggml-cuda/common.cuh b/ggml-cuda/common.cuh
deleted file mode 100644
index 1c2d7215..00000000
--- a/ggml-cuda/common.cuh
+++ /dev/null
@@ -1,871 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-cuda.h"
-
-#include <memory>
-
-#if defined(GGML_USE_HIPBLAS)
-#define GGML_COMMON_DECL_HIP
-#define GGML_COMMON_IMPL_HIP
-#else
-#define GGML_COMMON_DECL_CUDA
-#define GGML_COMMON_IMPL_CUDA
-#endif
-#include "ggml-common.h"
-
-#include <cstdio>
-#include <array>
-#include <cassert>
-#include <cfloat>
-#include <string>
-#include <vector>
-
-#if defined(GGML_USE_HIPBLAS)
-#include <hip/hip_runtime.h>
-#include <hipblas/hipblas.h>
-#include <hip/hip_fp16.h>
-#ifdef __HIP_PLATFORM_AMD__
-// for rocblas_initialize()
-#include "rocblas/rocblas.h"
-#endif // __HIP_PLATFORM_AMD__
-#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
-#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
-#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
-#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_OP_N HIPBLAS_OP_N
-#define CUBLAS_OP_T HIPBLAS_OP_T
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_TF32_TENSOR_OP_MATH 0
-#define CUDA_R_16F HIPBLAS_R_16F
-#define CUDA_R_32F HIPBLAS_R_32F
-#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
-#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
-#define cublasCreate hipblasCreate
-#define cublasDestroy hipblasDestroy
-#define cublasGemmEx hipblasGemmEx
-#define cublasGemmBatchedEx hipblasGemmBatchedEx
-#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-#define cublasHandle_t hipblasHandle_t
-#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
-#define cublasSetStream hipblasSetStream
-#define cublasSgemm hipblasSgemm
-#define cublasStatus_t hipblasStatus_t
-#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
-#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
-#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
-#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
-#define cudaDeviceProp hipDeviceProp_t
-#define cudaDeviceSynchronize hipDeviceSynchronize
-#define cudaError_t hipError_t
-#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
-#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
-#define cudaEventCreateWithFlags hipEventCreateWithFlags
-#define cudaEventDisableTiming hipEventDisableTiming
-#define cudaEventRecord hipEventRecord
-#define cudaEventSynchronize hipEventSynchronize
-#define cudaEvent_t hipEvent_t
-#define cudaEventDestroy hipEventDestroy
-#define cudaFree hipFree
-#define cudaFreeHost hipHostFree
-#define cudaGetDevice hipGetDevice
-#define cudaGetDeviceCount hipGetDeviceCount
-#define cudaGetDeviceProperties hipGetDeviceProperties
-#define cudaGetErrorString hipGetErrorString
-#define cudaGetLastError hipGetLastError
-#define cudaHostRegister hipHostRegister
-#define cudaHostRegisterPortable hipHostRegisterPortable
-#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
-#define cudaHostUnregister hipHostUnregister
-#define cudaLaunchHostFunc hipLaunchHostFunc
-#define cudaMalloc hipMalloc
-#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
-#define cudaMemcpy hipMemcpy
-#define cudaMemcpyAsync hipMemcpyAsync
-#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
-#define cudaMemcpy2DAsync hipMemcpy2DAsync
-#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
-#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
-#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
-#define cudaMemcpyKind hipMemcpyKind
-#define cudaMemset hipMemset
-#define cudaMemsetAsync hipMemsetAsync
-#define cudaMemGetInfo hipMemGetInfo
-#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
-#define cudaSetDevice hipSetDevice
-#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
-#define cudaStreamDestroy hipStreamDestroy
-#define cudaStreamFireAndForget hipStreamFireAndForget
-#define cudaStreamNonBlocking hipStreamNonBlocking
-#define cudaStreamPerThread hipStreamPerThread
-#define cudaStreamSynchronize hipStreamSynchronize
-#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
-#define cudaStream_t hipStream_t
-#define cudaSuccess hipSuccess
-#define __trap abort
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
-#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
-#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
-#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
-#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
-#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
-#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
-#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
-#else
-#include <cuda_runtime.h>
-#include <cuda.h>
-#include <cublas_v2.h>
-#include <cuda_fp16.h>
-
-#if CUDART_VERSION < 11020
-#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
-#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
-#define CUBLAS_COMPUTE_16F CUDA_R_16F
-#define CUBLAS_COMPUTE_32F CUDA_R_32F
-#define cublasComputeType_t cudaDataType_t
-#endif // CUDART_VERSION < 11020
-
-#endif // defined(GGML_USE_HIPBLAS)
-
-#define STRINGIZE_IMPL(...) #__VA_ARGS__
-#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
-
-#define WARP_SIZE 32
-#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
-#define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons
-
-#define CC_PASCAL 600
-#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define CC_VOLTA 700
-#define CC_TURING 750
-#define CC_AMPERE 800
-#define CC_OFFSET_AMD 1000000
-#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
-#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
-#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
-
-// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
-// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
-// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
-// - 7B quantum model: +100-200 MB
-// - 13B quantum model: +200-400 MB
-//
-//#define GGML_CUDA_FORCE_MMQ
-
-// TODO: improve this to be correct for more hardware
-// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
-#if !defined(GGML_CUDA_FORCE_MMQ)
-#define CUDA_USE_TENSOR_CORES
-#endif
-
-#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
-#define MMQ_MAX_BATCH_SIZE 64 // max batch size to use MMQ kernels when tensor cores are available
-
-#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#define GGML_CUDA_MAX_STREAMS 8
-
-[[noreturn]]
-void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
-
-#define CUDA_CHECK_GEN(err, success, error_fn) \
-    do { \
-        auto err_ = (err); \
-        if (err_ != (success)) { \
-            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
-        } \
-    } while (0)
-
-#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
-
-#if CUDART_VERSION >= 12000
-    static const char * cublas_get_error_str(const cublasStatus_t err) {
-        return cublasGetStatusString(err);
-    }
-#else
-    static const char * cublas_get_error_str(const cublasStatus_t err) {
-        switch (err) {
-            case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
-            case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
-            case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
-            case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
-            case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
-            case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
-            case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
-            case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
-            case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
-            default: return "unknown error";
-        }
-    }
-#endif // CUDART_VERSION >= 12000
-
-#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
-
-#if !defined(GGML_USE_HIPBLAS)
-static const char * cu_get_error_str(CUresult err) {
-    const char * err_str;
-    cuGetErrorString(err, &err_str);
-    return err_str;
-}
-#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
-#endif
-
-#if CUDART_VERSION >= 11100
-#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
-#else
-#define GGML_CUDA_ASSUME(x)
-#endif // CUDART_VERSION >= 11100
-
-#ifdef GGML_CUDA_F16
-typedef half dfloat; // dequantize float
-typedef half2 dfloat2;
-#else
-typedef float dfloat; // dequantize float
-typedef float2 dfloat2;
-#endif //GGML_CUDA_F16
-
-#if defined(GGML_USE_HIPBLAS)
-#define __CUDA_ARCH__ 1300
-
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
-    defined(__gfx1150__) || defined(__gfx1151__)
-#define RDNA3
-#endif
-
-#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
-    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
-#define RDNA2
-#endif
-
-#ifndef __has_builtin
-    #define __has_builtin(x) 0
-#endif
-
-typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
-typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
-static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-#if __has_builtin(__builtin_elementwise_sub_sat)
-    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
-    return reinterpret_cast<const int &>(c);
-#else
-    int8x4_t c;
-    int16_t tmp;
-#pragma unroll
-    for (int i = 0; i < 4; i++) {
-        tmp = va[i] - vb[i];
-        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
-        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
-        c[i] = tmp;
-    }
-    return reinterpret_cast<int &>(c);
-#endif // __has_builtin(__builtin_elementwise_sub_sat)
-}
-
-static __device__ __forceinline__ int __vsub4(const int a, const int b) {
-    return __vsubss4(a, b);
-}
-
-static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
-    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
-    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
-    unsigned int c;
-    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
-#pragma unroll
-    for (int i = 0; i < 4; ++i) {
-        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
-    }
-    return c;
-}
-
-static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
-    c = __builtin_amdgcn_sdot4(a, b, c, false);
-#elif defined(RDNA3)
-    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
-#elif defined(__gfx1010__) || defined(__gfx900__)
-    int tmp1;
-    int tmp2;
-    asm("\n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        "
-        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
-        : "v"(a), "v"(b)
-    );
-#else
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
-#endif
-    return c;
-}
-
-#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
-// __shfl_xor() for half2 was added in ROCm 5.6
-static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
-    typedef union half2_b32 {
-        half2 val;
-        int b32;
-    } half2_b32_t;
-    half2_b32_t tmp;
-    tmp.val = var;
-    tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
-    return tmp.val;
-}
-#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
-#endif // defined(GGML_USE_HIPBLAS)
-
-#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
-#define FP16_AVAILABLE
-#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
-
-#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
-#define FAST_FP16_AVAILABLE
-#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
-
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
-#define FP16_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
-
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
-#define INT8_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
-
-static bool fast_fp16_available(const int cc) {
-    return cc >= CC_PASCAL && cc != 610;
-}
-
-static bool fp16_mma_available(const int cc) {
-    return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
-}
-
-static bool int8_mma_available(const int cc) {
-    return cc < CC_OFFSET_AMD && cc >= CC_TURING;
-}
-
-[[noreturn]]
-static __device__ void no_device_code(
-    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
-           file_name, line, function_name, arch);
-    GGML_UNUSED(arch_list);
-#else
-    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
-           file_name, line, function_name, arch, arch_list);
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    __trap();
-
-    GGML_UNUSED(no_device_code); // suppress unused function warning
-}
-
-#ifdef __CUDA_ARCH__
-#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
-#else
-#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
-#endif // __CUDA_ARCH__
-
-static __device__ __forceinline__ float warp_reduce_sum(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
-    }
-    return x;
-}
-
-static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
-    }
-    return a;
-}
-
-static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
-#ifdef FP16_AVAILABLE
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
-        reinterpret_cast<half&>(a.x) += __low2half(a_other);
-        reinterpret_cast<half&>(a.y) += __high2half(a_other);
-    }
-    return a;
-#else
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
-    }
-    return a;
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-
-#else
-    NO_DEVICE_CODE;
-    return a;
-#endif // FP16_AVAILABLE
-}
-
-static __device__ __forceinline__ float warp_reduce_max(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-    }
-    return x;
-}
-
-static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
-#ifdef FP16_AVAILABLE
-
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
-    return __float2half(fmaxf(__half2float(a), __half2float(b)));
-#else
-    return __hmax(a, b);
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
-
-#else
-    NO_DEVICE_CODE;
-    GGML_UNUSED(b);
-    return a;
-#endif // FP16_AVAILABLE
-}
-
-static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-
-#if CUDART_VERSION >= CUDART_HMAX
-    return __hmax2(a, b);
-#else
-    half2 ret;
-    reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a), __low2float(b)));
-    reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
-    return ret;
-#endif // CUDART_VERSION >= CUDART_HMAX
-
-#else
-    GGML_UNUSED(a);
-    GGML_UNUSED(b);
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-}
-
-static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-    }
-    return x;
-#else
-    GGML_UNUSED(x);
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-}
-
-#if CUDART_VERSION < CUDART_HMASK
-static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
-    const uint32_t mask_low = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
-    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
-    return mask_low | mask_high;
-}
-#endif // CUDART_VERSION < 12000
-
-// TODO: move to ggml-common.h
-static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
-
-typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
-
-static __device__ __forceinline__ float get_alibi_slope(
-    const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1
-) {
-    if (max_bias <= 0.0f) {
-        return 1.0f;
-    }
-    const float base = h < n_head_log2 ? m0 : m1;
-    const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
-
-    return powf(base, exph);
-}
-
-static __device__ __forceinline__ float iq1bn_fp8_to_float(uint8_t fp8) {
-    typedef union { float f; uint32_t i; } scale_t;
-    scale_t s; s.i = (((fp8 >> 5) + 116) << 23) | ((fp8 & 0x1f) << 18);
-    return s.f;
-}
-
-template <ggml_type type>
-struct ggml_cuda_type_traits;
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_F16> {
-    static constexpr int qk = 1;
-    static constexpr int qr = 1;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_Q4_0> {
-    static constexpr int qk = QK4_0;
-    static constexpr int qr = QR4_0;
-    static constexpr int qi = QI4_0;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_Q4_1> {
-    static constexpr int qk = QK4_1;
-    static constexpr int qr = QR4_1;
-    static constexpr int qi = QI4_1;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_Q5_0> {
-    static constexpr int qk = QK5_0;
-    static constexpr int qr = QR5_0;
-    static constexpr int qi = QI5_0;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
-    static constexpr int qk = QK5_1;
-    static constexpr int qr = QR5_1;
-    static constexpr int qi = QI5_1;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
-    static constexpr int qk = QK8_0;
-    static constexpr int qr = QR8_0;
-    static constexpr int qi = QI8_0;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> {
-    static constexpr int qk = QK_K;
-    static constexpr int qr = QR2_K;
-    static constexpr int qi = QI2_K;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_Q3_K> {
-    static constexpr int qk = QK_K;
-    static constexpr int qr = QR3_K;
-    static constexpr int qi = QI3_K;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_Q4_K> {
-    static constexpr int qk = QK_K;
-    static constexpr int qr = QR4_K;
-    static constexpr int qi = QI4_K;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_Q5_K> {
-    static constexpr int qk = QK_K;
-    static constexpr int qr = QR5_K;
-    static constexpr int qi = QI5_K;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_Q6_K> {
-    static constexpr int qk = QK_K;
-    static constexpr int qr = QR6_K;
-    static constexpr int qi = QI6_K;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XXS> {
-    static constexpr int qk = QK_K;
-    static constexpr int qr = QR2_XXS;
-    static constexpr int qi = QI2_XXS;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XS> {
-    static constexpr int qk = QK_K;
-    static constexpr int qr = QR2_XS;
-    static constexpr int qi = QI2_XS;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_IQ2_S> {
-    static constexpr int qk = QK_K;
-    static constexpr int qr = QR2_S;
-    static constexpr int qi = QI2_S;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_IQ3_XXS> {
-    static constexpr int qk = QK_K;
-    static constexpr int qr = QR3_XXS;
-    static constexpr int qi = QI3_XXS;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_IQ1_S> {
-    static constexpr int qk = QK_K;
-    static constexpr int qr = QR1_S;
-    static constexpr int qi = QI1_S;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M> {
-    static constexpr int qk = QK_K;
-    static constexpr int qr = QR1_M;
-    static constexpr int qi = QI1_M;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_IQ1_BN> {
-    static constexpr int qk = QK_IQ1BN;
-    static constexpr int qr = QR1_BN;
-    static constexpr int qi = QI1_BN;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_IQ2_BN> {
-    static constexpr int qk = QK_IQ1BN;
-    static constexpr int qr = QR1_BN;
-    static constexpr int qi = QI1_BN;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
-    static constexpr int qk = QK4_NL;
-    static constexpr int qr = QR4_NL;
-    static constexpr int qi = QI4_NL;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
-    static constexpr int qk = QK_K;
-    static constexpr int qr = QR4_XS;
-    static constexpr int qi = QI4_XS;
-};
-
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
-    static constexpr int qk = QK_K;
-    static constexpr int qr = QR3_S;
-    static constexpr int qi = QI3_S;
-};
-
-static int get_mmq_x_max_host(const int cc) {
-#ifdef CUDA_USE_TENSOR_CORES
-    return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_MAX_BATCH_SIZE : 64;
-#else
-    return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64;
-#endif // CUDA_USE_TENSOR_CORES
-}
-
-// Round rows to this value for --split-mode row:
-static int get_mmq_y_host(const int cc) {
-    return cc >= CC_VOLTA ? 128 : 64;
-}
-
-//////////////////////
-
-struct ggml_cuda_device_info {
-    int device_count;
-
-    struct cuda_device_info {
-        int cc; // compute capability
-        int nsm; // number of streaming multiprocessors
-        size_t smpb; // max. shared memory per block
-        size_t smpbo; // max. shared memory per block (with opt-in)
-        bool vmm; // virtual memory support
-        size_t vmm_granularity; // granularity of virtual memory
-        size_t total_vram;
-    };
-
-    cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
-
-    std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
-};
-
-const ggml_cuda_device_info & ggml_cuda_info();
-
-void ggml_cuda_set_device(int device);
-int ggml_cuda_get_device();
-
-struct ggml_cuda_pool {
-    virtual ~ggml_cuda_pool() = default;
-
-    virtual void * alloc(size_t size, size_t * actual_size) = 0;
-    virtual void free(void * ptr, size_t size) = 0;
-};
-
-template<typename T>
-struct ggml_cuda_pool_alloc {
-    ggml_cuda_pool * pool = nullptr;
-    T * ptr = nullptr;
-    size_t actual_size = 0;
-
-    ggml_cuda_pool_alloc() = default;
-
-    explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) {
-    }
-
-    ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) {
-        alloc(size);
-    }
-
-    ~ggml_cuda_pool_alloc() {
-        if (ptr != nullptr) {
-            pool->free(ptr, actual_size);
-        }
-    }
-
-    // size is in number of elements
-    T * alloc(size_t size) {
-        GGML_ASSERT(pool != nullptr);
-        GGML_ASSERT(ptr == nullptr);
-        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
-        return ptr;
-    }
-
-    T * alloc(ggml_cuda_pool & pool, size_t size) {
-        this->pool = &pool;
-        return alloc(size);
-    }
-
-    T * get() {
-        return ptr;
-    }
-
-    ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete;
-    ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete;
-    ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete;
-    ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;
-};
-
-
-// backend interface
-
-struct ggml_tensor_extra_gpu {
-    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
-    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
-};
-
-
-#if (CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)
-#define USE_CUDA_GRAPH
-#endif
-
-struct ggml_graph_node_properties {
-    void * node_address;
-    ggml_op node_op;
-    int64_t ne[GGML_MAX_DIMS];
-    size_t nb[GGML_MAX_DIMS];
-    void * src_address[GGML_MAX_SRC];
-};
-
-struct ggml_cuda_graph {
-#ifdef USE_CUDA_GRAPH
-    ~ggml_cuda_graph() {
-        if (instance != nullptr) {
-            CUDA_CHECK(cudaGraphExecDestroy(instance));
-        }
-        if (graph != nullptr) {
-            CUDA_CHECK(cudaGraphDestroy(graph));
-        }
-    }
-    cudaGraph_t graph = nullptr;
-    cudaGraphExec_t instance = nullptr;
-    size_t num_nodes = 0;
-    std::vector<cudaGraphNode_t> nodes;
-    std::vector<cudaKernelNodeParams> params;
-    bool disable_due_to_gpu_arch = false;
-    bool disable_due_to_too_many_updates = false;
-    bool disable_due_to_failed_graph_capture = false;
-    int number_consecutive_updates = 0;
-    std::vector<ggml_graph_node_properties> ggml_graph_properties;
-    std::vector<char **> updated_kernel_arg;
-#endif
-};
-
-struct ggml_backend_cuda_context {
-    int device;
-    std::string name;
-    cudaEvent_t copy_event = nullptr;
-
-    cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
-    cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
-
-    std::unique_ptr<ggml_cuda_graph> cuda_graph;
-
-    explicit ggml_backend_cuda_context(int device) :
-        device(device),
-        name(GGML_CUDA_NAME + std::to_string(device)) {
-    }
-
-    ~ggml_backend_cuda_context() {
-        if (copy_event != nullptr) {
-            CUDA_CHECK(cudaEventDestroy(copy_event));
-        }
-        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
-            for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
-                if (streams[i][j] != nullptr) {
-                    CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
-                }
-            }
-            if (cublas_handles[i] != nullptr) {
-                CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
-            }
-        }
-    }
-
-    cudaStream_t stream(int device, int stream) {
-        if (streams[device][stream] == nullptr) {
-            ggml_cuda_set_device(device);
-            CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking));
-        }
-        return streams[device][stream];
-    }
-
-    cudaStream_t stream() {
-        return stream(device, 0);
-    }
-
-    cublasHandle_t cublas_handle(int device) {
-        if (cublas_handles[device] == nullptr) {
-            ggml_cuda_set_device(device);
-            CUBLAS_CHECK(cublasCreate(&cublas_handles[device]));
-            CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH));
-        }
-        return cublas_handles[device];
-    }
-
-    cublasHandle_t cublas_handle() {
-        return cublas_handle(device);
-    }
-
-    // pool
-    std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];
-
-    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);
-
-    ggml_cuda_pool & pool(int device) {
-        if (pools[device] == nullptr) {
-            pools[device] = new_pool_for_device(device);
-        }
-        return *pools[device];
-    }
-
-    ggml_cuda_pool & pool() {
-        return pool(device);
-    }
-};
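
For context on how the error-checking macros removed above are typically used at call sites, here is a minimal sketch. The call site and function name are hypothetical and assume the header is still reachable under its pre-removal name; only CUDA_CHECK/CUDA_CHECK_GEN and ggml_cuda_error come from the deleted file.

// Hypothetical call site for the CUDA_CHECK macro from the removed header.
// CUDA_CHECK expands to CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString),
// which calls ggml_cuda_error() (a [[noreturn]] function) when the call fails.
#include "common.cuh"

static void copy_to_device(float * dst_d, const float * src_h, size_t n, cudaStream_t stream) {
    CUDA_CHECK(cudaMemcpyAsync(dst_d, src_h, n * sizeof(float), cudaMemcpyHostToDevice, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));
}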
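The warp_reduce_* helpers in the removed header implement a butterfly reduction across the 32 lanes of a warp using __shfl_xor_sync (mapped to __shfl_xor on HIP). A minimal kernel sketch using them follows; the kernel itself is hypothetical and not part of this diff.

// Hypothetical kernel: each warp sums 32 consecutive floats; lane 0 writes the result.
// Relies on warp_reduce_sum() and WARP_SIZE from the removed header.
#include "common.cuh"

static __global__ void sum_per_warp(const float * x, float * y, const int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    float v = i < n ? x[i] : 0.0f;
    v = warp_reduce_sum(v);                  // after the butterfly, all 32 lanes hold the warp sum
    if (threadIdx.x % WARP_SIZE == 0 && i < n) {
        y[i / WARP_SIZE] = v;
    }
}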
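ggml_cuda_pool_alloc<T> is an RAII wrapper around a ggml_cuda_pool: it borrows a buffer sized in elements and returns it to the pool on destruction. A usage sketch, with a hypothetical helper function; ctx stands for an existing ggml_backend_cuda_context.

// Hypothetical helper: borrow a temporary device buffer from the context's pool.
// The allocation is handed back to the pool when tmp goes out of scope.
static void use_temp_buffer(ggml_backend_cuda_context & ctx, size_t n) {
    ggml_cuda_pool_alloc<float> tmp(ctx.pool(), n);   // size is in elements, not bytes
    float * tmp_d = tmp.get();
    CUDA_CHECK(cudaMemsetAsync(tmp_d, 0, n * sizeof(float), ctx.stream()));
}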