Diffstat (limited to 'ggml/src/ggml-cuda/common.cuh')
-rw-r--r--  ggml/src/ggml-cuda/common.cuh  56
1 file changed, 56 insertions(+), 0 deletions(-)
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 2eba527f..0a7f7f83 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -46,10 +46,14 @@
#define CC_VOLTA 700
#define CC_TURING 750
#define CC_AMPERE 800
+#define CC_ADA_LOVELACE 890
#define CC_OFFSET_AMD 0x1000000 // AMD ccs are encoded above this offset; must stay above CC_OFFSET_MTHREADS for the vendor checks below
+#define CC_OFFSET_MTHREADS 0x0100000 // MThreads ccs sit between the NVIDIA and AMD ranges
#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
+#define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < CC_OFFSET_MTHREADS)
+#define GGML_CUDA_CC_IS_AMD(cc) (cc >= CC_OFFSET_AMD)
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
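
(Illustration, not part of the patch: the three vendor ranges nest as NVIDIA < CC_OFFSET_MTHREADS <= MThreads < CC_OFFSET_AMD <= AMD, which is what the two classification macros rely on. A few compile-time checks, assuming the macros above are in scope:

static_assert( GGML_CUDA_CC_IS_NVIDIA(CC_TURING),       "750 is a plain NVIDIA SM value");
static_assert( GGML_CUDA_CC_IS_NVIDIA(CC_ADA_LOVELACE), "890 is a plain NVIDIA SM value");
static_assert( GGML_CUDA_CC_IS_AMD(CC_RDNA2),           "AMD ccs are encoded above CC_OFFSET_AMD");
static_assert(!GGML_CUDA_CC_IS_NVIDIA(CC_RDNA2),        "... and therefore above the MThreads cutoff");
)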
@@ -134,6 +138,49 @@ typedef float2 dfloat2;
#define INT8_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+#define CP_ASYNC_AVAILABLE
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+
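(CP_ASYNC_AVAILABLE advertises compile-time support for Ampere's cp.async PTX instructions, which copy data from global to shared memory asynchronously so the copy can overlap with computation. A minimal sketch of how a kernel could gate on the macro; the helper name copy_16B is hypothetical and the fallback path is illustrative, not code from this file:

static __device__ __forceinline__ void copy_16B(void * dst_shared, const void * src_global) {
#ifdef CP_ASYNC_AVAILABLE
    // Ampere+: issue an asynchronous 16-byte global->shared copy.
    const unsigned dst = (unsigned) __cvta_generic_to_shared(dst_shared);
    asm volatile("cp.async.ca.shared.global [%0], [%1], 16;" :: "r"(dst), "l"(src_global));
#else
    // Older architectures: plain synchronous 16-byte copy through registers.
    *(int4 *) dst_shared = *(const int4 *) src_global;
#endif // CP_ASYNC_AVAILABLE
}

On the cp.async path the caller still has to issue a cp.async.wait_all, or a commit_group/wait_group pair, before reading the shared-memory destination.)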
+#ifdef __CUDA_ARCH_LIST__
+// Base case: arch matched none of the entries in __CUDA_ARCH_LIST__.
+constexpr bool ggml_cuda_has_arch_impl(int) {
+    return false;
+}
+
+// Recursively compare arch against each entry of __CUDA_ARCH_LIST__.
+template<class ... Archs>
+constexpr bool ggml_cuda_has_arch_impl(const int arch, const int first, Archs... rest) {
+    return arch == first || ggml_cuda_has_arch_impl(arch, rest...);
+}
+
+// True iff a kernel binary was compiled for exactly this arch.
+constexpr bool ggml_cuda_has_arch(const int arch) {
+    return ggml_cuda_has_arch_impl(arch, __CUDA_ARCH_LIST__);
+}
+
+// Base case: the list is exhausted; cur holds the best candidate (0 = none found).
+constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur) {
+    if (cur == 0) {
+        GGML_ABORT("ggml was not compiled with any CUDA arch <= %d", arch);
+    }
+    return cur;
+}
+
+// Keep the largest list entry that does not exceed arch.
+template<class ... Archs>
+constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur, const int first, Archs... rest) {
+    if (first <= arch && first > cur) {
+        return ggml_cuda_highest_compiled_arch_impl(arch, first, rest...);
+    } else {
+        return ggml_cuda_highest_compiled_arch_impl(arch, cur, rest...);
+    }
+}
+
+// Highest arch in __CUDA_ARCH_LIST__ that a device of the given arch can run.
+constexpr int ggml_cuda_highest_compiled_arch(const int arch) {
+    return ggml_cuda_highest_compiled_arch_impl(arch, 0, __CUDA_ARCH_LIST__);
+}
+#else
+// Without __CUDA_ARCH_LIST__, assume the device's own arch was compiled for.
+static int ggml_cuda_highest_compiled_arch(const int arch) {
+    return arch;
+}
+#endif // __CUDA_ARCH_LIST__
+
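(For the block above: nvcc defines __CUDA_ARCH_LIST__ as a comma-separated list of all architectures a build compiles device code for, so the parameter packs walk that list at compile time. A worked example under the assumption that __CUDA_ARCH_LIST__ expands to 600,700,750:

// Assumption for illustration: __CUDA_ARCH_LIST__ == 600,700,750.
static_assert( ggml_cuda_has_arch(750), "an SM 7.5 binary was compiled");
static_assert(!ggml_cuda_has_arch(800), "no SM 8.0 binary was compiled");
// An SM 8.9 device runs the best binary actually present, here SM 7.5:
static_assert(ggml_cuda_highest_compiled_arch(890) == 750, "");

ggml_cuda_highest_compiled_arch(890) never reaches the GGML_ABORT branch here, so it remains a valid constant expression.)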
static constexpr bool fast_fp16_available(const int cc) {
return cc >= CC_PASCAL && cc != 610;
}
@@ -146,6 +193,15 @@ static constexpr bool int8_mma_available(const int cc) {
return cc < CC_OFFSET_AMD && cc >= CC_TURING;
}
+// Volta technically had FP16 tensor cores, but they work very differently from those in Turing and later.
+static bool new_mma_available(const int cc) {
+    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= CC_TURING;
+}
+
+static bool cp_async_available(const int cc) {
+    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= CC_AMPERE;
+}
+
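(Note that both helpers clamp the device's real compute capability through ggml_cuda_highest_compiled_arch: a binary built only for SM 7.0 reports the mma path as unavailable even when running on an SM 8.0 card, because no SM 7.5+ kernel image exists in it. A hypothetical host-side dispatch sketch; choose_kernel and the branch contents are illustrative, not from this header:

static void choose_kernel(const int cc) {
    if (new_mma_available(cc) && cp_async_available(cc)) {
        // Ampere+ image present: mma kernel with asynchronous tile loads.
    } else if (new_mma_available(cc)) {
        // Turing-level image: mma kernel with synchronous loads.
    } else {
        // Pre-Turing NVIDIA, AMD, or MThreads: generic fallback.
    }
}
)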
[[noreturn]]
static __device__ void no_device_code(
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {