summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEngininja2 <139037756+Engininja2@users.noreply.github.com>2024-02-27 07:22:45 -0600
committerGitHub <noreply@github.com>2024-02-27 14:22:45 +0100
commitc24a2a6e6005e5d424301525a42ba45a4a362d30 (patch)
treec7cd06817e62628b667a660f4d9014f4bbc96d4a
parent1f30b7a9f1b86baa455072d3182b9ebeee0cd845 (diff)
cuda : replace remaining shfl_xor with calls to warp_reduce functions (#5744)
-rw-r--r--ggml-cuda.cu73
1 files changed, 24 insertions, 49 deletions
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 964fb735..caef65de 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -696,18 +696,20 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
return a;
}
-//static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
-//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-//#pragma unroll
-// for (int mask = 16; mask > 0; mask >>= 1) {
-// a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
-// }
-// return a;
-//#else
-// (void) a;
-// NO_DEVICE_CODE;
-//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-//}
+#ifdef GGML_CUDA_F16
+static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+ }
+ return a;
+#else
+ (void) a;
+ NO_DEVICE_CODE;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+}
+#endif // GGML_CUDA_F16
static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
@@ -2521,10 +2523,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
#endif
// sum up partial sums and write back result
-#pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
- }
+ tmp = warp_reduce_sum(tmp);
if (threadIdx.x == 0) {
dst[row] = tmp;
@@ -2625,10 +2624,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
#endif
// sum up partial sums and write back result
-#pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
- }
+ tmp = warp_reduce_sum(tmp);
if (threadIdx.x == 0) {
dst[row] = tmp;
@@ -2761,10 +2757,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
#endif
// sum up partial sums and write back result
-#pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
- }
+ tmp = warp_reduce_sum(tmp);
if (tid == 0) {
dst[row] = tmp;
@@ -2877,10 +2870,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
#endif
// sum up partial sums and write back result
-#pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
- }
+ tmp = warp_reduce_sum(tmp);
if (threadIdx.x == 0) {
dst[row] = tmp;
@@ -2987,10 +2977,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
#endif
// sum up partial sums and write back result
-#pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
- }
+ tmp = warp_reduce_sum(tmp);
if (tid == 0) {
dst[row] = tmp;
@@ -3025,11 +3012,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
float amax = fabsf(xi);
float sum = xi;
-#pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
- sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
- }
+ amax = warp_reduce_max(amax);
+ sum = warp_reduce_sum(sum);
const float d = amax / 127;
const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
@@ -6222,10 +6206,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
}
// sum up partial sums and write back result
-#pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
- }
+ tmp = warp_reduce_sum(tmp);
if (tid == 0) {
#ifdef GGML_CUDA_F16
@@ -6275,10 +6256,7 @@ static __global__ void mul_mat_p021_f16_f32(
const int idst = channel*nrows_dst + row_dst;
// sum up partial sums and write back result
-#pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
- }
+ tmp = warp_reduce_sum(tmp);
if (threadIdx.x == 0) {
dst[idst] = tmp;
@@ -6321,10 +6299,7 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
}
// sum up partial sums and write back result
-#pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
- }
+ tmp = warp_reduce_sum(tmp);
if (threadIdx.x == 0) {
dst[idst] = tmp;