author | Kawrakow <48489457+ikawrakow@users.noreply.github.com> | 2024-01-15 07:48:06 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-01-15 07:48:06 +0200 |
commit | 4a3156de2fac9a8ee4279de7804d4e352dcfe121 (patch) | |
tree | e927e6fb6769d3447590c2f92bc34d432b83a858 | |
parent | a836c8f534ab789b02da149fbdaf7735500bff74 (diff) |
CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
-rw-r--r-- | ggml-cuda.cu | 77 |
1 file changed, 73 insertions, 4 deletions
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index bd3814c7..a870718a 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1105,6 +1105,61 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
 #endif // GGML_CUDA_F16
 }
 
+template<typename dst_t>
+static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
+
+    const int i = blockIdx.x;
+
+    // assume 32 threads
+    const int tid = threadIdx.x;
+    const int il = tid/8;
+    const int ir = tid%8;
+    const int ib = 8*i + ir;
+    if (ib >= nb32) {
+        return;
+    }
+
+    dst_t * y = yy + 256*i + 32*ir + 4*il;
+
+    const block_q4_0 * x = (const block_q4_0 *)vx + ib;
+    const float d = __half2float(x->d);
+    const float dm = -8*d;
+
+    const uint8_t * q = x->qs + 4*il;
+
+    for (int l = 0; l < 4; ++l) {
+        y[l+ 0] = d * (q[l] & 0xF) + dm;
+        y[l+16] = d * (q[l] >>  4) + dm;
+    }
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
+
+    const int i = blockIdx.x;
+
+    // assume 32 threads
+    const int tid = threadIdx.x;
+    const int il = tid/8;
+    const int ir = tid%8;
+    const int ib = 8*i + ir;
+    if (ib >= nb32) {
+        return;
+    }
+
+    dst_t * y = yy + 256*i + 32*ir + 4*il;
+
+    const block_q4_1 * x = (const block_q4_1 *)vx + ib;
+    const float2 d = __half22float2(x->dm);
+
+    const uint8_t * q = x->qs + 4*il;
+
+    for (int l = 0; l < 4; ++l) {
+        y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
+        y[l+16] = d.x * (q[l] >>  4) + d.y;
+    }
+}
+
 //================================== k-quants
 
 template<typename dst_t>
@@ -6254,6 +6309,20 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
 }
 
 template<typename dst_t>
+static void dequantize_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb32 = k / 32;
+    const int nb = (k + 255) / 256;
+    dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
+}
+
+template<typename dst_t>
+static void dequantize_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb32 = k / 32;
+    const int nb = (k + 255) / 256;
+    dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
+}
+
+template<typename dst_t>
 static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
@@ -6301,9 +6370,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
     int id;
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+            return dequantize_q4_0_cuda;
         case GGML_TYPE_Q4_1:
-            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+            return dequantize_q4_1_cuda;
         case GGML_TYPE_Q5_0:
             return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:
@@ -6338,9 +6407,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
 static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+            return dequantize_q4_0_cuda;
         case GGML_TYPE_Q4_1:
-            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+            return dequantize_q4_1_cuda;
         case GGML_TYPE_Q5_0:
             return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:
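
For readers skimming the patch, the per-block arithmetic the two new kernels implement can be written out on the host. The following is a minimal CPU reference sketch, not part of the commit: the block layouts and formulas (d*q - 8*d for Q4_0, d*q + m for Q4_1, low nibbles before high nibbles) are taken from the diff above, while the function names and the choice to pass the scales as already-converted floats are illustrative.

```cpp
// Reference sketch of the dequantization math in the new kernels (not in the patch).
// A Q4_0 block holds one fp16 scale d and 32 packed 4-bit quants; a Q4_1 block holds
// a scale d and a minimum m. Scales arrive here as floats so the sketch does not
// depend on CUDA's __half2float / __half22float2.
#include <cstdint>

// Q4_0: y = d*q - 8*d; low nibbles fill y[0..15], high nibbles fill y[16..31].
static void dequantize_q4_0_block_ref(float d, const uint8_t qs[16], float y[32]) {
    const float dm = -8.0f * d;                 // offset folded into dm, as in the kernel
    for (int j = 0; j < 16; ++j) {
        y[j +  0] = d * (qs[j] & 0xF) + dm;
        y[j + 16] = d * (qs[j] >>  4) + dm;
    }
}

// Q4_1: y = d*q + m, with the same nibble ordering as Q4_0.
static void dequantize_q4_1_block_ref(float d, float m, const uint8_t qs[16], float y[32]) {
    for (int j = 0; j < 16; ++j) {
        y[j +  0] = d * (qs[j] & 0xF) + m;
        y[j + 16] = d * (qs[j] >>  4) + m;
    }
}
```

On the GPU side, each 32-thread block processes 8 such quant blocks (256 output values): threads are split into 8 groups of 4 (ir, il), which is why the launchers compute nb = (k + 255) / 256 for the grid size and the kernels guard with if (ib >= nb32).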