author     Georgi Gerganov <ggerganov@gmail.com>  2023-08-25 11:55:59 +0300
committer  GitHub <noreply@github.com>            2023-08-25 11:55:59 +0300
commit     3f460a2b723c8b936ac29ecfd02f244b3adeba55 (patch)
tree       3159656f14a6646d745d2900452f83f9bc9ebed0 /ggml-cuda.cu
parent     87e3733f24a85d894cc16e1cbdfa1ea1e81a76f3 (diff)
cuda : add RoPE kernel for mode == 2 (NeoX) (#2760)
* cuda : add RoPE kernel for mode == 2 (NeoX)
* falcon : do not offload the embeddings layer
Diffstat (limited to 'ggml-cuda.cu')
-rw-r--r--  ggml-cuda.cu  58
1 file changed, 33 insertions, 25 deletions
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 868b7a7b..3bd1caf2 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -3907,28 +3907,27 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }
 
-// TODO: this implementation is wrong!
-//static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
-//                                     const float p_delta, const int p_delta_rows, const float theta_scale) {
-//    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
-//
-//    if (col >= ncols) {
-//        return;
-//    }
-//
-//    const int row = blockDim.x*blockIdx.x + threadIdx.x;
-//    const int i = row*ncols + col/2;
-//
-//    const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
-//    const float sin_theta = sinf(theta);
-//    const float cos_theta = cosf(theta);
-//
-//    const float x0 = x[i + 0];
-//    const float x1 = x[i + ncols/2];
-//
-//    dst[i + 0] = x0*cos_theta - x1*sin_theta;
-//    dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
-//}
+static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
+                                     const float p_delta, const int p_delta_rows, const float theta_scale) {
+    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+    const int i = row*ncols + col/2;
+
+    const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
+    const float sin_theta = sinf(theta);
+    const float cos_theta = cosf(theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + ncols/2];
+
+    dst[i + 0] = x0*cos_theta - x1*sin_theta;
+    dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+}
 
 static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
@@ -4799,13 +4798,21 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
 
 static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                           const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-    GGML_ASSERT(nrows % 2 == 0);
+    GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
     const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }
 
+static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+                               const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+    const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nrows, num_blocks_x, 1);
+    rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
+}
+
 static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(nrows % 4 == 0);
     const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
@@ -5548,8 +5555,9 @@ inline void ggml_cuda_op_rope(
         const float block_p = max(p - (n_ctx - 2.f), 0.f);
         rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
     } else if (is_neox) {
-        GGML_ASSERT(false && "RoPE NeoX not implemented yet");
-#pragma message("TODO: implement RoPE NeoX for CUDA")
+        GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
+        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
+        rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
     } else {
         const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
         rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
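
Note on the kernel: the NeoX variant differs from the standard rope_f32 kernel in how it pairs elements. rope_f32 rotates adjacent pairs (x[i], x[i + 1]), while rope_neox_f32 rotates element j of a row's first half against element j + ncols/2 of the second half. A minimal single-row CPU sketch of that rotation, useful as a reference when checking the kernel output (the helper name rope_neox_ref is made up for illustration and is not part of this commit):

#include <math.h>

// CPU reference for one row of the NeoX-style RoPE rotation (sketch only).
// Matches rope_neox_f32 above for row == 0: theta uses exponent j == col/2,
// and x[j] is rotated against x[j + ncols/2] (first half vs. second half).
static void rope_neox_ref(const float * x, float * dst, const int ncols,
                          const float p0, const float theta_scale) {
    for (int j = 0; j < ncols/2; j++) {
        const float theta     = p0*powf(theta_scale, (float) j);
        const float sin_theta = sinf(theta);
        const float cos_theta = cosf(theta);

        const float x0 = x[j];
        const float x1 = x[j + ncols/2];

        dst[j]           = x0*cos_theta - x1*sin_theta;
        dst[j + ncols/2] = x0*sin_theta + x1*cos_theta;
    }
}

On the device side, rope_neox_f32_cuda reuses the launch geometry of rope_f32_cuda: blockIdx.x selects the row, the columns are covered by num_blocks_x blocks of 2*CUDA_ROPE_BLOCK_SIZE threads along y, and each thread rotates exactly one (x0, x1) pair.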