diff options
author | Kawrakow <iwankawrakow@gmail.com> | 2024-09-29 09:03:52 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-09-29 09:03:52 +0300 |
commit | fd20638bbcb4b1ba69783312bb78545fa418d3f2 (patch) | |
tree | b9ff7dee2adeb73da4a77a7b55573efa7ccae931 /ggml | |
parent | 1b789c983ac34679de67174662b25fb1227cebf2 (diff) |
Allow bf16 kv-cache (#69)
On the CPU I get the exact same PPL with and without FA
using bf16 for kv-cache. But on CUDA the bf16 kv-cache
result is about the same as the fp16 kv-cache CPU result,
so I'm missing some conversion somewhere.
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml')
-rw-r--r-- | ggml/src/ggml-cuda.cu | 3 | ||||
-rw-r--r-- | ggml/src/ggml-cuda/cpy.cu | 21 |
2 files changed, 24 insertions, 0 deletions
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 3d24cc6f..617dd58f 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -2862,6 +2862,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) { return true; } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_BF16) { + return true; + } if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) { return true; } diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index aad34bfe..6ec3b5f3 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -16,6 +16,13 @@ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) { *dsti = __float2half(*xi); } +static __device__ void cpy_1_f32_bf16(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + nv_bfloat16 * dsti = (nv_bfloat16 *) cdsti; + + *dsti = __float2bfloat16(*xi); +} + static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) { const half * xi = (const half *) cxi; half * dsti = (half *) cdsti; @@ -318,6 +325,16 @@ static void ggml_cpy_f32_f16_cuda( (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } +static void ggml_cpy_f32_bf16_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + cpy_f32_f16<cpy_1_f32_bf16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>> + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); +} + static void ggml_cpy_f32_q8_0_cuda( const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, @@ -432,6 +449,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) { + ggml_cpy_f32_bf16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { @@ -465,6 +484,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) { return (void*) cpy_f32_f16<cpy_1_f32_f32>; } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { return (void*) cpy_f32_f16<cpy_1_f32_f16>; + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) { + return (void*) cpy_f32_f16<cpy_1_f32_bf16>; } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>; } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { |