diff options
author | Andrew Godfrey <AndrewGodfrey@users.noreply.github.com> | 2023-11-01 04:49:04 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-11-01 13:49:04 +0200 |
commit | 73bdcb395ef9a997d9c02950c7cd4249546162cd (patch) | |
tree | 9cace5e626d13541dda1798fbee2d74b57874952 /ggml-cuda.cu | |
parent | f0e209324a7f663225791897877bf610f1af152d (diff) |
finetune : add -ngl parameter (#3762)
* Add '-ngl' support to finetune.cpp
* Add fprintf in ggml_cuda_op_add
When I tried CUDA offloading during finetuning following the readme, I got an assert here.
This probably isn't an important case because inference later gives a warning saying you should use f16 or f32 instead when using lora
* Add 'finetune.sh', which currently fails when using GPU
"error: operator (): Finetuning on tensors with type 'f16' is not yet supported"
* tweak finetune.sh
* Suppress some warnings in ggml.c
* Add f16 implementation to ggml_compute_forward_add_f16_f32
* Add an f16 case to ggml_add_cast_impl and llama_build_lora_finetune_graphs
* finetune.sh: Edit comments
* Add "add_f16_f32_f32_cuda"
* Tweak an error message
* finetune.sh: Add an optional LLAMA_MODEL_DIR variable
* finetune.sh: Add an optional LLAMA_TRAINING_DIR variable
* train : minor
* tabs to spaces
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Diffstat (limited to 'ggml-cuda.cu')
-rw-r--r-- | ggml-cuda.cu | 17 |
1 files changed, 17 insertions, 0 deletions
diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 1ba951f6..4e6e7cd9 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -513,6 +513,15 @@ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * d dst[i] = __hadd(x[i], __float2half(y[i])); } +static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = __half2float(x[i]) + y[i]; +} + static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -4693,6 +4702,11 @@ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, co add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k); } +static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; + add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k); +} + static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) { const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE; mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky); @@ -5996,7 +6010,10 @@ inline void ggml_cuda_op_add( add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream); } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { + add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream); } else { + fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type); GGML_ASSERT(false); } |