summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: slaren <slarengh@gmail.com> 2024-02-19 09:04:45 +0100
committer: GitHub <noreply@github.com> 2024-02-19 10:04:45 +0200
commit: 3a9cb4ca6408c29423373dd6cd7aa78a58286c00 (patch)
tree: 86dd59d920f80e2ffc657639099e4c228063c04b
parent: 769a716e30ba1da46f709df1c00727d6869d30e7 (diff)
cuda, metal : fix nans in soft_max (#5574)
* cuda : fix nans in soft_max

* metal : fix nans in soft_max

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
-rw-r--r-- ggml-cuda.cu 8
-rw-r--r-- ggml-metal.metal 8
2 files changed, 8 insertions, 8 deletions
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 933ebbc4..eef21350 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -6205,7 +6205,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
const int ix = rowx*ncols + col;
const int iy = rowy*ncols + col;
- const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + slope*pos[col];
+ const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
vals[col] = val;
max_val = max(max_val, val);
@@ -9170,17 +9170,17 @@ static void ggml_cuda_op_soft_max(
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
// positions tensor
- float * src2_dd = dst_dd; // default to avoid null checks in the kernel
+ float * src2_dd = nullptr;
cuda_pool_alloc<float> src2_f;
ggml_tensor * src2 = dst->src[2];
const bool use_src2 = src2 != nullptr;
if (use_src2) {
- const bool src2_on_device = use_src2 && src2->backend == GGML_BACKEND_GPU;
- ggml_tensor_extra_gpu * src2_extra = use_src2 ? (ggml_tensor_extra_gpu *) src2->extra : nullptr;
+ const bool src2_on_device = src2->backend == GGML_BACKEND_GPU;
if (src2_on_device) {
+ ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
src2_dd = (float *) src2_extra->data_device[g_main_device];
} else {
src2_dd = src2_f.alloc(ggml_nelements(src2));
diff --git a/ggml-metal.metal b/ggml-metal.metal
index d0a85a19..f0d77d44 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -392,7 +392,7 @@ kernel void kernel_soft_max(
float lmax = -INFINITY;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
- lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
+ lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
}
// find the max value in the block
@@ -417,7 +417,7 @@ kernel void kernel_soft_max(
// parallel sum
float lsum = 0.0f;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
- const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
+ const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
lsum += exp_psrc0;
pdst[i00] = exp_psrc0;
}
@@ -495,7 +495,7 @@ kernel void kernel_soft_max_4(
float4 lmax4 = -INFINITY;
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
- lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
+ lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
}
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
@@ -521,7 +521,7 @@ kernel void kernel_soft_max_4(
// parallel sum
float4 lsum4 = 0.0f;
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
- const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
+ const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
lsum4 += exp_psrc4;
pdst4[i00] = exp_psrc4;
}