path: root/ggml/src/ggml.c
author    Kawrakow <48489457+ikawrakow@users.noreply.github.com>  2024-07-27 08:44:18 +0200
committer GitHub <noreply@github.com>  2024-07-27 08:44:18 +0200
commit    f62615b44f7df586cb58ed9fffca59b96820117b (patch)
tree      422a2b063fd1ba3ef9090c701f4980359d7a4a18 /ggml/src/ggml.c
parent    154e0d75fccf1784fe9ff6fd76a630b66563da3d (diff)
Simdify and multi-thread tanh (#4)
It seemed Gemma-2 performance was lower than expected for its size. Looking at the architecture, I noticed that tanh is used in each layer, and again at the end for soft-capping the final output. ggml computed tanh with a single thread, and since tanhf(x) is a fairly expensive operation, a significant fraction of the run time was spent in the tanh operation.

After multi-threading ggml_compute_forward_tanh_f32 and SIMD-ifying the tanh computation, I observe a 33% gain in prompt processing speed (!!!). TG (token generation) is of course memory bound, but despite this we still get a ~2% boost at 4 threads (which gives max TG performance on my Ryzen-7950X).

SIMD-ifying: we have tanh(x) = (exp(2*x) - 1) / (exp(2*x) + 1), so we can simply reuse Justine Tunney's SIMD exp implementation (ggml_v_expf).

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
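To make the identity concrete, here is a minimal scalar sketch (not part of the commit); tanh_via_exp is a hypothetical helper name, and the SIMD kernels in the diff below perform the same computation with ggml_v_expf in place of expf:

#include <math.h>
#include <stdio.h>

/* Scalar sketch of the rewrite the SIMD kernels use:
 *   tanh(x) = (exp(2*x) - 1) / (exp(2*x) + 1)
 * Hypothetical helper; the vector versions compute exp(2*x)
 * with ggml_v_expf instead of expf. */
static float tanh_via_exp(float x) {
    const float e = expf(2.0f * x);
    return (e - 1.0f) / (e + 1.0f);
}

int main(void) {
    for (float x = -4.0f; x <= 4.0f; x += 1.0f) {
        printf("x = %+5.1f  tanhf = %+.6f  via exp = %+.6f\n",
               x, tanhf(x), tanh_via_exp(x));
    }
    return 0;
}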
Diffstat (limited to 'ggml/src/ggml.c')
 -rw-r--r--  ggml/src/ggml.c | 63
 1 file changed, 57 insertions(+), 6 deletions(-)
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 95a1fc7d..c3cda4c4 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -2265,7 +2265,7 @@ inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) {
inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
-inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
+//inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
@@ -2389,6 +2389,13 @@ inline static float32x4_t ggml_v_silu(float32x4_t x) {
return vdivq_f32(x, one_plus_exp_neg_x);
}
+inline static float32x4_t ggml_v_tanh(float32x4_t x) {
+ const float32x4_t one = vdupq_n_f32(1.0f);
+ const float32x4_t two_x = vmulq_f32(x, vdupq_n_f32(2.f));
+ const float32x4_t exp_two_x = ggml_v_expf(two_x);
+ return vdivq_f32(vsubq_f32(exp_two_x, one), vaddq_f32(exp_two_x, one));
+}
+
#elif defined(__AVX512F__) && defined(__AVX512DQ__)
// adapted from arm limited optimized routine
@@ -2432,6 +2439,12 @@ inline static __m512 ggml_v_silu(__m512 x) {
return _mm512_div_ps(x, one_plus_exp_neg_x);
}
+inline static __m512 ggml_v_tanh(__m512 x) {
+ const __m512 one = _mm512_set1_ps(1.0f);
+ const __m512 exp_two_x = ggml_v_expf(_mm512_mul_ps(x, _mm512_set1_ps(2.f)));
+ return _mm512_div_ps(_mm512_sub_ps(exp_two_x, one), _mm512_add_ps(exp_two_x, one));
+}
+
#elif defined(__AVX2__) && defined(__FMA__)
// adapted from arm limited optimized routine
@@ -2487,6 +2500,12 @@ inline static __m256 ggml_v_silu(__m256 x) {
return _mm256_div_ps(x, one_plus_exp_neg_x);
}
+inline static __m256 ggml_v_tanh(__m256 x) {
+ const __m256 one = _mm256_set1_ps(1.0f);
+ const __m256 exp_two_x = ggml_v_expf(_mm256_mul_ps(x, _mm256_set1_ps(2.f)));
+ return _mm256_div_ps(_mm256_sub_ps(exp_two_x, one), _mm256_add_ps(exp_two_x, one));
+}
+
#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
#if defined(__FMA__)
@@ -2541,6 +2560,12 @@ inline static __m128 ggml_v_silu(__m128 x) {
return _mm_div_ps(x, one_plus_exp_neg_x);
}
+inline static __m128 ggml_v_tanh(__m128 x) {
+ const __m128 one = _mm_set1_ps(1.0f);
+ const __m128 exp_two_x = ggml_v_expf(_mm_mul_ps(x, _mm_set1_ps(2.f)));
+ return _mm_div_ps(_mm_sub_ps(exp_two_x, one), _mm_add_ps(exp_two_x, one));
+}
+
#endif // __ARM_NEON / __AVX2__ / __SSE2__
static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
@@ -2567,6 +2592,30 @@ static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
}
}
+static void ggml_vec_tanh_f32(const int n, float * y, const float * x) {
+ int i = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+ for (; i + 15 < n; i += 16) {
+ _mm512_storeu_ps(y + i, ggml_v_tanh(_mm512_loadu_ps(x + i)));
+ }
+#elif defined(__AVX2__) && defined(__FMA__)
+ for (; i + 7 < n; i += 8) {
+ _mm256_storeu_ps(y + i, ggml_v_tanh(_mm256_loadu_ps(x + i)));
+ }
+#elif defined(__SSE2__)
+ for (; i + 3 < n; i += 4) {
+ _mm_storeu_ps(y + i, ggml_v_tanh(_mm_loadu_ps(x + i)));
+ }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+ for (; i + 3 < n; i += 4) {
+ vst1q_f32(y + i, ggml_v_tanh(vld1q_f32(x + i)));
+ }
+#endif
+ for (; i < n; ++i) {
+ y[i] = tanhf(x[i]);
+ }
+}
+
static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
int i = 0;
ggml_float sum = 0;
@@ -11204,9 +11253,8 @@ static void ggml_compute_forward_tanh_f32(
const struct ggml_tensor * src0 = dst->src[0];
- if (params->ith != 0) {
- return;
- }
+ const int ith = params->ith;
+ const int nth = params->nth;
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
@@ -11215,7 +11263,7 @@ static void ggml_compute_forward_tanh_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
- for (int i = 0; i < n; i++) {
+ for (int i = ith; i < n; i += nth) {
ggml_vec_tanh_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
@@ -18590,7 +18638,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_UNARY_OP_SGN:
case GGML_UNARY_OP_NEG:
case GGML_UNARY_OP_STEP:
- case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_ELU:
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_SIGMOID:
@@ -18606,6 +18653,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
{
n_tasks = n_threads;
} break;
+ case GGML_UNARY_OP_TANH:
+ {
+ n_tasks = MIN(ggml_nrows(node), n_threads);
+ } break;
default:
GGML_ASSERT(false);
}
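The threading change above splits rows round-robin across workers: thread ith of nth handles rows ith, ith + nth, ith + 2*nth, and so on. Here is a minimal standalone sketch of that pattern, with the assumption that process_rows is a hypothetical stand-in for the per-row ggml_vec_tanh_f32 call, and that the nth worker invocations are simulated sequentially rather than run on real threads:

#include <math.h>
#include <stdio.h>

/* Hypothetical stand-in for ggml_compute_forward_tanh_f32's row loop:
 * worker ith of nth handles rows ith, ith + nth, ith + 2*nth, ... */
static void process_rows(float * dst, const float * src,
                         int nrows, int ncols, int ith, int nth) {
    for (int i = ith; i < nrows; i += nth) {
        for (int c = 0; c < ncols; ++c) {
            dst[i*ncols + c] = tanhf(src[i*ncols + c]);
        }
    }
}

int main(void) {
    enum { NR = 4, NC = 3, NTH = 2 };
    float src[NR*NC], dst[NR*NC];
    for (int i = 0; i < NR*NC; ++i) src[i] = 0.1f * (float) i;
    /* In ggml each worker runs on its own thread; here we call them in turn. */
    for (int ith = 0; ith < NTH; ++ith) {
        process_rows(dst, src, NR, NC, ith, NTH);
    }
    for (int i = 0; i < NR*NC; ++i) {
        printf("%.4f%c", dst[i], (i + 1) % NC ? ' ' : '\n');
    }
    return 0;
}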