author    Iwan Kawrakow <iwan.kawrakow@gmail.com>  2024-07-18 11:39:32 +0300
committer Iwan Kawrakow <iwan.kawrakow@gmail.com>  2024-07-18 11:39:32 +0300
commit    744eb9ffa955fa3557cc835995e45448c3c06bcb (patch)
tree      b4e7e894597d6486d866b1814d576236f694d999
parent    6a132862fd3826d241c0c6f43e5f91450626eeb2 (diff)
iqk_mul_mat(float): make it work for row sizes that are a multiple of 4 on AVX2
I was trying to understand where the Bitnet bottleneck is, and at some point noticed the Q*K matrix multiplication where Q and K have shape 100 x n_token x 32 x 1. The existing iqk_mul_mat for floats requires the row size to be a multiple of the SIMD vector size (so, 16 on the Ryzen-7950X, 8 on the Ryzen-5975), and hence this matrix multiplication was being done by ggml. Changing the iqk_mul_mat float kernel to handle row sizes that are a multiple of 4 (via __m128 for the last values in a row) resulted in nearly a 20% performance boost for PP-512 and ~3% for TG-128! If I go to a context of 2048, PP performance increases by nearly 70%!
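As a rough illustration of the idea behind the change (a standalone sketch, not code from this commit; the helper name dot_4tail is made up), an AVX2 dot product over a row whose length is a multiple of 4 but not necessarily of 8 can zero-pad the 4-float remainder into the low lane of a __m256, which is essentially what load4Floats does in the diff below:

// Standalone sketch (hypothetical, not from this commit); requires AVX2 + FMA,
// e.g. compile with -mavx2 -mfma.
#include <immintrin.h>
#include <cassert>

// Dot product of two float rows whose length n is a multiple of 4,
// but not necessarily a multiple of 8 (the __m256 width).
static float dot_4tail(const float * x, const float * y, int n) {
    assert(n % 4 == 0);
    __m256 acc = _mm256_setzero_ps();
    int i = 0;
    // Main loop: full 8-float steps.
    for (; i + 8 <= n; i += 8) {
        acc = _mm256_fmadd_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i), acc);
    }
    // Tail: at most one 4-float step, zero-padded into the low 128-bit lane
    // so the same 256-bit accumulator can be reused.
    for (; i < n; i += 4) {
        __m256 xv = _mm256_insertf128_ps(_mm256_setzero_ps(), _mm_loadu_ps(x + i), 0);
        __m256 yv = _mm256_insertf128_ps(_mm256_setzero_ps(), _mm_loadu_ps(y + i), 0);
        acc = _mm256_fmadd_ps(xv, yv, acc);
    }
    // Horizontal sum of the 8 accumulator lanes.
    __m128 s = _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1));
    s = _mm_add_ps(s, _mm_movehl_ps(s, s));
    s = _mm_add_ss(s, _mm_movehdup_ps(s));
    return _mm_cvtss_f32(s);
}

The patched kernel applies the same trick per tile of rows: the main loop still walks k_step-sized chunks, and the added loop consumes any remaining groups of 4 floats via load_tail/load4Floats.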
 iqk_mul_mat.cpp | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)
diff --git a/iqk_mul_mat.cpp b/iqk_mul_mat.cpp
index ade0add8..c902af14 100644
--- a/iqk_mul_mat.cpp
+++ b/iqk_mul_mat.cpp
@@ -2479,6 +2479,10 @@ struct QFBase {
     static inline float hsum(Acc acc) {
         return _mm512_reduce_add_ps(acc);
     }
+    template <typename Float>
+    static inline Data load4Floats(const Float * x) {
+        return _mm512_insertf32x4(_mm512_setzero_ps(), load128(x), 0);
+    }
 #else
     constexpr static int k_step = 8;
     using Data = __m256;
@@ -2494,7 +2498,13 @@ struct QFBase {
     static inline float hsum(Acc acc) {
         return hsum_float_8(acc);
     }
+    template <typename Float>
+    static inline Data load4Floats(const Float * x) {
+        return _mm256_insertf128_ps(_mm256_setzero_ps(), load128(x), 0);
+    }
 #endif
+    static inline __m128 load128(const ggml_half * x) { return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)x)); }
+    static inline __m128 load128(const float * x) { return _mm_loadu_ps(x); }
 };
 template <typename Float, int nrc_in> struct QFT final : public QFBase {
     constexpr static int nrc = nrc_in;
@@ -2505,6 +2515,7 @@ template <typename Float, int nrc_in> struct QFT final : public QFBase {
         for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)(cx + iy*bx);
     }
     IQK_ALWAYS_INLINE Data load1(int iy, int i) const { return load(y[iy] + k_step*i); }
+    IQK_ALWAYS_INLINE Data load_tail(int iy, int i) const { return load4Floats(y[iy] + 4*i); }
     const Float * y[nrc];
 };
@@ -2512,6 +2523,7 @@ template <typename Qy, typename Qx>
 IQK_NOINLINE void mul_mat_Qx_Qy_MxN(int n, const char * cx, size_t bx, int ix0, const DataInfo& info) {
     assert(n%QFBase::k_step == 0);
     int nb = n/QFBase::k_step;
+    int nb4 = n/4;
     Qy y(info);
     Qx x(cx + ix0*bx, bx);
     QFBase::Data xv[Qx::nrc];
@@ -2536,6 +2548,17 @@ IQK_NOINLINE void mul_mat_Qx_Qy_MxN(int n, const char * cx, size_t bx, int ix0,
             for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]);
         }
     }
+    for (int i = (QFBase::k_step/4)*nb; i < nb4; ++i) {
+        yv = y.load_tail(0, i);
+        for (int ix = 0; ix < Qx::nrc; ++ix) {
+            xv[ix] = x.load_tail(ix, i);
+            acc[ix] = QFBase::acc(acc[ix], yv, xv[ix]);
+        }
+        for (int iy = 1; iy < Qy::nrc; ++iy) {
+            yv = y.load_tail(iy, i);
+            for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]);
+        }
+    }
     for (int iy = 0; iy < Qy::nrc; ++iy) for (int ix = 0; ix < Qx::nrc; ++ix) info.store(ix0+ix, iy, QFBase::hsum(acc[Qx::nrc*iy+ix]));
 }
@@ -2725,7 +2748,7 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
     (void)Ny;
     if (typeA == GGML_TYPE_F16 || typeA == GGML_TYPE_F32) {
-        if (ne00 % QFBase::k_step) return false;
+        if (ne00 % 4) return false;
     }
     if (typeA == GGML_TYPE_F16) {
         switch (typeB) {