From b2acd81c753a098ad8dfb7acf0daf8aebf0ee79a Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Mon, 10 Jun 2024 12:25:27 +0300
Subject: iqk_mul_mat: turn on AVX512

It makes no difference on my Ryzen-7950X, but perhaps it will be
beneficial for CPUs with real AVX512.
---
 iqk_mul_mat.cpp | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

(limited to 'iqk_mul_mat.cpp')

diff --git a/iqk_mul_mat.cpp b/iqk_mul_mat.cpp
index 258d77aa..8f0b9816 100644
--- a/iqk_mul_mat.cpp
+++ b/iqk_mul_mat.cpp
@@ -94,7 +94,6 @@ typedef void (*mul_mat_t)(int n, const void * vx, size_t bx, const DataInfo& inf
 
 struct MulMat {
     std::array funcs = {};
-    //std::array funcs = {};
     inline void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
 #ifdef __aarch64__
         constexpr int k_x_step = 64; //8192; // Tiling does not seem to help on my M2 Max (but difference to tiling is small)
@@ -2155,6 +2154,22 @@ struct Q5_1_Unpacker final : public Q_Unpacker
 
 struct QF32y final : public QF32Base {
     constexpr static int nrc_y = nrc;
@@ -2188,7 +2204,7 @@ template struct QF32x final : public QF32Base {
 };
 
 template
-IQK_NOINLINE void mul_mat_f16_f32_NxN(int n, const char * cx, size_t bx, int ix0, const DataInfo& info) {
+IQK_NOINLINE void mul_mat_f16_f32_MxN(int n, const char * cx, size_t bx, int ix0, const DataInfo& info) {
     assert(n%QF16Base::k_step == 0);
     int nb = n/QF32Base::k_step;
     QF32y y(info);
@@ -2228,18 +2244,17 @@ void mul_mat_f16_f32_T(int n, const void * vx, size_t bx, const DataInfo& info,
 #endif
     const char * cx = (const char *)vx;
     for (int ix = 0; ix < nrc_x/k_nx; ++ix) {
-        mul_mat_f16_f32_NxN(n, cx, bx, ix*k_nx, info);
+        mul_mat_f16_f32_MxN(n, cx, bx, ix*k_nx, info);
     }
     int last_x = k_nx*(nrc_x/k_nx);
     if (last_x == nrc_x) return;
     int nx = nrc_x - last_x;
     switch (nx) {
-        case 1: mul_mat_f16_f32_NxN(n, cx, bx, last_x, info); break;
-        case 2: mul_mat_f16_f32_NxN(n, cx, bx, last_x, info); break;
-        case 3: mul_mat_f16_f32_NxN(n, cx, bx, last_x, info); break;
-        case 4: mul_mat_f16_f32_NxN(n, cx, bx, last_x, info); break;
-#ifndef __AVX512F__
-        case 5: mul_mat_f16_f32_NxN(n, cx, bx, last_x, info); break;
+        case 1: mul_mat_f16_f32_MxN(n, cx, bx, last_x, info); break;
+#ifdef __AVX512F__
+        case 2: mul_mat_f16_f32_MxN(n, cx, bx, last_x, info); break;
+        case 3: mul_mat_f16_f32_MxN(n, cx, bx, last_x, info); break;
+        case 4: mul_mat_f16_f32_MxN(n, cx, bx, last_x, info); break;
 #endif
     }
 }
@@ -2394,7 +2409,6 @@ bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int
         mm.funcs[2] = mul_mat_f16_f32_T<3>;
         mm.funcs[3] = mul_mat_f16_f32_T<4>;
         mm.funcs[4] = mul_mat_f16_f32_T<5>;
-        mm.funcs[4] = mul_mat_f16_f32_T<5>;
 #ifndef __AVX512F__
         mm.funcs[5] = mul_mat_f16_f32_T<6>;
 #endif
--
cgit v1.2.3
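
The mul_mat_f16_f32_T hunk above processes the x columns in tiles of k_nx and then hands any leftover columns to a switch that picks a kernel instantiated for the exact remainder, with the extra remainder cases only compiled under __AVX512F__. Below is a minimal, self-contained sketch of that pattern under assumed tile widths; kernel, process_columns, and the widths 5/2 are hypothetical stand-ins, not the actual iqk_mul_mat.cpp code.

    // Sketch of the tile-then-dispatch structure of mul_mat_f16_f32_T.
    // The tile widths and helper names are illustrative assumptions only.
    #include <cstdio>

    // Stand-in for mul_mat_f16_f32_MxN: handles exactly nrc_x columns starting at ix0.
    template <int nrc_x>
    static void kernel(int ix0) {
        std::printf("kernel<%d> -> columns [%d, %d)\n", nrc_x, ix0, ix0 + nrc_x);
    }

    static void process_columns(int nrc_x_total) {
    #ifdef __AVX512F__
        constexpr int k_nx = 5;   // wider tile: 32 vector registers available
    #else
        constexpr int k_nx = 2;   // narrower tile for 16 registers (illustrative)
    #endif
        // Full tiles: the kernel sees its width as a compile-time constant.
        for (int ix = 0; ix < nrc_x_total/k_nx; ++ix) kernel<k_nx>(ix*k_nx);

        // Leftover columns: run-time count, compile-time-sized kernel.
        int last_x = k_nx*(nrc_x_total/k_nx);
        if (last_x == nrc_x_total) return;
        switch (nrc_x_total - last_x) {
            case 1: kernel<1>(last_x); break;
    #ifdef __AVX512F__
            case 2: kernel<2>(last_x); break;
            case 3: kernel<3>(last_x); break;
            case 4: kernel<4>(last_x); break;
    #endif
        }
    }

    int main() {
        process_columns(13);  // exercises both the tiled loop and the remainder switch
    }

The point of the compile-time width is that the remainder columns get the same fully unrolled code as the main loop instead of a generic fallback; AVX-512's 32 vector registers are what make the wider tile, and hence the additional remainder cases, worthwhile.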