From 715fc552ad2ea5fad38e7ff856bf84fdb71b692e Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Sat, 26 Apr 2025 08:13:25 +0200 Subject: Add support for Cohere2 (#341) * Add support for Cohere2 * Fixe IQ4_NL on AVX2 * Command-A needs fp32 precision for K*Q --------- Co-authored-by: Iwan Kawrakow --- ggml/src/ggml.c | 2 +- ggml/src/iqk/iqk_mul_mat.cpp | 49 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 48 insertions(+), 3 deletions(-) (limited to 'ggml/src') diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index ad9393cc..88013f74 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1289,7 +1289,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_ref = (ggml_from_float_t)quantize_row_iq4_nl_ref, .vec_dot = ggml_vec_dot_iq4_nl_q8_0, #if GGML_USE_IQK_MULMAT -#if defined __AVX2__ +#if defined HAVE_FANCY_SIMD .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp index 45d804a4..e7ab2e5b 100644 --- a/ggml/src/iqk/iqk_mul_mat.cpp +++ b/ggml/src/iqk/iqk_mul_mat.cpp @@ -1750,6 +1750,15 @@ __m256i inline load_iq4nl_values_256() { return MM256_SET_M128I(val128, val128); } +__m128i inline load_iq4k_values_128() { + return _mm_loadu_si128((const __m128i *)iq4k_values); +} + +__m256i inline load_iq4k_values_256() { + auto val128 = load_iq4k_values_128(); + return MM256_SET_M128I(val128, val128); +} + #ifdef HAVE_FANCY_SIMD //====================================== Zen4 ================================================== @@ -8519,7 +8528,11 @@ struct Q4_0_1_Dequantizer { struct IQ4_NL_Dequantizer { Dequantizer4bit b4; +#ifdef HAVE_FANCY_SIMD const __m256i values = load_iq4nl_values_256(); +#else + const __m256i values = load_iq4k_values_256(); +#endif inline __m256i dequant(const block_iq4_nl * x) const { return _mm256_shuffle_epi8(values, b4.dequant(x->qs)); } @@ -8630,11 +8643,19 @@ struct Q4_0_1_Unpacker final : public Q_Unpacker using Sum4T = Sum4TypeQ82; inline static int block_size() { return QK4_0; } }; +#ifdef HAVE_FANCY_SIMD struct IQ4_NL_Unpacker final : public Q_Unpacker, IQ4_NL_Dequantizer> { IQ4_NL_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} using Sum4T = Sum4TypeQ82; inline static int block_size() { return QK4_NL; } }; +#else +struct IQ4_NL_Unpacker final : public Q_Unpacker { + IQ4_NL_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} + using Sum4T = Sum4TypeQ80; + inline static int block_size() { return QK4_NL; } +}; +#endif struct Q5_0_Unpacker final : public Q_Unpacker { Q5_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} using Sum4T = Sum4TypeQ80; @@ -9155,9 +9176,29 @@ template void MulMat::set_functions(MulMat& m) { m.funcs[6] = mul_mat_qX_1_q8_2_T; m.funcs[7] = mul_mat_qX_1_q8_2_T; } + else if constexpr (std::is_same_v) { +#ifdef HAVE_FANCY_SIMD + m.funcs[0] = mul_mat_qX_1_q8_2_T; + m.funcs[1] = mul_mat_qX_1_q8_2_T; + m.funcs[2] = mul_mat_qX_1_q8_2_T; + m.funcs[3] = mul_mat_qX_1_q8_2_T; + m.funcs[4] = mul_mat_qX_1_q8_2_T; + m.funcs[5] = mul_mat_qX_1_q8_2_T; + m.funcs[6] = mul_mat_qX_1_q8_2_T; + m.funcs[7] = mul_mat_qX_1_q8_2_T; +#else + m.funcs[0] = mul_mat_qX_0_q8_0_T; + m.funcs[1] = mul_mat_qX_0_q8_0_T; + m.funcs[2] = mul_mat_qX_0_q8_0_T; + m.funcs[3] = mul_mat_qX_0_q8_0_T; + m.funcs[4] = mul_mat_qX_0_q8_0_T; + m.funcs[5] = mul_mat_qX_0_q8_0_T; + m.funcs[6] = mul_mat_qX_0_q8_0_T; + m.funcs[7] = mul_mat_qX_0_q8_0_T; +#endif + } else if constexpr (std::is_same_v || std::is_same_v || - std::is_same_v || std::is_same_v || - std::is_same_v) { + std::is_same_v || std::is_same_v) { m.funcs[0] = mul_mat_qX_1_q8_2_T; m.funcs[1] = mul_mat_qX_1_q8_2_T; m.funcs[2] = mul_mat_qX_1_q8_2_T; @@ -9476,7 +9517,11 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) { case GGML_TYPE_IQ4_NL: assert (ne00 % QK4_NL == 0); MulMat::set_functions(mm); +#ifdef HAVE_FANCY_SIMD expected_typeB = GGML_TYPE_Q8_2_X4; +#else + expected_typeB = GGML_TYPE_Q8_0_X4; +#endif break; case GGML_TYPE_IQ4_NL_R4: assert (ne00 % QK4_NL == 0); -- cgit v1.2.3