-rw-r--r--   ggml/include/ggml.h           |   5
-rw-r--r--   ggml/src/ggml-common.h        |  14
-rw-r--r--   ggml/src/ggml.c               |  39
-rw-r--r--   ggml/src/iqk/iqk_mul_mat.cpp  | 477
-rw-r--r--   ggml/src/iqk/iqk_quantize.cpp |  67
-rw-r--r--   ggml/src/iqk/iqk_quantize.h   |   1
6 files changed, 348 insertions, 255 deletions
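
The central data-structure change in this commit is the new GGML_TYPE_Q8_2_X4 activation type: the per-block scale d and the block sum s are stored as raw bf16 bit patterns instead of fp16. Below is a minimal standalone sketch (plain C, independent of the ggml build; the helper name is illustrative) of the block layout added in ggml-common.h and of the bf16-bits-to-float conversion that the AVX2 kernels in iqk_mul_mat.cpp perform with a 16-bit left shift.

#include <stdint.h>
#include <string.h>

#define QK8_2 32

typedef struct {
    uint16_t d;            // block scale, bf16 bits
    uint16_t s;            // d * (sum of quants), bf16 bits on the AVX2 path
    int8_t   qs[QK8_2];    // quants
} block_q8_2;

typedef struct {
    uint16_t d[8];         // 4 scales followed by 4 block sums, bf16 bits
    int8_t   qs[4*QK8_2];
} block_q8_2_x4;

// bf16 is the upper 16 bits of an IEEE-754 binary32 value, so placing the bits
// in the high half of a 32-bit word and reinterpreting recovers the float exactly.
static inline float bf16_bits_to_f32(uint16_t bits) {
    uint32_t u = (uint32_t)bits << 16;
    float f;
    memcpy(&f, &u, sizeof(f));
    return f;
}

This is what GGML_BF16_TO_FP32() does scalar-wise; the SIMD kernels in the diff apply the same shift to eight scales at once.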
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 91219d4a..7cc9100d 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -396,8 +396,9 @@ extern "C" { // GGML_TYPE_I2_S = 36, // - GGML_TYPE_Q8_0_X4 = 98, - GGML_TYPE_Q8_1_X4 = 99, + GGML_TYPE_Q8_0_X4 = 97, + GGML_TYPE_Q8_1_X4 = 98, + GGML_TYPE_Q8_2_X4 = 99, GGML_TYPE_Q6_0 = 133, GGML_TYPE_IQ1_BN = 134, GGML_TYPE_IQ2_BN = 135, diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 4308f0b9..59702e32 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -266,6 +266,20 @@ typedef struct { } block_q8_0x8; static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding"); +#define QK8_2 32 +typedef struct { + uint16_t d; + uint16_t s; + int8_t qs[QK8_2]; // quants +} block_q8_2; +static_assert(sizeof(block_q8_2) == sizeof(ggml_half) + sizeof(int16_t) + QK8_2, "wrong q8_2 block size/padding"); + +typedef struct { + uint16_t d[8]; + int8_t qs[4*QK8_2]; +} block_q8_2_x4; +static_assert(sizeof(block_q8_2_x4) == 4*sizeof(block_q8_2), "wrong q8_2_x4 block size/padding"); + // // Super-block quantization structures // diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 036bd8a8..25694fc7 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -717,7 +717,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_q4_0_q8_0, #if GGML_USE_IQK_MULMAT #if defined __AVX2__ - .vec_dot_type = GGML_TYPE_Q8_1_X4, + .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, #endif @@ -741,7 +741,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_ref = (ggml_from_float_t) quantize_row_q4_1_ref, .vec_dot = ggml_vec_dot_q4_1_q8_1, #if GGML_USE_IQK_MULMAT - .vec_dot_type = GGML_TYPE_Q8_1_X4, + .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_1, #endif @@ -789,7 +789,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_q5_0_q8_0, #if GGML_USE_IQK_MULMAT #if defined __AVX2__ - .vec_dot_type = GGML_TYPE_Q8_1_X4, + .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, #endif @@ -809,7 +809,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_ref = (ggml_from_float_t) quantize_row_q5_1_ref, .vec_dot = ggml_vec_dot_q5_1_q8_1, #if GGML_USE_IQK_MULMAT - .vec_dot_type = GGML_TYPE_Q8_1_X4, + .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_1, #endif @@ -827,7 +827,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_q6_0_q8_0, #if GGML_USE_IQK_MULMAT #if defined __AVX2__ - .vec_dot_type = GGML_TYPE_Q8_1_X4, + .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, #endif @@ -852,7 +852,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { // Remember: we cannot add 128 to the Q8 quants and use iblock sum in Q8_1 to subtract as we do on Zen4 for pure AVX2 // because there the result of the _mm256_maddubs_epi16() instruction may overflow the int16_t range // (and it gets satured if it does), leading to wrong results. 
- .vec_dot_type = GGML_TYPE_Q8_1_X4, + .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, #endif @@ -897,6 +897,16 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .nrows = 1, .row_meta_size = 0, }, + [GGML_TYPE_Q8_2_X4] = { + .type_name = "q8_2_x4", + .blck_size = QK8_2, + .type_size = sizeof(block_q8_2), + .is_quantized = true, + .from_float = quantize_row_q8_2_x4, + .from_float_ref = quantize_row_q8_2_x4, + .nrows = 1, + .row_meta_size = 0, + }, [GGML_TYPE_Q2_K] = { .type_name = "q2_K", .blck_size = QK_K, @@ -1272,7 +1282,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_iq4_nl_q8_0, #if GGML_USE_IQK_MULMAT #if defined __AVX2__ - .vec_dot_type = GGML_TYPE_Q8_1_X4, + .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, #endif @@ -1628,7 +1638,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = vec_dot_iq4_nl_r4_q8_0, #if GGML_USE_IQK_MULMAT #if defined __AVX2__ - .vec_dot_type = GGML_TYPE_Q8_1_X4, + .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, #endif @@ -1662,7 +1672,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = vec_dot_q4_0_r8_q8_0, #if GGML_USE_IQK_MULMAT #if defined __AVX2__ - .vec_dot_type = GGML_TYPE_Q8_1_X4, + .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, #endif @@ -1683,7 +1693,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = vec_dot_q8_0_r8_q8_0, #if GGML_USE_IQK_MULMAT #if defined __AVX2__ - .vec_dot_type = GGML_TYPE_Q8_1_X4, + .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, #endif @@ -1704,7 +1714,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = vec_dot_q5_0_r4_q8_0, #if GGML_USE_IQK_MULMAT #if defined __AVX2__ - .vec_dot_type = GGML_TYPE_Q8_1_X4, + .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, #endif @@ -1725,7 +1735,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = vec_dot_q6_0_r4_q8_0, #if GGML_USE_IQK_MULMAT #if defined __AVX2__ - .vec_dot_type = GGML_TYPE_Q8_1_X4, + .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, #endif @@ -11647,6 +11657,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_Q8_1: case GGML_TYPE_Q8_0_X4: case GGML_TYPE_Q8_1_X4: + case GGML_TYPE_Q8_2_X4: case GGML_TYPE_Q2_K: case GGML_TYPE_Q2_K_R4: case GGML_TYPE_Q3_K: @@ -11815,6 +11826,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_Q8_1: case GGML_TYPE_Q8_0_X4: case GGML_TYPE_Q8_1_X4: + case GGML_TYPE_Q8_2_X4: case GGML_TYPE_Q2_K: case GGML_TYPE_Q2_K_R4: case GGML_TYPE_Q3_K: @@ -15690,6 +15702,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_Q8_1: case GGML_TYPE_Q8_0_X4: case GGML_TYPE_Q8_1_X4: + case GGML_TYPE_Q8_2_X4: case GGML_TYPE_Q2_K: case GGML_TYPE_Q2_K_R4: case GGML_TYPE_Q3_K: @@ -15997,6 +16010,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_Q8_1: case GGML_TYPE_Q8_0_X4: case GGML_TYPE_Q8_1_X4: + case GGML_TYPE_Q8_2_X4: case GGML_TYPE_Q2_K: case GGML_TYPE_Q2_K_R4: case GGML_TYPE_Q3_K: @@ -16627,6 +16641,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_Q8_1: case GGML_TYPE_Q8_0_X4: case GGML_TYPE_Q8_1_X4: + case GGML_TYPE_Q8_2_X4: case GGML_TYPE_Q2_K: case GGML_TYPE_Q2_K_R4: case GGML_TYPE_Q3_K: diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp index 4d29e2f0..cf512ba5 100644 --- a/ggml/src/iqk/iqk_mul_mat.cpp +++ 
b/ggml/src/iqk/iqk_mul_mat.cpp @@ -3436,9 +3436,9 @@ static void mul_mat_iq2_bn_r4_q8_k16(int n, const void * vx, size_t bx, const Da #ifdef HAVE_FANCY_SIMD template <int nrc_y> -static void mul_mat_iq4_nl_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { +static void mul_mat_iq4_nl_r4_q8_2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { GGML_ASSERT(nrc_x%8 == 0); - Q8<nrc_y, block_q8_1_x4> q8(info); + Q8<nrc_y, block_q8_2_x4> q8(info); auto m4 = _mm512_set1_epi8(0xf); auto values = load_iq4nl_values_512(); int nb = n / QK4_NL; @@ -3475,7 +3475,8 @@ static void mul_mat_iq4_nl_r4_q8_1(int n, const void * vx, size_t bx, const Data const block_iq4_nl_r4 * iq4h = (const block_iq4_nl_r4 *)((const char *)vx + (ix+4)*bx); for (int ib4 = 0; ib4 < nb/4; ++ib4) { for (int iy = 0; iy < nrc_y; ++iy) { - _mm256_storeu_ps(d8+8*iy, _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q8.y[iy][ib4].d))); + auto aux = _mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)q8.y[iy][ib4].d)), 16); + _mm256_storeu_ps(d8+8*iy, _mm256_castsi256_ps(aux)); } for (int k = 0; k < 4; ++k) { auto scales = prepare(iq4l[4*ib4+k], iq4h[4*ib4+k]); @@ -3492,9 +3493,10 @@ static void mul_mat_iq4_nl_r4_q8_1(int n, const void * vx, size_t bx, const Data for (int iy = 0; iy < nrc_y; ++iy) { auto qy = (const block_q8_1 *)q8.y[iy]; auto sumi = dot(_mm256_loadu_si256((const __m256i*)qy[ib].qs)); - auto dy = _mm512_set1_ps(GGML_FP16_TO_FP32(qy[ib].d)); + ggml_bf16_t d, s; d.bits = qy[ib].d; s.bits = qy[ib].s; + auto dy = _mm512_set1_ps(GGML_BF16_TO_FP32(d)); acc[2*iy+0] = _mm512_fmadd_ps(_mm512_mul_ps(scales, dy), _mm512_cvtepi32_ps(sumi), acc[2*iy+0]); - acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(GGML_FP16_TO_FP32(qy[ib].s)), acc[2*iy+1]); + acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(GGML_BF16_TO_FP32(s)), acc[2*iy+1]); } } for (int iy = 0; iy < nrc_y; ++iy) { @@ -3509,9 +3511,9 @@ static void mul_mat_iq4_nl_r4_q8_1(int n, const void * vx, size_t bx, const Data } #else template <int nrc_y> -static void mul_mat_iq4_nl_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { +static void mul_mat_iq4_nl_r4_q8_2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { GGML_ASSERT(nrc_x%4 == 0); - Q8<nrc_y, block_q8_1_x4> q8(info); + Q8<nrc_y, block_q8_2_x4> q8(info); auto m4 = _mm256_set1_epi8(0xf); auto m1 = _mm256_set1_epi16(1); auto values128 = _mm_loadu_si128((const __m128i *)iq4k_values); @@ -3548,7 +3550,8 @@ static void mul_mat_iq4_nl_r4_q8_1(int n, const void * vx, size_t bx, const Data const block_iq4_nl_r4 * iq4 = (const block_iq4_nl_r4 *)((const char *)vx + ix*bx); for (int ib4 = 0; ib4 < nb/4; ++ib4) { for (int iy = 0; iy < nrc_y; ++iy) { - _mm_storeu_ps(d8+4*iy, _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)q8.y[iy][ib4].d))); + auto aux = _mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i *)q8.y[iy][ib4].d)), 16); + _mm_storeu_ps(d8+4*iy, _mm_castsi128_ps(aux)); } for (int k = 0; k < 4; ++k) { auto scales = prepare(iq4[4*ib4+k]); @@ -3564,7 +3567,8 @@ static void mul_mat_iq4_nl_r4_q8_1(int n, const void * vx, size_t bx, const Data for (int iy = 0; iy < nrc_y; ++iy) { auto qy = (const block_q8_1 *)q8.y[iy]; auto sumi = dot(_mm256_loadu_si256((const __m256i*)qy[ib].qs)); - auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_FP16_TO_FP32(qy[ib].d))); + ggml_bf16_t d{qy[ib].d}; + auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(d))); acc[iy] = _mm256_fmadd_ps(d4d8, 
_mm256_cvtepi32_ps(sumi), acc[iy]); } } @@ -3623,7 +3627,7 @@ inline __m256i accum_q4_0_quants(const __m256i * v, const int8_t * qs) { } template <int nrc_y> -static void mul_mat_q4_0_r8_q8_1_avx2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { +static void mul_mat_q4_0_r8_q8_2_avx2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { GGML_ASSERT(nrc_x%8 == 0); Q8<nrc_y, block_q8_1_x4> q8(info); auto m4 = _mm256_set1_epi8(0xf); @@ -3637,7 +3641,7 @@ static void mul_mat_q4_0_r8_q8_1_avx2(int n, const void * vx, size_t bx, const D auto acc1 = _mm256_setzero_ps(); auto acc2 = _mm256_setzero_ps(); for (int ib4 = 0; ib4 < nb/4; ++ib4) { - helper.vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q8.y[0][ib4].d)); + helper.vec = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)q8.y[0][ib4].d)), 16)); for (int k = 0; k < 4; ++k) { auto scales = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)iq4[4*ib4+k].d)); prepare_q4_0_quants_avx2(iq4[4*ib4+k].qs, v, m4); @@ -3652,9 +3656,10 @@ static void mul_mat_q4_0_r8_q8_1_avx2(int n, const void * vx, size_t bx, const D auto scales = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)iq4[ib].d)); prepare_q4_0_quants_avx2(iq4[ib].qs, v, m4); auto sumi = accum_q4_0_quants(v, qy[ib].qs); - auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_FP16_TO_FP32(qy[ib].d))); + ggml_bf16_t d{qy[ib].d}, s{qy[ib].s}; + auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(d))); acc1 = _mm256_fmadd_ps(d4d8, _mm256_cvtepi32_ps(sumi), acc1); - acc2 = _mm256_fmadd_ps(scales, _mm256_set1_ps(GGML_FP16_TO_FP32(qy[ib].s)), acc2); + acc2 = _mm256_fmadd_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(s)), acc2); } acc1 = _mm256_fmadd_ps(acc2, _mm256_set1_ps(-8.f), acc1); info.store(ix, 0, acc1); @@ -3672,7 +3677,7 @@ static void mul_mat_q4_0_r8_q8_1_avx2(int n, const void * vx, size_t bx, const D d4[k] = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)iq4[4*ib4+k].d)); } for (int iy = 0; iy < nrc_y; ++iy) { - auto scales = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q8.y[iy][ib4].d)); + auto scales = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)q8.y[iy][ib4].d)), 16)); _mm256_storeu_ps(d8 + 8*iy, scales); auto m4 = _mm256_extractf128_ps(scales, 1); auto m8 = _mm256_set_m128(m4, m4); @@ -3700,9 +3705,10 @@ static void mul_mat_q4_0_r8_q8_1_avx2(int n, const void * vx, size_t bx, const D for (int iy = 0; iy < nrc_y; ++iy) { auto qy = (const block_q8_1 *)q8.y[iy]; auto sumi = accum_q4_0_quants(v, qy[ib].qs); - auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_FP16_TO_FP32(qy[ib].d))); + ggml_bf16_t d{qy[ib].d}, s{qy[ib].s}; + auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(d))); acc[iy] = _mm256_fmadd_ps(d4d8, _mm256_cvtepi32_ps(sumi), acc[iy]); - acc[iy] = _mm256_fmadd_ps(scales_m, _mm256_set1_ps(GGML_FP16_TO_FP32(qy[ib].s)), acc[iy]); + acc[iy] = _mm256_fmadd_ps(scales_m, _mm256_set1_ps(GGML_BF16_TO_FP32(s)), acc[iy]); } } for (int iy = 0; iy < nrc_y; ++iy) { @@ -3977,9 +3983,9 @@ static void mul_mat_iq1_m_r4_q8_0(int n, const void * vx, size_t bx, const DataI #ifdef HAVE_FANCY_SIMD template <int nrc_y> -static void mul_mat_q4_0_r8_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { +static void mul_mat_q4_0_r8_q8_2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { if constexpr (nrc_y == 1) { - mul_mat_q4_0_r8_q8_1_avx2<1>(n, vx, bx, info, nrc_x); + 
mul_mat_q4_0_r8_q8_2_avx2<1>(n, vx, bx, info, nrc_x); return; } GGML_ASSERT(nrc_x%16 == 0); @@ -4024,7 +4030,8 @@ static void mul_mat_q4_0_r8_q8_1(int n, const void * vx, size_t bx, const DataIn const block_iq4_nl_r8 * iq4h = (const block_iq4_nl_r8 *)((const char *)vx + (ix+8)*bx); for (int ib4 = 0; ib4 < nb/4; ++ib4) { for (int iy = 0; iy < nrc_y; ++iy) { - _mm256_storeu_ps(d8+8*iy, _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q8.y[iy][ib4].d))); + auto aux = _mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)q8.y[iy][ib4].d)), 16); + _mm256_storeu_ps(d8+8*iy, _mm256_castsi256_ps(aux)); } for (int k = 0; k < 4; ++k) { auto scales = prepare(iq4l[4*ib4+k], iq4h[4*ib4+k]); @@ -4041,9 +4048,10 @@ static void mul_mat_q4_0_r8_q8_1(int n, const void * vx, size_t bx, const DataIn for (int iy = 0; iy < nrc_y; ++iy) { auto qy = (const block_q8_1 *)q8.y[iy]; auto sumi = dot(qy[ib].qs); - auto dy = _mm512_set1_ps(GGML_FP16_TO_FP32(qy[ib].d)); + ggml_bf16_t d{qy[ib].d}, s{qy[ib].s}; + auto dy = _mm512_set1_ps(GGML_BF16_TO_FP32(d)); acc[2*iy+0] = _mm512_fmadd_ps(_mm512_mul_ps(scales, dy), _mm512_cvtepi32_ps(sumi), acc[2*iy+0]); - acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(GGML_FP16_TO_FP32(qy[ib].s)), acc[2*iy+1]); + acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(GGML_BF16_TO_FP32(s)), acc[2*iy+1]); } } for (int iy = 0; iy < nrc_y; ++iy) { @@ -4055,15 +4063,15 @@ static void mul_mat_q4_0_r8_q8_1(int n, const void * vx, size_t bx, const DataIn } #else template <int nrc_y> -static void mul_mat_q4_0_r8_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - mul_mat_q4_0_r8_q8_1_avx2<nrc_y>(n, vx, bx, info, nrc_x); +static void mul_mat_q4_0_r8_q8_2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + mul_mat_q4_0_r8_q8_2_avx2<nrc_y>(n, vx, bx, info, nrc_x); } #endif template <int nrc_y> -static void mul_mat_q5_0_r4_q8_1_avx2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { +static void mul_mat_q5_0_r4_q8_2_avx2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { GGML_ASSERT(nrc_x%4 == 0); - Q8<nrc_y, block_q8_1_x4> q8(info); + Q8<nrc_y, block_q8_2_x4> q8(info); auto m4 = _mm256_set1_epi8(0xf); auto m5 = _mm256_set1_epi8(0x10); #ifndef HAVE_FANCY_SIMD @@ -4110,7 +4118,7 @@ static void mul_mat_q5_0_r4_q8_1_avx2(int n, const void * vx, size_t bx, const D const block_q5_0_r4 * iq5 = (const block_q5_0_r4 *)((const char *)vx + ix*bx); for (int ib4 = 0; ib4 < nb/4; ++ib4) { for (int iy = 0; iy < nrc_y; ++iy) { - auto scales = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q8.y[iy][ib4].d)); + auto scales = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)q8.y[iy][ib4].d)), 16)); _mm256_storeu_ps(d8 + 8*iy, _mm256_mul_ps(mscale, scales)); } for (int k = 0; k < 4; ++k) { @@ -4128,9 +4136,10 @@ static void mul_mat_q5_0_r4_q8_1_avx2(int n, const void * vx, size_t bx, const D for (int iy = 0; iy < nrc_y; ++iy) { auto qy = (const block_q8_1 *)q8.y[iy]; auto sumi = dot(_mm256_loadu_si256((const __m256i*)qy[ib].qs)); - auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_FP16_TO_FP32(qy[ib].d))); + ggml_bf16_t d{qy[ib].d}, s{qy[ib].s}; + auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(d))); acc[iy] = _mm256_fmadd_ps(d4d8, _mm256_cvtepi32_ps(sumi), acc[iy]); - acc[iy] = _mm256_fmadd_ps(scales, _mm256_set1_ps(-8.f*GGML_FP16_TO_FP32(qy[ib].s)), acc[iy]); + acc[iy] = _mm256_fmadd_ps(scales, _mm256_set1_ps(-8.f*GGML_BF16_TO_FP32(s)), 
acc[iy]); } } for (int iy = 0; iy < nrc_y; ++iy) { @@ -4143,12 +4152,12 @@ static void mul_mat_q5_0_r4_q8_1_avx2(int n, const void * vx, size_t bx, const D #ifdef HAVE_FANCY_SIMD template <int nrc_y> -static void mul_mat_q5_0_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { +static void mul_mat_q5_0_r4_q8_2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { if constexpr (nrc_y == 1) { - mul_mat_q5_0_r4_q8_1_avx2<1>(n, vx, bx, info, nrc_x); + mul_mat_q5_0_r4_q8_2_avx2<1>(n, vx, bx, info, nrc_x); } else { GGML_ASSERT(nrc_x%8 == 0); - Q8<nrc_y, block_q8_1_x4> q8(info); + Q8<nrc_y, block_q8_2_x4> q8(info); auto m4 = _mm512_set1_epi8(0xf); auto m5 = _mm512_set1_epi8(0x10); int nb = n / QK5_0; @@ -4190,7 +4199,7 @@ static void mul_mat_q5_0_r4_q8_1(int n, const void * vx, size_t bx, const DataIn const block_q5_0_r4 * iq5h = (const block_q5_0_r4 *)((const char *)vx + (ix+4)*bx); for (int ib4 = 0; ib4 < nb/4; ++ib4) { for (int iy = 0; iy < nrc_y; ++iy) { - _mm256_storeu_ps(d8+8*iy, _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q8.y[iy][ib4].d))); + _mm256_storeu_ps(d8+8*iy, _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)q8.y[iy][ib4].d)), 16))); } for (int k = 0; k < 4; ++k) { auto scales = prepare(iq5l[4*ib4+k], iq5h[4*ib4+k]); @@ -4207,9 +4216,10 @@ static void mul_mat_q5_0_r4_q8_1(int n, const void * vx, size_t bx, const DataIn for (int iy = 0; iy < nrc_y; ++iy) { auto qy = (const block_q8_1 *)q8.y[iy]; auto sumi = dot(_mm256_loadu_si256((const __m256i*)qy[ib].qs)); - auto dy = _mm512_set1_ps(GGML_FP16_TO_FP32(qy[ib].d)); + ggml_bf16_t d{qy[ib].d}, s{qy[ib].s}; + auto dy = _mm512_set1_ps(GGML_BF16_TO_FP32(d)); acc[2*iy+0] = _mm512_fmadd_ps(_mm512_mul_ps(scales, dy), _mm512_cvtepi32_ps(sumi), acc[2*iy+0]); - acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(GGML_FP16_TO_FP32(qy[ib].s)), acc[2*iy+1]); + acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(GGML_BF16_TO_FP32(s)), acc[2*iy+1]); } } for (int iy = 0; iy < nrc_y; ++iy) { @@ -4225,15 +4235,15 @@ static void mul_mat_q5_0_r4_q8_1(int n, const void * vx, size_t bx, const DataIn } #else template <int nrc_y> -static void mul_mat_q5_0_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - mul_mat_q5_0_r4_q8_1_avx2<nrc_y>(n, vx, bx, info, nrc_x); +static void mul_mat_q5_0_r4_q8_2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + mul_mat_q5_0_r4_q8_2_avx2<nrc_y>(n, vx, bx, info, nrc_x); } #endif template <int nrc_y> -static void mul_mat_q6_0_r4_q8_1_avx2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { +static void mul_mat_q6_0_r4_q8_2_avx2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { GGML_ASSERT(nrc_x%4 == 0); - Q8<nrc_y, block_q8_1_x4> q8(info); + Q8<nrc_y, block_q8_2_x4> q8(info); auto m4 = _mm256_set1_epi8(0xf); auto m6 = _mm256_set1_epi8(0x30); auto mscale = _mm256_set_m128(_mm_set1_ps(-16.f), _mm_set1_ps(1.f)); @@ -4278,7 +4288,7 @@ static void mul_mat_q6_0_r4_q8_1_avx2(int n, const void * vx, size_t bx, const D const block_q6_0_r4 * iq6 = (const block_q6_0_r4 *)((const char *)vx + ix*bx); for (int ib4 = 0; ib4 < nb/4; ++ib4) { for (int iy = 0; iy < nrc_y; ++iy) { - auto scales = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q8.y[iy][ib4].d)); + auto scales = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)q8.y[iy][ib4].d)), 16)); _mm256_storeu_ps(d8 + 8*iy, _mm256_mul_ps(scales, mscale)); } for (int k 
= 0; k < 4; ++k) { @@ -4296,9 +4306,10 @@ static void mul_mat_q6_0_r4_q8_1_avx2(int n, const void * vx, size_t bx, const D for (int iy = 0; iy < nrc_y; ++iy) { auto qy = (const block_q8_1 *)q8.y[iy]; auto sumi = dot(_mm256_loadu_si256((const __m256i*)qy[ib].qs)); - auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_FP16_TO_FP32(qy[ib].d))); + ggml_bf16_t d{qy[ib].d}, s{qy[ib].s}; + auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(d))); acc[iy] = _mm256_fmadd_ps(d4d8, _mm256_cvtepi32_ps(sumi), acc[iy]); - acc[iy] = _mm256_fmadd_ps(scales, _mm256_set1_ps(-16.f*GGML_FP16_TO_FP32(qy[ib].s)), acc[iy]); + acc[iy] = _mm256_fmadd_ps(scales, _mm256_set1_ps(-16.f*GGML_BF16_TO_FP32(s)), acc[iy]); } } @@ -4312,12 +4323,12 @@ static void mul_mat_q6_0_r4_q8_1_avx2(int n, const void * vx, size_t bx, const D #ifdef HAVE_FANCY_SIMD template <int nrc_y> -static void mul_mat_q6_0_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { +static void mul_mat_q6_0_r4_q8_2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { if constexpr (nrc_y == 1) { - mul_mat_q6_0_r4_q8_1_avx2<1>(n, vx, bx, info, nrc_x); + mul_mat_q6_0_r4_q8_2_avx2<1>(n, vx, bx, info, nrc_x); } else { GGML_ASSERT(nrc_x%8 == 0); - Q8<nrc_y, block_q8_1_x4> q8(info); + Q8<nrc_y, block_q8_2_x4> q8(info); auto m4 = _mm512_set1_epi8(0xf); auto m6 = _mm512_set1_epi8(0x30); int nb = n / QK6_0; @@ -4357,7 +4368,7 @@ static void mul_mat_q6_0_r4_q8_1(int n, const void * vx, size_t bx, const DataIn const block_q6_0_r4 * iq6h = (const block_q6_0_r4 *)((const char *)vx + (ix+4)*bx); for (int ib4 = 0; ib4 < nb/4; ++ib4) { for (int iy = 0; iy < nrc_y; ++iy) { - auto scales = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q8.y[iy][ib4].d)); + auto scales = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)q8.y[iy][ib4].d)), 16)); _mm256_storeu_ps(d8 + 8*iy, scales); } for (int k = 0; k < 4; ++k) { @@ -4375,9 +4386,10 @@ static void mul_mat_q6_0_r4_q8_1(int n, const void * vx, size_t bx, const DataIn for (int iy = 0; iy < nrc_y; ++iy) { auto qy = (const block_q8_1 *)q8.y[iy]; auto sumi = dot(_mm256_loadu_si256((const __m256i*)qy[ib].qs)); - auto dy = _mm512_set1_ps(GGML_FP16_TO_FP32(qy[ib].d)); + ggml_bf16_t d{qy[ib].d}, s{qy[ib].s}; + auto dy = _mm512_set1_ps(GGML_BF16_TO_FP32(d)); acc[2*iy+0] = _mm512_fmadd_ps(_mm512_mul_ps(scales, dy), _mm512_cvtepi32_ps(sumi), acc[2*iy+0]); - acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(GGML_FP16_TO_FP32(qy[ib].s)), acc[2*iy+1]); + acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(GGML_BF16_TO_FP32(s)), acc[2*iy+1]); } } for (int iy = 0; iy < nrc_y; ++iy) { @@ -4393,8 +4405,8 @@ static void mul_mat_q6_0_r4_q8_1(int n, const void * vx, size_t bx, const DataIn } #else template <int nrc_y> -static void mul_mat_q6_0_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - mul_mat_q6_0_r4_q8_1_avx2<nrc_y>(n, vx, bx, info, nrc_x); +static void mul_mat_q6_0_r4_q8_2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + mul_mat_q6_0_r4_q8_2_avx2<nrc_y>(n, vx, bx, info, nrc_x); } #endif @@ -4437,20 +4449,12 @@ inline __m256i q8_0_r8_dot_product(const uint8_t * x, const int8_t * y, __m256i for (int i = 0; i < 8; ++i) { qx[i] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)x+i), _mm256_set1_epi8(127)); } - //qx[0] = _mm256_loadu_si256((const __m256i *)x+0); - //qx[1] = _mm256_loadu_si256((const __m256i *)x+1); - //qx[2] = _mm256_loadu_si256((const __m256i *)x+2); - //qx[3] = 
_mm256_loadu_si256((const __m256i *)x+3); - //qx[4] = _mm256_loadu_si256((const __m256i *)x+4); - //qx[5] = _mm256_loadu_si256((const __m256i *)x+5); - //qx[6] = _mm256_loadu_si256((const __m256i *)x+6); - //qx[7] = _mm256_loadu_si256((const __m256i *)x+7); return qx_r8_q8_dot_product(qx, y); } template <int nrc_y> -static void mul_mat_q8_0_r8_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { +static void mul_mat_q8_0_r8_q8_2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { GGML_ASSERT(nrc_x%16 == 0); - Q8<nrc_y, block_q8_1_x4> q8(info); + Q8<nrc_y, block_q8_2_x4> q8(info); int nb = n / QK8_0; if constexpr (nrc_y == 1) { __m256 acc[2] = {}; @@ -4459,7 +4463,8 @@ static void mul_mat_q8_0_r8_q8_1(int n, const void * vx, size_t bx, const DataIn for (int ix = 0; ix < nrc_x; ix += 8) { const block_q8_0_r8 * iq8 = (const block_q8_0_r8 *)((const char *)vx + ix*bx); for (int ib4 = 0; ib4 < nb/4; ++ib4) { - _mm256_storeu_ps(d8, _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q8.y[0][ib4].d))); + auto aux = _mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)q8.y[0][ib4].d)), 16); + _mm256_storeu_ps(d8, _mm256_castsi256_ps(aux)); for (int k = 0; k < 4; ++k) { auto scales = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)iq8[4*ib4+k].d)); auto sumi = q8_0_r8_dot_product((const uint8_t *)iq8[4*ib4+k].qs, q8.y[0][ib4].qs+32*k, qx); @@ -4473,9 +4478,10 @@ static void mul_mat_q8_0_r8_q8_1(int n, const void * vx, size_t bx, const DataIn for (int ib = 4*(nb/4); ib < nb; ++ib) { auto scales = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)iq8[ib].d)); auto sumi = q8_0_r8_dot_product((const uint8_t *)iq8[ib].qs, qy[ib].qs, qx); - auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_FP16_TO_FP32(qy[ib].d))); + ggml_bf16_t d, s; d.bits = qy[ib].d; s.bits = qy[ib].s; + auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(d))); acc[0] = _mm256_fmadd_ps(d4d8, _mm256_cvtepi32_ps(sumi), acc[0]); - acc[1] = _mm256_fmadd_ps(scales, _mm256_set1_ps(GGML_FP16_TO_FP32(qy[ib].s)), acc[1]); + acc[1] = _mm256_fmadd_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(s)), acc[1]); } } info.store(ix, 0, _mm256_fmadd_ps(_mm256_set1_ps(-127.f), acc[1], acc[0])); @@ -4490,7 +4496,8 @@ static void mul_mat_q8_0_r8_q8_1(int n, const void * vx, size_t bx, const DataIn const block_q8_0_r8 * q8h = (const block_q8_0_r8 *)((const char *)vx + (ix+8)*bx); for (int ib4 = 0; ib4 < nb/4; ++ib4) { for (int iy = 0; iy < nrc_y; ++iy) { - _mm256_storeu_ps(d8+8*iy, _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q8.y[iy][ib4].d))); + auto aux = _mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)q8.y[iy][ib4].d)), 16); + _mm256_storeu_ps(d8+8*iy, _mm256_castsi256_ps(aux)); } for (int k = 0; k < 4; ++k) { auto scales1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q8l[4*ib4+k].d)); @@ -4521,9 +4528,10 @@ static void mul_mat_q8_0_r8_q8_1(int n, const void * vx, size_t bx, const DataIn for (int iy = 0; iy < nrc_y; ++iy) { auto qy = (const block_q8_1 *)q8.y[iy]; auto sumi = qx_r8_q8_dot_product(qx, qy[ib].qs); - auto dy = _mm512_set1_ps(GGML_FP16_TO_FP32(qy[ib].d)); + ggml_bf16_t d, s; d.bits = qy[ib].d; s.bits = qy[ib].s; + auto dy = _mm512_set1_ps(GGML_BF16_TO_FP32(d)); acc[2*iy+0] = _mm512_fmadd_ps(_mm512_mul_ps(scales, dy), _mm512_cvtepi32_ps(sumi), acc[2*iy+0]); - acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(GGML_FP16_TO_FP32(qy[ib].s)), acc[2*iy+1]); + acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(GGML_BF16_TO_FP32(s)), 
acc[2*iy+1]); } } for (int iy = 0; iy < nrc_y; ++iy) { @@ -4536,9 +4544,9 @@ static void mul_mat_q8_0_r8_q8_1(int n, const void * vx, size_t bx, const DataIn } #else template <int nrc_y> -static void mul_mat_q8_0_r8_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { +static void mul_mat_q8_0_r8_q8_2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { GGML_ASSERT(nrc_x%8 == 0); - Q8<nrc_y, block_q8_1_x4> q8(info); + Q8<nrc_y, block_q8_2_x4> q8(info); auto m1 = _mm256_set1_epi16(1); int nb = n / QK8_0; __m256 acc[nrc_y] = {}; @@ -4561,7 +4569,7 @@ static void mul_mat_q8_0_r8_q8_1(int n, const void * vx, size_t bx, const DataIn const block_q8_0_r8 * iq8 = (const block_q8_0_r8 *)((const char *)vx + ix*bx); for (int ib4 = 0; ib4 < nb/4; ++ib4) { for (int iy = 0; iy < nrc_y; ++iy) { - auto scales = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)q8.y[iy][ib4].d)); + auto scales = _mm_castsi128_ps(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i *)q8.y[iy][ib4].d)), 16)); _mm_storeu_ps(d8 + 4*iy, scales); } for (int k = 0; k < 4; ++k) { @@ -4593,9 +4601,9 @@ static void mul_mat_q8_0_r8_q8_1(int n, const void * vx, size_t bx, const DataIn sx[j] = _mm256_sign_epi8(qx[j], qx[j]); } for (int iy = 0; iy < nrc_y; ++iy) { - auto qy = (const block_q8_1 *)q8.y[iy]; + auto qy = (const block_q8_2 *)q8.y[iy]; auto sumi = dot(qy[ib].qs); - auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_FP16_TO_FP32(qy[ib].d))); + auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(ggml_bf16_t{qy[ib].d}))); acc[iy] = _mm256_fmadd_ps(d4d8, _mm256_cvtepi32_ps(sumi), acc[iy]); } for (int j = 0; j < 4; ++j) { @@ -4603,9 +4611,9 @@ static void mul_mat_q8_0_r8_q8_1(int n, const void * vx, size_t bx, const DataIn sx[j] = _mm256_sign_epi8(qx[j], qx[j]); } for (int iy = 0; iy < nrc_y; ++iy) { - auto qy = (const block_q8_1 *)q8.y[iy]; + auto qy = (const block_q8_2 *)q8.y[iy]; auto sumi = dot(qy[ib].qs+16); - auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_FP16_TO_FP32(qy[ib].d))); + auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(ggml_bf16_t{qy[ib].d}))); acc[iy] = _mm256_fmadd_ps(d4d8, _mm256_cvtepi32_ps(sumi), acc[iy]); } } @@ -8199,6 +8207,29 @@ struct ScaleHelperQ_0_1 { const __m128 min = _mm_set1_ps(float(-min_value)); }; +//template <int min_value> +//struct ScaleHelperQ_0_2 { +// ggml_bf16_t scales8[4]; +// template <typename Q> +// inline __m256 prepare4(const Q * y) { +// for (int j = 0; j < 4; ++j) scales8[j] = y[j].d; +// auto s4 = _mm_castsi128_ps(_mm_slli_epi16(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i *)scales8)), 16)); +// return _mm256_set_m128(_mm_mul_ps(s4, min), s4); +// } +// template <typename Q> +// inline __m256 prepare4(__m256 other_scales, const Q * y) { +// return _mm_mul256_ps(other_scales, prepare4<Q>(y)); +// } +// template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const { +// float d = GGML_BF16_TO_FP32(y->d); +// return std::make_pair(d, -d*float(min_value)); +// } +// std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const { +// return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s)); +// } +// const __m128 min = _mm_set1_ps(float(-min_value)); +//}; + struct ScaleHelperQ8_1 { template <typename Q> inline __m256 prepare4(const Q * y) { @@ -8220,6 +8251,30 @@ struct ScaleHelperQ8_1 { } }; +struct ScaleHelperQ8_2 { + template <typename Q> + inline __m256 prepare4(const Q * y) { + const 
block_q8_2_x4 * y4 = (const block_q8_2_x4 *)y; + auto aux = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)y4->d)); + return _mm256_castsi256_ps(_mm256_slli_epi32(aux, 16)); + } + template <typename Q> + inline __m256 prepare4(__m256 other_scales, const Q * y) { + return _mm256_mul_ps(other_scales, prepare4<Q>(y)); + } + template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const { + return std::make_pair(GGML_BF16_TO_FP32(y->d), GGML_BF16_TO_FP32(y->m)); + } + template <typename Q> inline std::pair<float, float> prepare1(const std::pair<float, float>& dm, const Q * y) const { + ggml_bf16_t d, s; d.bits = y->d; s.bits = y->s; + return std::make_pair(dm.first*GGML_BF16_TO_FP32(d), dm.second*GGML_BF16_TO_FP32(s)); + } + std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_2 * y) const { + ggml_bf16_t d, s; d.bits = y->d; s.bits = y->s; + return std::make_pair(dm.first*GGML_BF16_TO_FP32(d), dm.second*GGML_BF16_TO_FP32(s)); + } +}; + struct ScaleHelperQ_1 { uint32_t scales8[4]; const __m128i shuffle = _mm_set_epi16(0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100); @@ -8320,7 +8375,8 @@ using AccumType1 = AccumT<MinusType1<nrc_y>, nrc_y, is_multiple_of_4>; using Sum4Type0 = Sum4<block_q8_0, block_q8_0_x4, SignedDot>; using Sum4Type1 = Sum4<block_q8_1, block_q8_1_x4, UnsignedDot>; using Sum4TypeQ80 = Sum4<block_q8_0, block_q8_0_x4, SignedDot, false>; -using Sum4TypeQ81 = Sum4<block_q8_1, block_q8_1_x4, UnsignedDot, false>; +//using Sum4TypeQ81 = Sum4<block_q8_1, block_q8_1_x4, UnsignedDot, false>; +using Sum4TypeQ82 = Sum4<block_q8_2, block_q8_2_x4, UnsignedDot, false>; template <typename Unpacker, typename AccumType, typename Scales, typename Q8, int nrc_y> void mul_mat_qX_q8_Helper(int nb, const void * vx, size_t bx, const DataInfo& info, const Q8 ** y, int nrc_x) { @@ -8366,6 +8422,22 @@ void mul_mat_qX_1_q8_1_T(int n, const void * vx, size_t bx, const DataInfo& info } } +template <typename Unpacker, int nrc_y> +void mul_mat_qX_1_q8_2_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n%Unpacker::block_size() == 0); + Q8<nrc_y, block_q8_2> q8(info); + int nb = n/Unpacker::block_size(); + if (nb%4 == 0) { + mul_mat_qX_q8_Helper<Unpacker, AccumType1<nrc_y, true>, ScaleHelperQ8_2, block_q8_2, nrc_y>( + nb, vx, bx, info, q8.y, nrc_x + ); + } else { + mul_mat_qX_q8_Helper<Unpacker, AccumType1<nrc_y, false>, ScaleHelperQ8_2, block_q8_2, nrc_y>( + nb, vx, bx, info, q8.y, nrc_x + ); + } +} + struct Dequantizer4bit { const __m256i m4 = _mm256_set1_epi8(0xf); inline __m256i dequant(const uint8_t * qs) const { @@ -8494,73 +8566,6 @@ struct Q_Unpacker { } }; -struct Q8_0_x4_Unpacker_256 { - using Sum4T = Sum4TypeQ80; - inline static int block_size() { return QK8_0; } - Q8_0_x4_Unpacker_256(const void * vx, size_t bx) : cx_0((const char *)vx), x((const block_q8_0_x4 *)cx_0), bx(bx) {} - - const char * cx_0; - const block_q8_0_x4 * x; - size_t bx; - - __m256i qx[4]; - - inline const __m256i* quants() const { return qx; } - - inline void set_row(int ix) { x = (const block_q8_0_x4 *)(cx_0 + ix*bx); } - - inline auto set_block_4(int i) { - auto scales = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)x[i].d)); - for (int j = 0; j < 4; ++j) { - qx[j] = _mm256_loadu_si256((const __m256i *)x[i].qs + j); - } - return scales; - } - inline auto set_block(int i) { - auto q8 = (const block_q8_0 *)(x + i); - qx[0] = _mm256_loadu_si256((const __m256i *)q8->qs); - return GGML_FP16_TO_FP32(q8->d); - } -}; - 
-#ifdef HAVE_FANCY_SIMD -struct Q8_0_x4_Unpacker_512 { - using Sum4T = Sum4TypeQ81; - inline static int block_size() { return QK8_0; } - Q8_0_x4_Unpacker_512(const void * vx, size_t bx) : cx_0((const char *)vx), x((const block_q8_0_x4 *)cx_0), bx(bx) {} - - const char * cx_0; - const block_q8_0_x4 * x; - size_t bx; - const __m128 min = _mm_set1_ps(-128.f); - - __m256i qx[4]; - - inline const __m256i* quants() const { return qx; } - - inline void set_row(int ix) { x = (const block_q8_0_x4 *)(cx_0 + ix*bx); } - - inline auto set_block_4(int i) { - auto scales = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)x[i].d)); - for (int j = 0; j < 4; ++j) { - qx[j] = _mm256_loadu_si256((const __m256i *)x[i].qs + j); - qx[j] = _mm256_xor_si256(qx[j], _mm256_set1_epi8(-128)); - } - return _mm256_set_m128(_mm_mul_ps(scales, min), scales); - } - inline auto set_block(int i) { - auto q8 = (const block_q8_0 *)(x + i); - qx[0] = _mm256_loadu_si256((const __m256i *)q8->qs); - qx[0] = _mm256_xor_si256(qx[0], _mm256_set1_epi8(-128)); - float d = GGML_FP16_TO_FP32(q8->d); - return std::make_pair(d, -128.f*d); - } -}; -using Q8_0_x4_Unpacker = Q8_0_x4_Unpacker_512; -#else -using Q8_0_x4_Unpacker = Q8_0_x4_Unpacker_256; -#endif - struct Q8_0_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0, Q8_0_Dequantizer> { Q8_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} using Sum4T = Sum4TypeQ80; @@ -8568,7 +8573,7 @@ struct Q8_0_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0, Q8_0_ }; struct Q8_0_1_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0_1<127>, Q8_0_1_Dequantizer> { Q8_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} - using Sum4T = Sum4TypeQ81; + using Sum4T = Sum4TypeQ82; inline static int block_size() { return QK8_0; } }; struct Q4_0_Unpacker final : public Q_Unpacker<block_q4_0, ScaleHelperQ_0, Q4_0_Dequantizer> { @@ -8578,12 +8583,12 @@ struct Q4_0_Unpacker final : public Q_Unpacker<block_q4_0, ScaleHelperQ_0, Q4_0_ }; struct Q4_0_1_Unpacker final : public Q_Unpacker<block_q4_0, ScaleHelperQ_0_1<8>, Q4_0_1_Dequantizer> { Q4_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} - using Sum4T = Sum4TypeQ81; + using Sum4T = Sum4TypeQ82; inline static int block_size() { return QK4_0; } }; struct IQ4_NL_Unpacker final : public Q_Unpacker<block_iq4_nl, ScaleHelperQ_0_1<128>, IQ4_NL_Dequantizer> { IQ4_NL_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} - using Sum4T = Sum4TypeQ81; + using Sum4T = Sum4TypeQ82; inline static int block_size() { return QK4_NL; } }; struct Q5_0_Unpacker final : public Q_Unpacker<block_q5_0, ScaleHelperQ_0, Q5_0_Dequantizer> { @@ -8593,22 +8598,22 @@ struct Q5_0_Unpacker final : public Q_Unpacker<block_q5_0, ScaleHelperQ_0, Q5_0_ }; struct Q5_0_1_Unpacker final : public Q_Unpacker<block_q5_0, ScaleHelperQ_0_1<16>, Q5_1_Dequantizer<block_q5_0>> { Q5_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} - using Sum4T = Sum4TypeQ81; + using Sum4T = Sum4TypeQ82; inline static int block_size() { return QK5_0; } }; struct Q4_1_Unpacker final : public Q_Unpacker<block_q4_1, ScaleHelperQ_1, Q4_1_Dequantizer> { Q4_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} - using Sum4T = Sum4Type1; + using Sum4T = Sum4TypeQ82; inline static int block_size() { return QK4_1; } }; struct Q5_1_Unpacker final : public Q_Unpacker<block_q5_1, ScaleHelperQ_1, Q5_1_Dequantizer<block_q5_1>> { Q5_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} - using Sum4T = Sum4Type1; - inline 
static int block_size() { return QK4_1; } + using Sum4T = Sum4TypeQ82; + inline static int block_size() { return QK5_1; } }; struct Q6_0_1_Unpacker final : public Q_Unpacker<block_q6_0, ScaleHelperQ_0_1<32>, Q6_0_1_Dequantizer> { Q6_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} - using Sum4T = Sum4TypeQ81; + using Sum4T = Sum4TypeQ82; inline static int block_size() { return QK6_0; } }; @@ -9096,18 +9101,27 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) { m.funcs[6] = mul_mat_qX_0_q8_0_T<Dequantizer, 7>; m.funcs[7] = mul_mat_qX_0_q8_0_T<Dequantizer, 8>; } - else if constexpr (std::is_same_v<Dequantizer, Q4_1_Unpacker> || std::is_same_v<Dequantizer, Q5_1_Unpacker> || - std::is_same_v<Dequantizer, Q8_0_1_Unpacker> || std::is_same_v<Dequantizer, Q4_0_1_Unpacker> || + else if constexpr (std::is_same_v<Dequantizer, Q4_1_Unpacker> || std::is_same_v<Dequantizer, Q5_1_Unpacker>) { + m.funcs[0] = mul_mat_qX_1_q8_2_T<Dequantizer, 1>; + m.funcs[1] = mul_mat_qX_1_q8_2_T<Dequantizer, 2>; + m.funcs[2] = mul_mat_qX_1_q8_2_T<Dequantizer, 3>; + m.funcs[3] = mul_mat_qX_1_q8_2_T<Dequantizer, 4>; + m.funcs[4] = mul_mat_qX_1_q8_2_T<Dequantizer, 5>; + m.funcs[5] = mul_mat_qX_1_q8_2_T<Dequantizer, 6>; + m.funcs[6] = mul_mat_qX_1_q8_2_T<Dequantizer, 7>; + m.funcs[7] = mul_mat_qX_1_q8_2_T<Dequantizer, 8>; + } + else if constexpr (std::is_same_v<Dequantizer, Q8_0_1_Unpacker> || std::is_same_v<Dequantizer, Q4_0_1_Unpacker> || std::is_same_v<Dequantizer, Q5_0_1_Unpacker> || std::is_same_v<Dequantizer, IQ4_NL_Unpacker> || std::is_same_v<Dequantizer, Q6_0_1_Unpacker>) { - m.funcs[0] = mul_mat_qX_1_q8_1_T<Dequantizer, 1>; - m.funcs[1] = mul_mat_qX_1_q8_1_T<Dequantizer, 2>; - m.funcs[2] = mul_mat_qX_1_q8_1_T<Dequantizer, 3>; - m.funcs[3] = mul_mat_qX_1_q8_1_T<Dequantizer, 4>; - m.funcs[4] = mul_mat_qX_1_q8_1_T<Dequantizer, 5>; - m.funcs[5] = mul_mat_qX_1_q8_1_T<Dequantizer, 6>; - m.funcs[6] = mul_mat_qX_1_q8_1_T<Dequantizer, 7>; - m.funcs[7] = mul_mat_qX_1_q8_1_T<Dequantizer, 8>; + m.funcs[0] = mul_mat_qX_1_q8_2_T<Dequantizer, 1>; + m.funcs[1] = mul_mat_qX_1_q8_2_T<Dequantizer, 2>; + m.funcs[2] = mul_mat_qX_1_q8_2_T<Dequantizer, 3>; + m.funcs[3] = mul_mat_qX_1_q8_2_T<Dequantizer, 4>; + m.funcs[4] = mul_mat_qX_1_q8_2_T<Dequantizer, 5>; + m.funcs[5] = mul_mat_qX_1_q8_2_T<Dequantizer, 6>; + m.funcs[6] = mul_mat_qX_1_q8_2_T<Dequantizer, 7>; + m.funcs[7] = mul_mat_qX_1_q8_2_T<Dequantizer, 8>; } else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ3S> || std::is_same_v<Dequantizer, DequantizerIQ3XXS> || std::is_same_v<Dequantizer, DequantizerIQ2S> || std::is_same_v<Dequantizer, DequantizerIQ2XS> || @@ -9383,33 +9397,33 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) { case GGML_TYPE_Q4_0: assert (ne00 % QK4_0 == 0); MulMat::set_functions<Q4_0_1_Unpacker>(mm); - expected_typeB = GGML_TYPE_Q8_1_X4; + expected_typeB = GGML_TYPE_Q8_2_X4; break; case GGML_TYPE_Q4_1: assert (ne00 % QK4_1 == 0); MulMat::set_functions<Q4_1_Unpacker>(mm); - expected_typeB = GGML_TYPE_Q8_1_X4; + expected_typeB = GGML_TYPE_Q8_2_X4; break; case GGML_TYPE_Q5_0: assert (ne00 % QK5_0 == 0); MulMat::set_functions<Q5_0_1_Unpacker>(mm); - expected_typeB = GGML_TYPE_Q8_1_X4; + expected_typeB = GGML_TYPE_Q8_2_X4; break; case GGML_TYPE_Q5_1: assert (ne00 % QK5_1 == 0); MulMat::set_functions<Q5_1_Unpacker>(mm); - expected_typeB = GGML_TYPE_Q8_1_X4; + expected_typeB = GGML_TYPE_Q8_2_X4; break; case GGML_TYPE_Q6_0: assert (ne00 % QK6_0 == 0); MulMat::set_functions<Q6_0_1_Unpacker>(mm); - 
expected_typeB = GGML_TYPE_Q8_1_X4; + expected_typeB = GGML_TYPE_Q8_2_X4; break; case GGML_TYPE_Q8_0: assert (ne00 % QK8_0 == 0); #ifdef HAVE_FANCY_SIMD MulMat::set_functions<Q8_0_1_Unpacker>(mm); - expected_typeB = GGML_TYPE_Q8_1_X4; + expected_typeB = GGML_TYPE_Q8_2_X4; #else MulMat::set_functions<Q8_0_Unpacker>(mm); expected_typeB = GGML_TYPE_Q8_0_X4; @@ -9418,19 +9432,19 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) { case GGML_TYPE_IQ4_NL: assert (ne00 % QK4_NL == 0); MulMat::set_functions<IQ4_NL_Unpacker>(mm); - expected_typeB = GGML_TYPE_Q8_1_X4; + expected_typeB = GGML_TYPE_Q8_2_X4; break; case GGML_TYPE_IQ4_NL_R4: assert (ne00 % QK4_NL == 0); - mm.funcs[0] = mul_mat_iq4_nl_r4_q8_1<1>; - mm.funcs[1] = mul_mat_iq4_nl_r4_q8_1<2>; - mm.funcs[2] = mul_mat_iq4_nl_r4_q8_1<3>; - mm.funcs[3] = mul_mat_iq4_nl_r4_q8_1<4>; - mm.funcs[4] = mul_mat_iq4_nl_r4_q8_1<5>; - mm.funcs[5] = mul_mat_iq4_nl_r4_q8_1<6>; - mm.funcs[6] = mul_mat_iq4_nl_r4_q8_1<7>; - mm.funcs[7] = mul_mat_iq4_nl_r4_q8_1<8>; - expected_typeB = GGML_TYPE_Q8_1_X4; + mm.funcs[0] = mul_mat_iq4_nl_r4_q8_2<1>; + mm.funcs[1] = mul_mat_iq4_nl_r4_q8_2<2>; + mm.funcs[2] = mul_mat_iq4_nl_r4_q8_2<3>; + mm.funcs[3] = mul_mat_iq4_nl_r4_q8_2<4>; + mm.funcs[4] = mul_mat_iq4_nl_r4_q8_2<5>; + mm.funcs[5] = mul_mat_iq4_nl_r4_q8_2<6>; + mm.funcs[6] = mul_mat_iq4_nl_r4_q8_2<7>; + mm.funcs[7] = mul_mat_iq4_nl_r4_q8_2<8>; + expected_typeB = GGML_TYPE_Q8_2_X4; break; case GGML_TYPE_IQ4_XS_R8: assert (ne00 % QK_K == 0); @@ -9685,54 +9699,54 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) { break; case GGML_TYPE_Q4_0_R8: assert (ne00 % QK4_NL == 0); - mm.funcs[0] = mul_mat_q4_0_r8_q8_1<1>; - mm.funcs[1] = mul_mat_q4_0_r8_q8_1<2>; - mm.funcs[2] = mul_mat_q4_0_r8_q8_1<3>; - mm.funcs[3] = mul_mat_q4_0_r8_q8_1<4>; - mm.funcs[4] = mul_mat_q4_0_r8_q8_1<5>; - mm.funcs[5] = mul_mat_q4_0_r8_q8_1<6>; - mm.funcs[6] = mul_mat_q4_0_r8_q8_1<7>; - mm.funcs[7] = mul_mat_q4_0_r8_q8_1<8>; + mm.funcs[0] = mul_mat_q4_0_r8_q8_2<1>; + mm.funcs[1] = mul_mat_q4_0_r8_q8_2<2>; + mm.funcs[2] = mul_mat_q4_0_r8_q8_2<3>; + mm.funcs[3] = mul_mat_q4_0_r8_q8_2<4>; + mm.funcs[4] = mul_mat_q4_0_r8_q8_2<5>; + mm.funcs[5] = mul_mat_q4_0_r8_q8_2<6>; + mm.funcs[6] = mul_mat_q4_0_r8_q8_2<7>; + mm.funcs[7] = mul_mat_q4_0_r8_q8_2<8>; #ifdef HAVE_FANCY_SIMD - mm.func16 = mul_mat_q4_0_r8_q8_1<16>; + mm.func16 = mul_mat_q4_0_r8_q8_2<16>; #endif - expected_typeB = GGML_TYPE_Q8_1_X4; + expected_typeB = GGML_TYPE_Q8_2_X4; break; case GGML_TYPE_Q5_0_R4: assert (ne00 % QK4_NL == 0); - mm.funcs[0] = mul_mat_q5_0_r4_q8_1<1>; - mm.funcs[1] = mul_mat_q5_0_r4_q8_1<2>; - mm.funcs[2] = mul_mat_q5_0_r4_q8_1<3>; - mm.funcs[3] = mul_mat_q5_0_r4_q8_1<4>; - mm.funcs[4] = mul_mat_q5_0_r4_q8_1<5>; - mm.funcs[5] = mul_mat_q5_0_r4_q8_1<6>; - mm.funcs[6] = mul_mat_q5_0_r4_q8_1<7>; - mm.funcs[7] = mul_mat_q5_0_r4_q8_1<8>; - expected_typeB = GGML_TYPE_Q8_1_X4; + mm.funcs[0] = mul_mat_q5_0_r4_q8_2<1>; + mm.funcs[1] = mul_mat_q5_0_r4_q8_2<2>; + mm.funcs[2] = mul_mat_q5_0_r4_q8_2<3>; + mm.funcs[3] = mul_mat_q5_0_r4_q8_2<4>; + mm.funcs[4] = mul_mat_q5_0_r4_q8_2<5>; + mm.funcs[5] = mul_mat_q5_0_r4_q8_2<6>; + mm.funcs[6] = mul_mat_q5_0_r4_q8_2<7>; + mm.funcs[7] = mul_mat_q5_0_r4_q8_2<8>; + expected_typeB = GGML_TYPE_Q8_2_X4; break; case GGML_TYPE_Q6_0_R4: assert (ne00 % QK4_NL == 0); - mm.funcs[0] = mul_mat_q6_0_r4_q8_1<1>; - mm.funcs[1] = mul_mat_q6_0_r4_q8_1<2>; - mm.funcs[2] = mul_mat_q6_0_r4_q8_1<3>; - mm.funcs[3] = mul_mat_q6_0_r4_q8_1<4>; - mm.funcs[4] = 
mul_mat_q6_0_r4_q8_1<5>; - mm.funcs[5] = mul_mat_q6_0_r4_q8_1<6>; - mm.funcs[6] = mul_mat_q6_0_r4_q8_1<7>; - mm.funcs[7] = mul_mat_q6_0_r4_q8_1<8>; - expected_typeB = GGML_TYPE_Q8_1_X4; + mm.funcs[0] = mul_mat_q6_0_r4_q8_2<1>; + mm.funcs[1] = mul_mat_q6_0_r4_q8_2<2>; + mm.funcs[2] = mul_mat_q6_0_r4_q8_2<3>; + mm.funcs[3] = mul_mat_q6_0_r4_q8_2<4>; + mm.funcs[4] = mul_mat_q6_0_r4_q8_2<5>; + mm.funcs[5] = mul_mat_q6_0_r4_q8_2<6>; + mm.funcs[6] = mul_mat_q6_0_r4_q8_2<7>; + mm.funcs[7] = mul_mat_q6_0_r4_q8_2<8>; + expected_typeB = GGML_TYPE_Q8_2_X4; break; case GGML_TYPE_Q8_0_R8: assert (ne00 % QK4_NL == 0); - mm.funcs[0] = mul_mat_q8_0_r8_q8_1<1>; - mm.funcs[1] = mul_mat_q8_0_r8_q8_1<2>; - mm.funcs[2] = mul_mat_q8_0_r8_q8_1<3>; - mm.funcs[3] = mul_mat_q8_0_r8_q8_1<4>; - mm.funcs[4] = mul_mat_q8_0_r8_q8_1<5>; - mm.funcs[5] = mul_mat_q8_0_r8_q8_1<6>; - mm.funcs[6] = mul_mat_q8_0_r8_q8_1<7>; - mm.funcs[7] = mul_mat_q8_0_r8_q8_1<8>; - expected_typeB = GGML_TYPE_Q8_1_X4; + mm.funcs[0] = mul_mat_q8_0_r8_q8_2<1>; + mm.funcs[1] = mul_mat_q8_0_r8_q8_2<2>; + mm.funcs[2] = mul_mat_q8_0_r8_q8_2<3>; + mm.funcs[3] = mul_mat_q8_0_r8_q8_2<4>; + mm.funcs[4] = mul_mat_q8_0_r8_q8_2<5>; + mm.funcs[5] = mul_mat_q8_0_r8_q8_2<6>; + mm.funcs[6] = mul_mat_q8_0_r8_q8_2<7>; + mm.funcs[7] = mul_mat_q8_0_r8_q8_2<8>; + expected_typeB = GGML_TYPE_Q8_2_X4; break; case GGML_TYPE_IQ1_S: mm.funcs[0] = mul_mat_iq1_s_q8_K<1>; @@ -15219,8 +15233,8 @@ template <int D, int step> struct HelperQ80 final : public BaseHelper<step> { using Base = BaseHelper<step>; #ifdef HAVE_FANCY_SIMD - using block_q8 = block_q8_1; - constexpr static int block_size_q = QK8_1; + using block_q8 = block_q8_2; + constexpr static int block_size_q = QK8_2; #else using block_q8 = block_q8_0; constexpr static int block_size_q = QK8_0; @@ -15268,6 +15282,15 @@ struct HelperQ80 final : public BaseHelper<step> { } } + static inline void convert(int nq, int stride_q, const float * q, block_q8_2 * y) { + //GGML_ASSERT(nq <= step); Why did I have this assert? 
+ for (int i = 0; i < nq; ++i) { + quantize_row_q8_2_x4(q, y, D); + q += stride_q; + y += D/QK8_2; + } + } + static inline void convert(int nq, int stride_q, const float * q, block_q8_KV<D> * y) { for (int i = 0; i < nq; ++i) { quantize_row_q8_KV(q, y, D); @@ -15281,8 +15304,8 @@ template <int D, int step> struct HelperQ80R8 : public BaseHelper<step> { using Base = BaseHelper<step>; #ifdef __AVX2__ - constexpr static int block_size_q = QK8_1; - using block_q8 = block_q8_1; + constexpr static int block_size_q = QK8_2; + using block_q8 = block_q8_2; #else constexpr static int block_size_q = QK8_0; using block_q8 = block_q8_0; @@ -15491,8 +15514,8 @@ struct HelperQ8KVR8 : public BaseHelper<step> { template <int D, int step> struct HelperQ40 final : public BaseHelper<step> { using Base = BaseHelper<step>; - using block_q8 = block_q8_0; - constexpr static int block_size_q = QK8_0; + using block_q8 = block_q8_2; + constexpr static int block_size_q = QK8_2; HelperQ40(const char * data, int stride) : Base(data, stride) {} // Needed for v * softmax(k * q) @@ -15584,8 +15607,8 @@ struct HelperIQ4nl final : public BaseHelper<step> { constexpr static int block_size_q = QK8_0; #else HelperIQ4nl(const char * data, int stride) : Base(data, stride) {} - using block_q8 = block_q8_1; - constexpr static int block_size_q = QK8_1; + using block_q8 = block_q8_2; + constexpr static int block_size_q = QK8_2; #endif // Needed for v * softmax(k * q) @@ -15631,8 +15654,8 @@ struct HelperQ60 final : public BaseHelper<step> { using block_q8 = block_q8_0; constexpr static int block_size_q = QK8_0; #else - using block_q8 = block_q8_1; - constexpr static int block_size_q = QK8_1; + using block_q8 = block_q8_2; + constexpr static int block_size_q = QK8_2; #endif using Base = BaseHelper<step>; HelperQ60(const char * data, int stride) : Base(data, stride) {} @@ -16350,7 +16373,7 @@ struct FlashQKfp32 { MAKE_FUNCS(mul_mat_qX_0_q8_0<DequantizerQ80, nq); #else #ifdef HAVE_FANCY_SIMD - MAKE_FUNCS(mul_mat_qX_1_q8_1_T<Q8_0_1_Unpacker, nq); + MAKE_FUNCS(mul_mat_qX_1_q8_2_T<Q8_0_1_Unpacker, nq); #else MAKE_FUNCS(mul_mat_qX_0_q8_0_T<Q8_0_Unpacker, nq); #endif @@ -16373,7 +16396,7 @@ struct FlashQKfp32 { #ifdef __aarch64__ MAKE_FUNCS_ONLY_NRC(mul_mat_q8_0_r8_q8_0, nq); #else - MAKE_FUNCS_ONLY_NRC(mul_mat_q8_0_r8_q8_1, nq); + MAKE_FUNCS_ONLY_NRC(mul_mat_q8_0_r8_q8_2, nq); #endif } else if constexpr (std::is_same_v<KHelper, HelperQ8KVR8<D, k_step>>) { @@ -16383,7 +16406,7 @@ struct FlashQKfp32 { #ifdef __aarch64__ MAKE_FUNCS(mul_mat_qX_0_q8_0<DequantizerQ60, nq); #else - MAKE_FUNCS(mul_mat_qX_1_q8_1_T<Q6_0_1_Unpacker, nq); + MAKE_FUNCS(mul_mat_qX_1_q8_2_T<Q6_0_1_Unpacker, nq); #endif } #if GGML_IQK_FA_ALL_QUANTS @@ -16391,21 +16414,21 @@ struct FlashQKfp32 { #ifdef __aarch64__ MAKE_FUNCS(mul_mat_qX_0_q8_0<DequantizerQ40, nq); #else - MAKE_FUNCS(mul_mat_qX_0_q8_0_T<Q4_0_Unpacker, nq); + MAKE_FUNCS(mul_mat_qX_1_q8_2_T<Q4_0_Unpacker, nq); #endif } else if constexpr (std::is_same_v<KHelper, HelperQ41<D, k_step>>) { #ifdef __aarch64__ MAKE_FUNCS(mul_mat_qX_1_q8_1<DequantizerQ41, nq); #else - MAKE_FUNCS(mul_mat_qX_1_q8_1_T<Q4_1_Unpacker, nq); + MAKE_FUNCS(mul_mat_qX_1_q8_2_T<Q4_1_Unpacker, nq); #endif } else if constexpr (std::is_same_v<KHelper, HelperIQ4nl<D, k_step>>) { #ifdef __aarch64__ MAKE_FUNCS(mul_mat_qX_0_q8_0<DequantizerIQ4NL, nq); #else - MAKE_FUNCS(mul_mat_qX_1_q8_1_T<IQ4_NL_Unpacker, nq); + MAKE_FUNCS(mul_mat_qX_1_q8_2_T<IQ4_NL_Unpacker, nq); #endif } #endif diff --git a/ggml/src/iqk/iqk_quantize.cpp 
b/ggml/src/iqk/iqk_quantize.cpp index 5e657f4a..cac1fd49 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -798,13 +798,14 @@ void quantize_row_q8_0_x4(const float * x, void * vy, int64_t k) { #endif } -void quantize_row_q8_1_x4(const float * x, void * vy, int64_t k) { +namespace { +template <typename Block, typename Block_x4> +void quantize_row_q8_1_x4_T(const float * x, Block * y, int64_t k) { assert(k % QK8_1 == 0); const int nb = k / QK8_1; const int nb4 = 4*(nb/4); - block_q8_1 * y = (block_q8_1 *)vy; - block_q8_1_x4 * y4 = (block_q8_1_x4 *)vy; + Block_x4 * y4 = (Block_x4 *)y; #if defined(__aarch64__) for (int i = 0; i < nb; i++) { int i4 = i/4, ir = i%4; @@ -851,10 +852,18 @@ void quantize_row_q8_1_x4(const float * x, void * vy, int64_t k) { accv = vaddq_s32(accv, vi); } - if (i < nb4) { - y4[i4].d[ir+4] = GGML_FP32_TO_FP16(d * vaddvq_s32(accv)); + if constexpr (std::is_same_v<Block, block_q8_1>) { + if (i < nb4) { + y4[i4].d[ir+4] = GGML_FP32_TO_FP16(d * vaddvq_s32(accv)); + } else { + y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv)); + } } else { - y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv)); + if (i < nb4) { + y4[i4].s[ir] = vaddvq_s32(accv); + } else { + y[i].s = vaddvq_s32(accv); + } } } #else @@ -880,13 +889,25 @@ void quantize_row_q8_1_x4(const float * x, void * vy, int64_t k) { const float max_scalar = _mm_cvtss_f32( max4 ); // Quantize these floats - const float d = max_scalar / 127.f; - if (i < nb4) { - y4[i4].d[ir] = GGML_FP32_TO_FP16(d); + float d = max_scalar / 127.f; + if constexpr (std::is_same_v<Block, block_q8_1>) { + if (i < nb4) { + y4[i4].d[ir] = GGML_FP32_TO_FP16(d); + } else { + y[i].d = GGML_FP32_TO_FP16(d); + } } else { - y[i].d = GGML_FP32_TO_FP16(d); + if (i < nb4) { + auto t = GGML_FP32_TO_BF16(d); + y4[i4].d[ir] = t.bits; + d = ggml_bf16_to_fp32(t); + } else { + auto t = GGML_FP32_TO_BF16(d); + y[i].d = t.bits; + d = ggml_bf16_to_fp32(t); + } } - const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f; + const float id = d > 0 ? 
1/d : 0.f; const __m256 mul = _mm256_set1_ps( id ); // Apply the multiplier @@ -908,10 +929,19 @@ void quantize_row_q8_1_x4(const float * x, void * vy, int64_t k) { __m256i i3 = _mm256_cvtps_epi32( v3 ); // Compute the sum of the quants and set y[i].s - if (i < nb4) { - y4[i4].d[ir+4] = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)))); + int isum = hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); + if constexpr (std::is_same_v<Block, block_q8_1>) { + if (i < nb4) { + y4[i4].d[ir+4] = GGML_FP32_TO_FP16(d * isum); + } else { + y[i].s = GGML_FP32_TO_FP16(d * isum); + } } else { - y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)))); + if (i < nb4) { + y4[i4].d[ir+4] = GGML_FP32_TO_BF16(d * isum).bits; + } else { + y[i].s = GGML_FP32_TO_BF16(d * isum).bits; + } } // Convert int32 to int16 @@ -934,6 +964,15 @@ void quantize_row_q8_1_x4(const float * x, void * vy, int64_t k) { } #endif } +} + +void quantize_row_q8_1_x4(const float * x, void * vy, int64_t k) { + quantize_row_q8_1_x4_T<block_q8_1, block_q8_1_x4>(x, (block_q8_1 *)vy, k); +} + +void quantize_row_q8_2_x4(const float * x, void * vy, int64_t k) { + quantize_row_q8_1_x4_T<block_q8_2, block_q8_2_x4>(x, (block_q8_2 *)vy, k); +} // // ============================================== iq2_K diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h index dd148f2e..478bd0de 100644 --- a/ggml/src/iqk/iqk_quantize.h +++ b/ggml/src/iqk/iqk_quantize.h @@ -238,6 +238,7 @@ void quantize_row_q8_K32(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, void quantize_row_q8_KR8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q8_0_x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q8_1_x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q8_2_x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void repack_f32_bf16_r16 (const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row); void repack_bf16_bf16_r16(const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row); |
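
A usage note on the kernels above: wherever a q8_1_x4 kernel previously loaded eight fp16 scales with _mm256_cvtph_ps(), the q8_2_x4 version widens eight bf16 scale words instead. A hedged sketch of that idiom follows (AVX2; the function name and the d16 argument are illustrative, standing in for q8.y[iy][ib4].d):

#include <immintrin.h>
#include <stdint.h>

// Widen 8 bf16 scale words to 8 fp32 lanes: zero-extend each uint16_t to 32 bits,
// shift the bf16 bits into the high half of the 32-bit word, and reinterpret.
static inline __m256 load_bf16x8_as_f32(const uint16_t * d16) {
    __m128i raw  = _mm_loadu_si128((const __m128i *)d16);  // 8 x bf16 bit patterns
    __m256i wide = _mm256_cvtepu16_epi32(raw);             // zero-extend to 8 x uint32
    __m256i bits = _mm256_slli_epi32(wide, 16);            // move into the fp32 bit positions
    return _mm256_castsi256_ps(bits);                      // reinterpret as 8 floats
}

The single-block tail loops in the diff do the same conversion scalar-wise through ggml_bf16_t and GGML_BF16_TO_FP32().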