author     Kawrakow <iwankawrakow@gmail.com>       2025-02-09 09:14:52 +0200
committer  GitHub <noreply@github.com>             2025-02-09 09:14:52 +0200
commit     33390c4b74fa52875d6028c5c9aaf84f17288c25 (patch)
tree       97578e59530316c4b0637590d28e7e46ac8bb892
parent     6d7b58eade37e45e3d8286a2353658047539d2b2 (diff)
Use Q8_K_128 for IQ1_S_R4 and IQ1_M_R4 matrix multiplications (#194)
* iq1_s_r4: Use Q8_K_128 instead of Q8_1_X4 for gemm (AVX2/Zen4)
* iq1_m_r4: Use Q8_K_128 instead of Q8_1_X4 for gemm (AVX2/Zen4)
* iq1_s_r4: Use Q8_K_128 instead of Q8_1_X4 for gemm (Neon)
* iq1_m_r4: Use Q8_K_128 instead of Q8_0_X4 for gemm (Neon)
* Simdify q8_K128 quantization also on Neon
* Cleanup
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
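
The core of the change is a new activation ("B matrix") quantization type, GGML_TYPE_Q8_K128: a single fp32 scale per 128 values plus the integer sum of the 8-bit quants in each 32-value sub-block. The layout below is the block definition this commit adds to ggml-common.h; only the explanatory comments are editorial.

typedef struct {
    float   d;          // delta: value i is reconstructed as d * qs[i]
    int16_t bsums[4];   // sums of the 32 int8 quants in each sub-block
    int8_t  qs[128];    // quants
} block_q8_K128;
static_assert(sizeof(block_q8_K128) == sizeof(float) + 4*sizeof(int16_t) + 128,
              "wrong q8_K128 block size/padding");

Because IQ1_S/IQ1_M rows add a constant per-sub-block shift on top of the grid values, the dot product of that shift with the activations collapses to shift * bsums[k]; as the kernel changes below show, this removes the per-32 f16 scale loads that the previous Q8_1_X4/Q8_0_X4 interchange formats required.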
-rw-r--r--  ggml/include/ggml.h             1
-rw-r--r--  ggml/src/ggml-common.h          5
-rw-r--r--  ggml/src/ggml.c                13
-rw-r--r--  ggml/src/iqk/iqk_mul_mat.cpp   75
-rw-r--r--  ggml/src/iqk/iqk_quantize.cpp 117
-rw-r--r--  ggml/src/iqk/iqk_quantize.h     1
6 files changed, 169 insertions, 43 deletions
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index c307d42e..66bcb25a 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -415,6 +415,7 @@ extern "C" {
         GGML_TYPE_Q8_K16 = 147,
         GGML_TYPE_Q8_K32 = 148,
         GGML_TYPE_Q8_KR8 = 149,
+        GGML_TYPE_Q8_K128 = 150,
 
         GGML_TYPE_Q4_0_R8 = 202,
         GGML_TYPE_Q5_0_R4 = 206,
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index 0d014c23..4308f0b9 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -377,15 +377,16 @@
 } block_q8_K;
 static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
 
 typedef struct {
-    float d;       // delta
+    float d;              // delta
     int8_t qs[64];        // quants
 } block_q8_K64;
 static_assert(sizeof(block_q8_K64) == sizeof(float) + 64, "wrong q8_K64 block size/padding");
 typedef struct {
     float d;              // delta
+    int16_t bsums[4];     // quant sums for blocks of 32
     int8_t qs[128];       // quants
 } block_q8_K128;
-static_assert(sizeof(block_q8_K128) == sizeof(float) + 128, "wrong q8_K128 block size/padding");
+static_assert(sizeof(block_q8_K128) == sizeof(float) + 4*sizeof(int16_t) + 128, "wrong q8_K128 block size/padding");
 typedef struct {
     ggml_half d[8];       // delta
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index b19fb006..e07dd547 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1192,7 +1192,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float = quantize_row_iq1_s_r4,
         .from_float_ref = (ggml_from_float_t)quantize_row_iq1_s_r4_ref,
         .vec_dot = vec_dot_iq1_s_r4_q8_k,
-        .vec_dot_type = GGML_TYPE_Q8_1_X4,
+        .vec_dot_type = GGML_TYPE_Q8_K128,
         .nrows = 1,
         .row_meta_size = 2,
     },
@@ -1218,7 +1218,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float = quantize_row_iq1_m_r4,
         .from_float_ref = (ggml_from_float_t)quantize_row_iq1_m_r4_ref,
         .vec_dot = vec_dot_iq1_m_r4_q8_k,
-        .vec_dot_type = GGML_TYPE_Q8_0_X4,
+        .vec_dot_type = GGML_TYPE_Q8_K128,
         .nrows = 1,
         .row_meta_size = 2,
     },
@@ -1354,6 +1354,14 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float = quantize_row_q8_K64,
         .row_meta_size = 0,
     },
+    [GGML_TYPE_Q8_K128] = {
+        .type_name = "q8_K128",
+        .blck_size = 128,
+        .type_size = sizeof(block_q8_K128),
+        .is_quantized = true,
+        .from_float = quantize_row_q8_K128,
+        .row_meta_size = 0,
+    },
     [GGML_TYPE_Q8_K16] = {
         .type_name = "q8_K16",
         .blck_size = 64,
@@ -16161,6 +16169,7 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_IQ1_M_R4:
         case GGML_TYPE_Q8_K:
         case GGML_TYPE_Q8_K64:
+        case GGML_TYPE_Q8_K128:
         case GGML_TYPE_Q8_K16:
         case GGML_TYPE_Q8_K32:
         case GGML_TYPE_Q4_0_4_4:
diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp
index c561ca2b..aeba2c59 100644
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -3528,26 +3528,27 @@ static void mul_mat_q4_0_r8_q8_1_avx2(int n, const void * vx, size_t bx, const D
 template <int nrc_y>
 static void mul_mat_iq1_s_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
     GGML_ASSERT(nrc_x%4 == 0);
-    Q8<nrc_y, block_q8_1_x4> q8(info);
+    Q8<nrc_y, block_q8_K128> q8(info);
     int nb = n / 32;
     GGML_ASSERT(nb%4 == 0);
     __m256i qx[4];
     __m256 acc[nrc_y] = {};
     auto m1 = _mm256_set1_epi16(1);
     auto ms = _mm_set1_epi16(-32768);
-    float d8[8*nrc_y];
+    float d8[4*nrc_y];
     union { __m256i vec; uint16_t val[16]; } helper;
     struct aux_iq1_s_r4 {
         uint8_t  qs[16];
         uint64_t qh;
     };
-    for (int ix= 0; ix < nrc_x; ix += 4) {
+    for (int ix = 0; ix < nrc_x; ix += 4) {
         auto dptr = (const ggml_half *)((const char *)vx + ix*bx);
         auto d1 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)dptr));
         auto x = (const aux_iq1_s_r4 *)(dptr + 4);
         for (int ib = 0; ib < nb/4; ++ib) {
             for (int iy = 0; iy < nrc_y; ++iy) {
-                _mm256_storeu_ps(d8 + 8*iy, _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q8.y[iy][ib].d)));
+                auto bsums = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)q8.y[iy][ib].bsums));
+                _mm_storeu_ps(d8 + 4*iy, _mm_mul_ps(_mm_set1_ps(q8.y[iy][ib].d), _mm_cvtepi32_ps(bsums)));
             }
             for (int k = 0; k < 4; ++k) {
                 auto idxh = _mm256_set1_epi64x(x[4*ib+k].qh);
@@ -3556,8 +3557,8 @@ static void mul_mat_iq1_s_r4_q8_1(int n, const void * vx, size_t bx, const DataI
                 scales4 = _mm_or_si128(_mm_slli_epi16(scales4, 1), _mm_set1_epi16(1));
                 auto signs = _mm_or_si128(_mm_cmpeq_epi16(_mm_and_si128(sas, ms), ms), _mm256_castsi256_si128(m1));
                 signs = _mm_add_epi16(_mm_set1_epi16(-8), signs);
-                auto delta4 = _mm_mul_ps(_mm_set1_ps(0.0625f), _mm_cvtepi32_ps(_mm_cvtepi16_epi32(
-                                _mm_mullo_epi16(scales4, signs))));
+                signs = _mm_mullo_epi16(signs, scales4);
+                auto delta4 = _mm_mul_ps(_mm_set1_ps(0.0625f), _mm_cvtepi32_ps(_mm_cvtepi16_epi32(signs)));
                 auto delta = _mm256_set_m128(delta4, delta4);
                 scales4 = _mm_unpacklo_epi16(scales4, scales4); // 0,0, 1,1, 2,2, 3,3
                 auto scales = MM256_SET_M128I(scales4, scales4);
@@ -3598,8 +3599,8 @@ static void mul_mat_iq1_s_r4_q8_1(int n, const void * vx, size_t bx, const DataI
                     auto sumi = _mm256_packs_epi32(sumi1, sumi2);
 #endif
                     sumi = _mm256_madd_epi16(scales, sumi);
-                    acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d8[8*iy+k+0]), _mm256_cvtepi32_ps(sumi), acc[iy]);
-                    acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d8[8*iy+k+4]), delta, acc[iy]);
+                    acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(q8.y[iy][ib].d), _mm256_cvtepi32_ps(sumi), acc[iy]);
+                    acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d8[4*iy+k]), delta, acc[iy]);
                 }
             }
         }
@@ -3614,7 +3615,7 @@ static void mul_mat_iq1_s_r4_q8_1(int n, const void * vx, size_t bx, const DataI
 template <int nrc_y>
 static void mul_mat_iq1_m_r4_q8_0(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
     GGML_ASSERT(nrc_x%4 == 0);
-    Q8<nrc_y, block_q8_0_x4> q8(info);
+    Q8<nrc_y, block_q8_K128> q8(info);
     int nb = n / 32;
     GGML_ASSERT(nb%4 == 0);
     auto shuffle0 = _mm256_set_epi64x(0x0909090909090909, 0x0808080808080808, 0x0101010101010101, 0x0000000000000000);
@@ -3624,17 +3625,14 @@ static void mul_mat_iq1_m_r4_q8_0(int n, const void * vx, size_t bx, const DataI
 #endif
     __m256i qx[4];
     __m256 acc[nrc_y] = {};
+    __m256i isum[nrc_y] = {};
     auto ms = _mm_set1_epi8(0x08);
-    float d8[4*nrc_y];
     union { __m256i vec; uint16_t val[16]; } helper;
     for (int ix= 0; ix < nrc_x; ix += 4) {
         auto dptr = (const ggml_half *)((const char *)vx + ix*bx);
         auto d1 = _mm_mul_ps(_mm_set1_ps(0.125f), _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)dptr)));
         auto x = (const block_iq1_m_r4 *)(dptr + 4);
         for (int ib = 0; ib < nb/4; ++ib) {
-            for (int iy = 0; iy < nrc_y; ++iy) {
-                _mm_storeu_ps(d8 + 4*iy, _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)q8.y[iy][ib].d)));
-            }
             for (int k = 0; k < 4; ++k) {
                 auto qh = (const uint32_t *)x[4*ib+k].qh;
                 auto idxh = _mm_set_epi32(qh[1] >> 4, qh[1], qh[0] >> 4, qh[0]);
@@ -3694,10 +3692,13 @@ static void mul_mat_iq1_m_r4_q8_0(int n, const void * vx, size_t bx, const DataI
                     // 0,0, 1,1, 2,2, 3,3, 0,0, 1,1, 2,2, 3,3 as int16_t
                     auto sumi = _mm256_packs_epi32(sumi1, sumi2);
 #endif
-                    sumi = _mm256_madd_epi16(scales, sumi);
-                    acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d8[4*iy+k]), _mm256_cvtepi32_ps(sumi), acc[iy]);
+                    isum[iy] = _mm256_add_epi32(isum[iy], _mm256_madd_epi16(scales, sumi));
                 }
             }
+            for (int iy = 0; iy < nrc_y; ++iy) {
+                acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(q8.y[iy][ib].d), _mm256_cvtepi32_ps(isum[iy]), acc[iy]);
+                isum[iy] = _mm256_setzero_si256();
+            }
         }
         for (int iy = 0; iy < nrc_y; ++iy) {
             auto sumf = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1));
@@ -9177,7 +9178,7 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
 #ifdef HAVE_FANCY_SIMD
             mm.func16 = mul_mat_iq1_s_r4_q8_1<16>;
 #endif
-            expected_typeB = GGML_TYPE_Q8_1_X4;
+            expected_typeB = GGML_TYPE_Q8_K128;
             break;
         case GGML_TYPE_IQ1_M_R4:
             assert (ne00 % QK4_NL == 0);
@@ -9192,7 +9193,7 @@
 #ifdef HAVE_FANCY_SIMD
             mm.func16 = mul_mat_iq1_m_r4_q8_0<16>;
 #endif
-            expected_typeB = GGML_TYPE_Q8_0_X4;
+            expected_typeB = GGML_TYPE_Q8_K128;
             break;
 
         default:
@@ -12072,7 +12073,7 @@ static void mul_mat_iq2_xs_r4_q8_k(int n, const void * vx, size_t bx, const Data
 
 static void mul_mat_iq1_s_r4_q8_1_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
     GGML_ASSERT(nrc_x%4 == 0);
-    Q8<1, block_q8_1_x4> q8(info);
+    Q8<1, block_q8_K128> q8(info);
     int nb = n / 32;
     GGML_ASSERT(nb%4 == 0);
     int8x16_t qx[8];
@@ -12084,8 +12085,8 @@ static void mul_mat_iq1_s_r4_q8_1_1(int n, const void * vx, size_t bx, const Dat
         auto d1 = vcvt_f32_f16(vld1_f16((const float16_t *)dptr));
         auto x = (const block_iq1_s_r4 *)(dptr + 4);
         for (int ib = 0; ib < nb/4; ++ib) {
-            auto scale_yd = vcvt_f32_f16(vld1_f16((const float16_t *)q8.y[0][ib].d+0));
-            auto scale_ym = vcvt_f32_f16(vld1_f16((const float16_t *)q8.y[0][ib].d+4));
+            auto scale_yd = vdupq_n_f32(q8.y[0][ib].d);
+            auto scale_ym = vmulq_f32(scale_yd, vcvtq_f32_s32(vmovl_s16(vld1_s16(q8.y[0][ib].bsums))));
             for (int k = 0; k < 4; ++k) {
                 auto sas = vld1_u16(x[4*ib+k].qh);
                 auto scales4 = vand_u16(vshr_n_u16(sas, 12), vdup_n_u16(7));
@@ -12135,23 +12136,22 @@
 template <int nrc_y>
 static void mul_mat_iq1_s_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
     GGML_ASSERT(nrc_x%4 == 0);
-    Q8<nrc_y, block_q8_1_x4> q8(info);
+    Q8<nrc_y, block_q8_K128> q8(info);
     int nb = n / 32;
     GGML_ASSERT(nb%4 == 0);
     uint8x16_t qx[8];
     int32x4_t acc[nrc_y] = {};
     auto ms = vdup_n_u16(0x8000);
     auto mask = vdupq_n_s8(0x03);
-    float d8[8*nrc_y];
+    float d8[4*nrc_y];
     for (int ix= 0; ix < nrc_x; ix += 4) {
         auto dptr = (const ggml_half *)((const char *)vx + ix*bx);
         auto d1 = vcvt_f32_f16(vld1_f16((const float16_t *)dptr));
         auto x = (const block_iq1_s_r4 *)(dptr + 4);
         for (int ib = 0; ib < nb/4; ++ib) {
             for (int iy = 0; iy < nrc_y; ++iy) {
-                auto scales = vld1q_f16((const float16_t *)q8.y[iy][ib].d);
-                vst1q_f32(d8+8*iy+0, vcvt_f32_f16(vget_low_f16(scales)));
-                vst1q_f32(d8+8*iy+4, vcvt_f32_f16(vget_high_f16(scales)));
+                auto scales = vcvtq_f32_s32(vmovl_s16(vld1_s16(q8.y[iy][ib].bsums)));
+                vst1q_f32(d8+4*iy, vmulq_f32(vdupq_n_f32(q8.y[iy][ib].d), scales));
             }
             for (int k = 0; k < 4; ++k) {
                 auto sas = vld1_u16(x[4*ib+k].qh);
@@ -12193,8 +12193,8 @@
                     sumi = vdotq_laneq_s32(sumi, vreinterpretq_s8_u8(qx[6]), y.val[1], 2);
                     sumi = vdotq_laneq_s32(sumi, vreinterpretq_s8_u8(qx[7]), y.val[1], 3);
                     sumi = vmulq_s32(scales, sumi);
-                    acc[iy] = vfmaq_f32(acc[iy], vdupq_n_f32(d8[8*iy+k+0]), vcvtq_f32_s32(sumi));
-                    acc[iy] = vfmaq_f32(acc[iy], vdupq_n_f32(d8[8*iy+k+4]), delta4);
+                    acc[iy] = vfmaq_f32(acc[iy], vdupq_n_f32(q8.y[iy][ib].d), vcvtq_f32_s32(sumi));
+                    acc[iy] = vfmaq_f32(acc[iy], vdupq_n_f32(d8[4*iy+k]), delta4);
                 }
             }
         }
@@ -12208,25 +12208,21 @@
 template <int nrc_y>
 static void mul_mat_iq1_m_r4_q8_0(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
     GGML_ASSERT(nrc_x%4 == 0);
-    Q8<nrc_y, block_q8_0_x4> q8(info);
+    Q8<nrc_y, block_q8_K128> q8(info);
     int nb = n / 32;
     GGML_ASSERT(nb%4 == 0);
     int8x16_t qx[8];
-    int32x4_t acc[nrc_y] = {};
+    float32x4_t acc[nrc_y] = {};
+    int32x4_t isum[nrc_y] = {};
    auto shuffle0 = uint32x4_t{0x00000000, 0x01010101, 0x02020202, 0x03030303};
     auto step = vdupq_n_u8(4);
     auto ms = vdupq_n_u8(0x08);
     auto mask = vdupq_n_s8(0x18);
-    float d8[4*nrc_y];
     for (int ix= 0; ix < nrc_x; ix += 4) {
         auto dptr = (const ggml_half *)((const char *)vx + ix*bx);
         auto d1 = vmulq_f32(vdupq_n_f32(0.125f), vcvt_f32_f16(vld1_f16((const float16_t *)dptr)));
         auto x = (const block_iq1_m_r4 *)(dptr + 4);
         for (int ib = 0; ib < nb/4; ++ib) {
-            for (int iy = 0; iy < nrc_y; ++iy) {
-                auto scales = vld1_f16((const float16_t *)q8.y[iy][ib].d);
-                vst1q_f32(d8+4*iy, vcvt_f32_f16(scales));
-            }
             for (int k = 0; k < 4; ++k) {
                 auto scales4 = vdup_n_u32(((const uint32_t *)x[4*ib+k].scales)[0]);
                 scales4 = vand_u8(vshl_u32(scales4, int32x2_t{0, -4}), vdup_n_u8(0xf));
@@ -12272,10 +12268,13 @@
                     sumi2 = vdotq_laneq_s32(sumi2, vreinterpretq_s8_u8(qx[5]), y.val[1], 1);
                     sumi2 = vdotq_laneq_s32(sumi2, vreinterpretq_s8_u8(qx[6]), y.val[1], 2);
                     sumi2 = vdotq_laneq_s32(sumi2, vreinterpretq_s8_u8(qx[7]), y.val[1], 3);
-                    auto sumi = vmlaq_s32(vmlaq_s32(vdupq_n_s32(0), sumi1, scales1), sumi2, scales2);
-                    acc[iy] = vfmaq_f32(acc[iy], vdupq_n_f32(d8[4*iy+k]), vcvtq_f32_s32(sumi));
+                    isum[iy] = vmlaq_s32(vmlaq_s32(isum[iy], sumi1, scales1), sumi2, scales2);
                 }
             }
+            for (int iy = 0; iy < nrc_y; ++iy) {
+                acc[iy] = vfmaq_f32(acc[iy], vdupq_n_f32(q8.y[iy][ib].d), vcvtq_f32_s32(isum[iy]));
+                isum[iy] = vdupq_n_s32(0);
+            }
         }
         for (int iy = 0; iy < nrc_y; ++iy) {
             info.store(ix, iy, vmulq_f32(d1, acc[iy]));
@@ -13907,12 +13906,12 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& m, int /*Ny*/) {
             SET_MUL_MAT_FUNCTIONS(m, mul_mat_iq1_s_r4_q8_1);
             m.funcs[0] = mul_mat_iq1_s_r4_q8_1_1;
             m.func16 = mul_mat_iq1_s_r4_q8_1<16>;
-            expected_Btype = GGML_TYPE_Q8_1_X4;
+            expected_Btype = GGML_TYPE_Q8_K128;
             break;
         case GGML_TYPE_IQ1_M_R4:
             SET_MUL_MAT_FUNCTIONS(m, mul_mat_iq1_m_r4_q8_0);
             m.func16 = mul_mat_iq1_m_r4_q8_0<16>;
-            expected_Btype = GGML_TYPE_Q8_0_X4;
+            expected_Btype = GGML_TYPE_Q8_K128;
             break;
         case GGML_TYPE_IQ3_XXS_R4:
             SET_MUL_MAT_FUNCTIONS(m, mul_mat_iq3_xxs_r4_q8_k);
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index a01ed109..f33fc183 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -2733,6 +2733,7 @@ size_t quantize_iq6_k(const float * src, void * dst, int64_t nrows, int64_t n_pe
     return nrows * nblock * sizeof(block_iq6_k);
 }
 
+namespace {
 template <int q8_type>
 void iqk_quantize_row_q8_K_T(const float * x, void * vy, int64_t k) {
     assert(k % QK_K == 0);
@@ -2843,7 +2844,7 @@ void iqk_quantize_row_q8_K_T(const float * x, void * vy, int64_t k) {
         x += QK_K;
     }
 #endif
-
+}
 }
 
 void iqk_quantize_row_q8_K(const float * x, void * vy, int64_t k) {
@@ -2859,6 +2860,120 @@ void quantize_row_q8_KR8(const float * x, void * vy, int64_t k) {
 }
 
 namespace {
+// TODO: merge this with the above template
+void iqk_quantize_row_q8_K128(const float * x, void * vy, int64_t k) {
+    constexpr int kBlockSize = 128;
+    assert(k % kBlockSize == 0);
+    const int nb = k / kBlockSize;
+    auto y = (block_q8_K128 *)vy;
+#ifdef __AVX2__
+    const __m256 signBit = _mm256_set1_ps(-0.0f);
+    const __m256i perm = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
+    for (int i = 0; i < nb; i++) {
+        const float * xb = x + i*kBlockSize;
+        __m256 maxAbs = _mm256_setzero_ps();
+        const float * xx = xb;
+        for (int ib = 0; ib < kBlockSize/8; ++ib) {
+            const __m256 v = _mm256_loadu_ps(xx); xx += 8;
+            maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps(signBit, v));
+        }
+        const float maxScalar = hmax_f32_8(maxAbs);
+        const float d = maxScalar / 127.f;
+        y[i].d = d;
+        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
+        const __m256 mul = _mm256_set1_ps( id );
+        xx = xb;
+        int8_t * q8 = y[i].qs;
+        for (int ib = 0; ib < kBlockSize/32; ++ib) {
+            __m256 v0 = _mm256_mul_ps(mul, _mm256_loadu_ps(xx)); xx += 8;
+            __m256 v1 = _mm256_mul_ps(mul, _mm256_loadu_ps(xx)); xx += 8;
+            __m256 v2 = _mm256_mul_ps(mul, _mm256_loadu_ps(xx)); xx += 8;
+            __m256 v3 = _mm256_mul_ps(mul, _mm256_loadu_ps(xx)); xx += 8;
+            v0 = _mm256_round_ps(v0, _MM_ROUND_NEAREST);
+            v1 = _mm256_round_ps(v1, _MM_ROUND_NEAREST);
+            v2 = _mm256_round_ps(v2, _MM_ROUND_NEAREST);
+            v3 = _mm256_round_ps(v3, _MM_ROUND_NEAREST);
+            __m256i i0 = _mm256_cvtps_epi32(v0);
+            __m256i i1 = _mm256_cvtps_epi32(v1);
+            __m256i i2 = _mm256_cvtps_epi32(v2);
+            __m256i i3 = _mm256_cvtps_epi32(v3);
+            y[i].bsums[ib] = hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
+            i0 = _mm256_packs_epi32( i0, i1 );
+            i2 = _mm256_packs_epi32( i2, i3 );
+            i0 = _mm256_packs_epi16( i0, i2 );
+            i0 = _mm256_permutevar8x32_epi32( i0, perm );
+            _mm256_storeu_si256((__m256i *)q8, i0);
+            q8 += 32;
+        }
+    }
+#elif defined __ARM_NEON
+    int32x4_t ival[8];
+    for (int i = 0; i < nb; i++) {
+        const float * xb = x + i*kBlockSize;
+        auto vmax = vdupq_n_f32(0.f);
+        for (int j = 0; j < kBlockSize; j += 4) {
+            vmax = vmaxq_f32(vmax, vabsq_f32(vld1q_f32(xb + j)));
+        }
+        auto smax = vmaxvq_f32(vmax);
+        if (!smax) {
+            std::memset(&y[i], 0, sizeof(y[i]));
+            continue;
+        }
+        y[i].d = smax/127;
+        auto vid = vdupq_n_f32(127/smax);
+        for (int ib = 0; ib < kBlockSize/32; ++ib) {
+            auto isum = vdupq_n_s32(0);
+            for (int k = 0; k < 8; ++k) {
+                auto val = vld1q_f32(xb + 32*ib + 4*k);
+                ival[k] = vcvtnq_s32_f32(vmulq_f32(val, vid));
+                isum = vaddq_s32(isum, ival[k]);
+            }
+            y[i].bsums[ib] = vaddvq_s32(isum);
+            for (int k = 0; k < 4; ++k) {
+                auto i16 = vcombine_s16(vmovn_s32(ival[2*k+0]), vmovn_s32(ival[2*k+1]));
+                vst1_s8(y[i].qs + 32*ib + 8*k, vmovn_s16(i16));
+            }
+        }
+    }
+#else
+    for (int i = 0; i < nb; i++) {
+
+        float amax = 0;
+        for (int j = 0; j < kBlockSize; ++j) {
+            float ax = std::abs(x[j]);
+            amax = std::max(amax, ax);
+        }
+        if (!amax) {
+            y[i].d = 0;
+            memset(y[i].qs, 0, kBlockSize);
+            memset(y[i].bsums, 0, kBlockSize/32*(sizeof(int16_t)));
+            x += kBlockSize;
+            continue;
+        }
+        const float iscale = 127.f/amax;
+        for (int j = 0; j < kBlockSize; ++j) {
+            int v = nearest_int(iscale*x[j]);
+            y[i].qs[j] = v;
+        }
+        for (int j = 0; j < kBlockSize/32; ++j) {
+            int sum = 0;
+            for (int ii = 0; ii < 32; ++ii) {
+                sum += y[i].qs[j*32 + ii];
+            }
+            y[i].bsums[j] = sum;
+        }
+        y[i].d = 1/iscale;
+        x += kBlockSize;
+    }
+#endif
+}
+}
+
+void quantize_row_q8_K128(const float * x, void * vy, int64_t k) {
+    iqk_quantize_row_q8_K128(x, vy, k);
+}
+
+namespace {
 static void quantize_row_iq4_k_impl_bs128(const int super_block_size, const int block_size,
         int n_per_row, const float * x, char * cy,
         float * all_scales, float * weight,
diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h
index ff553ae7..97719361 100644
--- a/ggml/src/iqk/iqk_quantize.h
+++ b/ggml/src/iqk/iqk_quantize.h
@@ -220,6 +220,7 @@ void vec_dot_q8_k_r8_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const voi
 void iqk_quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void quantize_row_q8_K64_ref(const float * GGML_RESTRICT x, block_q8_K64 * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_K64(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_K128(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_K16(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_K32(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_KR8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
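
For readers who do not want to decode the intrinsics, here is a scalar sketch of the per-sub-block arithmetic the updated kernels perform with a block_q8_K128 activation block. It is illustrative only: the function and parameter names are not identifiers from the repository, and the exact per-sub-block shift (`delta`, built from the IQ1 scale and sign and the 0.0625 factor visible in the AVX2 path) is left abstract.

#include <stdint.h>

// Hypothetical scalar model of one 32-value sub-block of the iq1_s_r4 /
// iq1_m_r4 dot product after this commit.
static float q8_k128_subblock_dot(float d_y,            // block_q8_K128::d
                                  const int8_t *q_y,    // qs + 32*k
                                  int16_t bsum,         // bsums[k]
                                  const int8_t *grid,   // decoded IQ1 grid values
                                  int scale,            // per-sub-block IQ1 scale
                                  float delta) {        // per-sub-block constant shift
    int32_t sumi = 0;
    for (int i = 0; i < 32; ++i) sumi += (int32_t)q_y[i] * grid[i];
    // Mirrors the two fused multiply-adds in the kernels:
    //   acc += d_y * (scale * sumi)   and   acc += (d_y * bsum) * delta.
    // The per-row weight scale (d1 in the kernels) multiplies the total later.
    return d_y * (float)(scale * sumi) + d_y * (float)bsum * delta;
}

The precomputed bsums make the second term a single multiply-add per sub-block, which is what allowed dropping the per-32 scale loads and, in the IQ1_M_R4 kernels, deferring the int-to-float conversion until a whole 128-value block has been accumulated in isum.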