diff options
-rw-r--r-- | ggml/src/iqk/iqk_mul_mat.cpp | 19 |
1 files changed, 1 insertions, 18 deletions
diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp index a70310d4..21fe99e1 100644 --- a/ggml/src/iqk/iqk_mul_mat.cpp +++ b/ggml/src/iqk/iqk_mul_mat.cpp @@ -2895,20 +2895,6 @@ template <typename Q8, typename Q8x4, typename Dot, bool can_pack = true> struct } } }; -// If I use this, it negatively impacts q4_1/q5_1 performance. -//template <typename Q8, typename Q8x4, typename Dot> struct Sum4 { -// Dot dot; -// inline __m256i compute(const __m256i * qx, const Q8 * y) const { -// const Q8x4 * y4 = (const Q8x4 *)y; -// const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y4->qs+0)); // 8x block 0 -// const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y4->qs+1)); // 8x block 1 -// const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y4->qs+2)); // 8x block 2 -// const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y4->qs+3)); // 8x block 3 -// auto p01 = _mm256_add_epi32(_mm256_unpacklo_epi32(p0, p1), _mm256_unpackhi_epi32(p0, p1)); // 0,1, 0,1, 0,1, 0,1 -// auto p23 = _mm256_add_epi32(_mm256_unpacklo_epi32(p2, p3), _mm256_unpackhi_epi32(p2, p3)); // 2,3, 2,3, 2,3, 2,3 -// return _mm256_add_epi32(_mm256_unpacklo_epi64(p01, p23), _mm256_unpackhi_epi64(p01, p23)); // 0,1,2,3, 0,1,2,3 -// } -//}; struct ScaleHelperQ8_0 { inline __m128 prepare4(const block_q8_0 * y) { @@ -6849,7 +6835,6 @@ void quantize_row_q8_1(const float * x, block_q8_1 * y, int k) { template <int D, int step> struct HelperQ80 final : public BaseHelper<step> { - static_assert(step == QK8_0); using Base = BaseHelper<step>; using block_q8 = block_q8_0; HelperQ80(const char * data, int stride) : Base(data, stride) {} @@ -6898,7 +6883,6 @@ struct HelperQ80 final : public BaseHelper<step> { template <int D, int step> struct HelperQ40 final : public BaseHelper<step> { - static_assert(step == QK4_0); using Base = BaseHelper<step>; using block_q8 = block_q8_0; HelperQ40(const char * data, int stride) : Base(data, stride) {} @@ -6942,7 +6926,6 @@ struct HelperQ40 final : public BaseHelper<step> { template <int D, int step> struct HelperQ41 final : public BaseHelper<step> { - static_assert(step == QK4_1); using Base = BaseHelper<step>; using block_q8 = block_q8_1; HelperQ41(const char * data, int stride) : Base(data, stride) {} @@ -7268,7 +7251,7 @@ struct FlashQKV { F16::Data v1, v2; for (int l1 = 0; l1 < k_step; ++l1) { vh.load(l1, i, v1, v2); - for (int j = 0; j < q_step; ++j) { + for (int j = 0; j < nq1; ++j) { auto vs = F16::set1(fms.cache[k_step*j + l1]); vk[2*j+0] = F16::fmadd(vk[2*j+0], v1, vs); vk[2*j+1] = F16::fmadd(vk[2*j+1], v2, vs); |