author     Kawrakow <iwankawrakow@gmail.com>   2024-10-01 10:56:50 +0300
committer  GitHub <noreply@github.com>         2024-10-01 10:56:50 +0300
commit     c2ff4f936a3060cb1ef6adc6e7c2664324c89d84 (patch)
tree       621e9012f130f4a6d852f464b49bca9151ef372b
parent     8cba4789da860d32cfc6d14f96ed37ade9e334bd (diff)
iqk_mul_mat: better iq4_nl implementation on Zen4/AVX2 (#72)
* iqk_mul_mat: better iq4_nl implementation on Zen4/AVX2

  PP-512 performance for LLaMA-3.1-8B goes to 162.6 t/s up from 133.2 t/s.

* Fix AVX2

  In addition to fixing iq4_nl, it seems I never adjusted the AVX2 implementation of iq2_tn to the block scale removal. This commit also fixes that.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
-rw-r--r--  ggml/src/ggml.c                4
-rw-r--r--  ggml/src/iqk/iqk_mul_mat.cpp  48
2 files changed, 22 insertions, 30 deletions
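The patch factors the duplicated iq4_nl value-table loaders into a single load_iq4nl_values_256() shared by the AVX2 and Zen4 paths, and moves the IQ4_NL unpacker from the Q8_0 to the Q8_1 path. The table in the diff stores the signed iq4_nl codebook shifted by +128 into unsigned bytes; pairing that with Q8_1 activations (see ScaleHelperQ_0_1<128> and Sum4TypeQ81 below) presumably lets the dot product use unsigned-by-signed multiply-adds while the +128 offset is compensated through the Q8_1 block sums. The following is a minimal standalone sketch of the lookup step on AVX2, using the kvalues_iq4nl table from the diff; the names iq4nl_table_256 and iq4nl_dequant32 are illustrative only and are not identifiers from the repository.

#include <immintrin.h>
#include <cstdint>

// Sketch only: the 16 byte values match kvalues_iq4nl from the diff.
static inline __m256i iq4nl_table_256() {
    static const uint8_t kvalues_iq4nl[16] = {
        1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241
    };
    const __m128i v = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
    // Broadcast the 16-entry table into both 128-bit lanes so vpshufb can
    // index it independently in each lane.
    return _mm256_set_m128i(v, v);
}

// Expand 16 bytes holding 32 packed 4-bit indices into 32 dequantized bytes.
static inline __m256i iq4nl_dequant32(const uint8_t * qs) {
    const __m128i packed = _mm_loadu_si128((const __m128i *)qs);
    // Low nibbles land in the low 128-bit lane, high nibbles in the high lane.
    const __m256i idx = _mm256_and_si256(
        _mm256_set_m128i(_mm_srli_epi16(packed, 4), packed),
        _mm256_set1_epi8(0x0f));
    // Each 4-bit index selects one of the 16 non-linear codebook values.
    return _mm256_shuffle_epi8(iq4nl_table_256(), idx);
}

The remaining work in the real kernel is the per-block scale (and, on the Q8_1 path, the offset correction via the block sums), which the diff below wires up through the unpacker and the Q8_1 matrix-multiplication templates.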
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 184a31a8..ee83fc43 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1049,7 +1049,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float = quantize_row_iq4_nl,
.from_float_ref = (ggml_from_float_t)quantize_row_iq4_nl_ref,
.vec_dot = ggml_vec_dot_iq4_nl_q8_0,
+#if GGML_USE_IQK_MULMAT && defined __AVX2__
+ .vec_dot_type = GGML_TYPE_Q8_1,
+#else
.vec_dot_type = GGML_TYPE_Q8_0,
+#endif
.nrows = 1,
.row_meta_size = 0,
},
diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp
index 568e577c..1183246b 100644
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -542,6 +542,12 @@ struct SimpleBits {
__m256i values[4];
};
+__m256i inline load_iq4nl_values_256() {
+ static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241};
+ auto val128 = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
+ return MM256_SET_M128I(val128, val128);
+}
+
#ifdef HAVE_FANCY_SIMD
//====================================== Zen4 ==================================================
@@ -609,10 +615,8 @@ struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
Scales8K s8k;
};
-__m512i load_iq4nl_values_512() {
- static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241};
- auto val128 = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
- auto val256 = MM256_SET_M128I(val128, val128);
+__m512i inline load_iq4nl_values_512() {
+ auto val256 = load_iq4nl_values_256();
return _mm512_inserti32x8(_mm512_castsi256_si512(val256), val256, 1);
}
@@ -1418,14 +1422,8 @@ struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
Scales8K s8k;
};
-__m256i load_iq4nl_values() {
- static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241};
- auto val128 = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
- return MM256_SET_M128I(val128, val128);
-}
-
struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
- DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_iq4nl_values()) {}
+ DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_iq4nl_values_256()) {}
template <typename Q8>
inline __m256i new_block(int i, const Q8& q8, __m256 * accd) {
d = GGML_FP16_TO_FP32(x[i].d);
@@ -1563,7 +1561,7 @@ struct DequantizerIQ3K final : public BaseDequantizer<block_iq3_k> {
};
struct DequantizerIQ4K final : public BaseDequantizer<block_iq4_k> {
- DequantizerIQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx), iqxk(4, -128), values(load_iq4nl_values()) {}
+ DequantizerIQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx), iqxk(4, -128), values(load_iq4nl_values_256()) {}
template <typename Q8>
inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) {
d = GGML_FP16_TO_FP32(x[i].d);
@@ -1780,12 +1778,9 @@ struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
const __m256i mh = _mm256_set1_epi8(0x30);
};
-struct DequantizerIQ2TN final : public BaseDequantizer<block_iq2_tn> {
+struct DequantizerIQ2TN final : public BaseDequantizer<block_iq2_tn, true> {
DequantizerIQ2TN(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
- inline void new_block(int i) {
- d = GGML_FP16_TO_FP32(x[i].d);
- }
inline void prepare(int i, int j) {
bits.prepare(x[i].qs, j);
}
@@ -1812,8 +1807,6 @@ IQK_NOINLINE void mul_mat_iq2tn_q8_K(int n, const void * vx, size_t bx, const Da
for (int i = 0; i < nb; ++i) {
- deq1.new_block(i);
-
if constexpr (nrc_y == 1) {
deq1.prepare(i, 0);
auto sumi1 = _mm256_add_epi16(_mm256_maddubs_epi16(deq1.bits.values[0], q8.load_quants(0, i, 0)),
@@ -3181,15 +3174,10 @@ struct Q4_0_1_Dequantizer {
struct IQ4_NL_Dequantizer {
Dequantizer4bit b4;
- const __m256i values = load_values();
+ const __m256i values = load_iq4nl_values_256();
inline __m256i dequant(const block_iq4_nl * x) const {
return _mm256_shuffle_epi8(values, b4.dequant(x->qs));
}
- static __m256i load_values() {
- static const int8_t iq4nl_values[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
- auto aux = _mm_loadu_si128((const __m128i *)iq4nl_values);
- return MM256_SET_M128I(aux, aux);
- }
};
struct Q4_1_Dequantizer {
@@ -3315,9 +3303,9 @@ struct Q4_0_1_Unpacker final : public Q_Unpacker<block_q4_0, ScaleHelperQ_0_1<8>
using Sum4T = Sum4TypeQ81;
inline static int block_size() { return QK4_0; }
};
-struct IQ4_NL_Unpacker final : public Q_Unpacker<block_iq4_nl, ScaleHelperQ_0, IQ4_NL_Dequantizer> {
+struct IQ4_NL_Unpacker final : public Q_Unpacker<block_iq4_nl, ScaleHelperQ_0_1<128>, IQ4_NL_Dequantizer> {
IQ4_NL_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
- using Sum4T = Sum4TypeQ80;
+ using Sum4T = Sum4TypeQ81;
inline static int block_size() { return QK4_NL; }
};
struct Q5_0_Unpacker final : public Q_Unpacker<block_q5_0, ScaleHelperQ_0, Q5_0_Dequantizer> {
@@ -3341,7 +3329,7 @@ struct Q5_1_Unpacker final : public Q_Unpacker<block_q5_1, ScaleHelperQ_1, Q5_1_
inline static int block_size() { return QK4_1; }
};
-// float matrices - we handle f16 and f32, but only to f32 result
+// float matrices - we handle f16, bf16 (if native bf16 support is available) and f32, but only to f32 result
struct QFBase {
#ifdef __AVX512F__
@@ -3624,7 +3612,7 @@ void mul_mat_q80_q80_T(int n, const void * vx, size_t bx, const DataInfo& info,
template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
if constexpr (std::is_same_v<Dequantizer, Q4_0_Unpacker> || std::is_same_v<Dequantizer, Q5_0_Unpacker> ||
- std::is_same_v<Dequantizer, Q8_0_Unpacker> || std::is_same_v<Dequantizer, IQ4_NL_Unpacker>) {
+ std::is_same_v<Dequantizer, Q8_0_Unpacker>) {
m.funcs[0] = mul_mat_qX_0_q8_0_T<Dequantizer, 1>;
m.funcs[1] = mul_mat_qX_0_q8_0_T<Dequantizer, 2>;
m.funcs[2] = mul_mat_qX_0_q8_0_T<Dequantizer, 3>;
@@ -3636,7 +3624,7 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
}
else if constexpr (std::is_same_v<Dequantizer, Q4_1_Unpacker> || std::is_same_v<Dequantizer, Q5_1_Unpacker> ||
std::is_same_v<Dequantizer, Q8_0_1_Unpacker> || std::is_same_v<Dequantizer, Q4_0_1_Unpacker> ||
- std::is_same_v<Dequantizer, Q5_0_1_Unpacker>) {
+ std::is_same_v<Dequantizer, Q5_0_1_Unpacker> || std::is_same_v<Dequantizer, IQ4_NL_Unpacker>) {
m.funcs[0] = mul_mat_qX_1_q8_1_T<Dequantizer, 1>;
m.funcs[1] = mul_mat_qX_1_q8_1_T<Dequantizer, 2>;
m.funcs[2] = mul_mat_qX_1_q8_1_T<Dequantizer, 3>;
@@ -3933,7 +3921,7 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
case GGML_TYPE_IQ4_NL:
assert (ne00 % QK4_NL == 0);
MulMat::set_functions<IQ4_NL_Unpacker>(mm);
- expected_typeB = GGML_TYPE_Q8_0;
+ expected_typeB = GGML_TYPE_Q8_1;
break;
default: