Diffstat (limited to 'ggml')
-rw-r--r--  ggml/src/ggml.c                48
-rw-r--r--  ggml/src/iqk/iqk_mul_mat.cpp   71
-rw-r--r--  ggml/src/iqk/iqk_quantize.cpp  69
-rw-r--r--  ggml/src/iqk/iqk_quantize.h     3
4 files changed, 138 insertions, 53 deletions
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index faf1902d..a2bdc156 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1756,6 +1756,15 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
return type_traits[type];
}
+static inline int ggml_packed_rows(enum ggml_type type) {
+ return type == GGML_TYPE_BF16_R16 ? 16
+ : type == GGML_TYPE_Q8_K_R8 || type == GGML_TYPE_Q8_KV_R8 ||
+ type == GGML_TYPE_Q8_0_R8 || type == GGML_TYPE_Q4_0_R8 ||
+ type == GGML_TYPE_IQ4_XS_R8 ? 8
+ : type >= GGML_TYPE_Q4_0_R8 && type <= GGML_TYPE_Q8_K_R8 ? 4
+ : 1;
+}
+
//
// simd mappings
//
@@ -10119,9 +10128,11 @@ static void ggml_compute_forward_dup_f32(
}
// parallelize by rows
+ int n_packed = ggml_packed_rows(dst->type);
+ GGML_ASSERT(dst->ne[1] % n_packed == 0);
const int nr = ne01;
// number of rows per thread
- const int dr = (nr + nth - 1) / nth;
+ const int dr = n_packed*((nr/n_packed + nth - 1) / nth);
// row range for this thread
const int ir0 = dr * ith;
const int ir1 = MIN(ir0 + dr, nr);
@@ -10173,10 +10184,10 @@ static void ggml_compute_forward_dup_f32(
for (int i03 = 0; i03 < ne03; i03++) {
for (int i02 = 0; i02 < ne02; i02++) {
id += rs * ir0;
- for (int i01 = ir0; i01 < ir1; i01++) {
+ for (int i01 = ir0; i01 < ir1; i01 += n_packed) {
const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
- quantize_row_q(src0_ptr, dst_ptr + id, ne00);
- id += rs;
+ quantize_row_q(src0_ptr, dst_ptr + id, ne00*n_packed);
+ id += rs*n_packed;
}
id += rs * (ne01 - ir1);
}
@@ -10441,10 +10452,15 @@ static void ggml_compute_forward_dup_bytes(
// parallelize by rows
const int nr = ne01;
+ const int n_packed = ggml_packed_rows(dst->type);
+ GGML_ASSERT(nr%n_packed == 0);
+ const int nrp = nr/n_packed;
// number of rows per thread
- const int dr = (nr + nth - 1) / nth;
+ const int drp = (nrp + nth - 1) / nth;
+ const int dr = drp*n_packed;
// row range for this thread
const int ir0 = dr * ith;
+ if (ir0 >= nr) return;
const int ir1 = MIN(ir0 + dr, nr);
if (src0->type == dst->type &&
@@ -10569,10 +10585,19 @@ static void ggml_compute_forward_dup_q(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
+ GGML_ASSERT(ggml_is_quantized(dst->src[0]->type));
+
int64_t nrows = ggml_nrows(dst);
int ith = params->ith;
int nth = params->nth;
+ if (dst->src[0]->type == dst->type &&
+ dst->src[0]->nb[0] == ggml_type_size(dst->type) &&
+ dst->nb[0] == ggml_type_size(dst->type)) {
+ ggml_compute_forward_dup_bytes(params, dst);
+ return;
+ }
+
if (dst->type == GGML_TYPE_Q8_0 && dst->src[0]->type == GGML_TYPE_Q8_0 &&
ggml_are_same_shape(dst, dst->src[0])) {
@@ -10626,6 +10651,10 @@ static void ggml_compute_forward_dup_q(
return;
}
+ if (dst->type != GGML_TYPE_F32) {
+ printf("%s: %s -> %s is of type %s\n", __func__, dst->src[0]->name, dst->name, ggml_type_name(dst->type));
+ GGML_ABORT("fatal error");
+ }
GGML_ASSERT(dst->type == GGML_TYPE_F32);
struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(src0->ne[0] == dst->ne[0] && src0->nb[0] == ggml_type_size(src0->type));
@@ -10633,12 +10662,15 @@ static void ggml_compute_forward_dup_q(
ggml_to_float_t to_float = type_traits[src0->type].to_float;
GGML_ASSERT(to_float != NULL);
- int64_t n_per_thread = (nrows + nth - 1)/nth;
+ int n_packed = ggml_packed_rows(src0->type);
+ GGML_ASSERT(src0->ne[1] % n_packed == 0);
+
+ int64_t n_per_thread = n_packed*((nrows/n_packed + nth - 1)/nth);
int64_t first_row = ith*n_per_thread;
if (first_row >= nrows) return;
int64_t last_row = MIN(first_row + n_per_thread, nrows);
- for (int64_t ir = first_row; ir < last_row; ++ir) {
+ for (int64_t ir = first_row; ir < last_row; ir += n_packed) {
int64_t i03 = ir/(src0->ne[1]*src0->ne[2]);
int64_t i02 = (ir - i03*src0->ne[1]*src0->ne[2])/src0->ne[1];
int64_t i01 = ir - i03*src0->ne[1]*src0->ne[2] - i02*src0->ne[1];
@@ -10649,7 +10681,7 @@ static void ggml_compute_forward_dup_q(
const char * q = (const char *)src0->data + i03*src0->nb[3] + i02*src0->nb[2] + i01*src0->nb[1];
char * f = ( char *)dst->data + i3* dst->nb[3] + i2* dst->nb[2] + i1* dst->nb[1];
- to_float((const void *)q, (float *)f, src0->ne[0]);
+ to_float((const void *)q, (float *)f, src0->ne[0]*n_packed);
}
}
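
The packed-row-aware work split added above is the same in all three dup paths: a repacked "row" interleaves n_packed logical rows in memory, so each thread's range has to be a multiple of n_packed. A standalone sketch of how the per-thread ranges come out, using made-up values for nr, n_packed and nth:

#include <algorithm>
#include <cstdio>

int main() {
    const int nr       = 100; // logical rows (ne01); the patch asserts nr % n_packed == 0
    const int n_packed = 4;   // 4 for the *_R4 types, 8 for *_R8, 16 for BF16_R16
    const int nth      = 6;   // number of threads
    for (int ith = 0; ith < nth; ++ith) {
        const int dr  = n_packed * ((nr / n_packed + nth - 1) / nth); // rows per thread, rounded up to n_packed
        const int ir0 = dr * ith;
        if (ir0 >= nr) { printf("thread %d: idle\n", ith); continue; } // same early-out as dup_bytes/dup_q
        const int ir1 = std::min(ir0 + dr, nr);
        printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
    }
    return 0;
}
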
diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp
index 14cc64db..8b6d6b1c 100644
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -4434,14 +4434,17 @@ inline __m256i qx_r8_q8_dot_product(const __m256i * qx, const int8_t * y) {
return sumi;
}
inline __m256i q8_0_r8_dot_product(const uint8_t * x, const int8_t * y, __m256i * qx) {
- qx[0] = _mm256_loadu_si256((const __m256i *)x+0);
- qx[1] = _mm256_loadu_si256((const __m256i *)x+1);
- qx[2] = _mm256_loadu_si256((const __m256i *)x+2);
- qx[3] = _mm256_loadu_si256((const __m256i *)x+3);
- qx[4] = _mm256_loadu_si256((const __m256i *)x+4);
- qx[5] = _mm256_loadu_si256((const __m256i *)x+5);
- qx[6] = _mm256_loadu_si256((const __m256i *)x+6);
- qx[7] = _mm256_loadu_si256((const __m256i *)x+7);
+ for (int i = 0; i < 8; ++i) {
+ qx[i] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)x+i), _mm256_set1_epi8(127));
+ }
+ //qx[0] = _mm256_loadu_si256((const __m256i *)x+0);
+ //qx[1] = _mm256_loadu_si256((const __m256i *)x+1);
+ //qx[2] = _mm256_loadu_si256((const __m256i *)x+2);
+ //qx[3] = _mm256_loadu_si256((const __m256i *)x+3);
+ //qx[4] = _mm256_loadu_si256((const __m256i *)x+4);
+ //qx[5] = _mm256_loadu_si256((const __m256i *)x+5);
+ //qx[6] = _mm256_loadu_si256((const __m256i *)x+6);
+ //qx[7] = _mm256_loadu_si256((const __m256i *)x+7);
return qx_r8_q8_dot_product(qx, y);
}
template <int nrc_y>
@@ -4496,6 +4499,7 @@ static void mul_mat_q8_0_r8_q8_1(int n, const void * vx, size_t bx, const DataIn
for (int j = 0; j < 8; ++j) {
qx[j] = _mm512_inserti32x8(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)q8l[4*ib4+k].qs+j)),
_mm256_loadu_si256((const __m256i *)q8h[4*ib4+k].qs+j), 1);
+ qx[j] = _mm512_add_epi8(qx[j], _mm512_set1_epi8(127));
}
for (int iy = 0; iy < nrc_y; ++iy) {
auto sumi = qx_r8_q8_dot_product(qx, q8.y[iy][ib4].qs+32*k);
@@ -4512,6 +4516,7 @@ static void mul_mat_q8_0_r8_q8_1(int n, const void * vx, size_t bx, const DataIn
for (int j = 0; j < 8; ++j) {
qx[j] = _mm512_inserti32x8(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)q8l[ib].qs+j)),
_mm256_loadu_si256((const __m256i *)q8h[ib].qs+j), 1);
+ qx[j] = _mm512_add_epi8(qx[j], _mm512_set1_epi8(127));
}
for (int iy = 0; iy < nrc_y; ++iy) {
auto qy = (const block_q8_1 *)q8.y[iy];
@@ -6347,6 +6352,11 @@ static void mul_mat_q8_k_r8_q8_k(int n, const void * vx, size_t bx, const DataIn
auto s1 = _mm256_sign_epi8(qx[1], qx[1]);
auto s2 = _mm256_sign_epi8(qx[2], qx[2]);
auto s3 = _mm256_sign_epi8(qx[3], qx[3]);
+#else
+ qx[0] = _mm256_add_epi8(qx[0], _mm256_set1_epi8(127));
+ qx[1] = _mm256_add_epi8(qx[1], _mm256_set1_epi8(127));
+ qx[2] = _mm256_add_epi8(qx[2], _mm256_set1_epi8(127));
+ qx[3] = _mm256_add_epi8(qx[3], _mm256_set1_epi8(127));
#endif
for (int iy = 0; iy < nrc_y; ++iy) {
auto y128 = _mm_loadu_si128((const __m128i*)q8.y[iy][ibl].qs+ib);
@@ -6425,6 +6435,11 @@ static void mul_mat_q8_KV_r8_q8_KV(int n, const void * vx, size_t bx, const Data
auto s1 = _mm256_sign_epi8(qx[1], qx[1]);
auto s2 = _mm256_sign_epi8(qx[2], qx[2]);
auto s3 = _mm256_sign_epi8(qx[3], qx[3]);
+#else
+ qx[0] = _mm256_add_epi8(qx[0], _mm256_set1_epi8(127));
+ qx[1] = _mm256_add_epi8(qx[1], _mm256_set1_epi8(127));
+ qx[2] = _mm256_add_epi8(qx[2], _mm256_set1_epi8(127));
+ qx[3] = _mm256_add_epi8(qx[3], _mm256_set1_epi8(127));
#endif
for (int iy = 0; iy < nrc_y; ++iy) {
auto y128 = _mm_loadu_si128((const __m128i*)q8y[iy]+ib);
@@ -14305,8 +14320,8 @@ struct Q4_0_R8_Dequantizer {
float32x4x2_t scales = { vcvt_f32_f16(vget_low_f16(scales16)), vcvt_f32_f16(vget_high_f16(scales16)) };
for (int j = 0; j < 4; ++j) {
auto bits = vld1q_u8_x2(iq4[4*ib4+k].qs + 32*j);
- //bits.val[0] = veorq_u8(m88, bits.val[0]);
- //bits.val[1] = veorq_u8(m88, bits.val[1]);
+ bits.val[0] = veorq_u8(m88, bits.val[0]);
+ bits.val[1] = veorq_u8(m88, bits.val[1]);
qx[2*j+0] = vshlq_n_u8(bits.val[0], 4);
qx[2*j+1] = vandq_u8(bits.val[0], m4);
qx[2*j+8] = vshlq_n_u8(bits.val[1], 4);
@@ -15305,12 +15320,12 @@ struct HelperQ80R8 : public BaseHelper<step> {
m1 = _mm256_unpackhi_epi64(t0, t1);
m2 = _mm256_unpacklo_epi64(t2, t3);
m3 = _mm256_unpackhi_epi64(t2, t3);
-#ifdef HAVE_FANCY_SIMD
- m0 = _mm256_add_epi8(m0, _mm256_set1_epi8(127));
- m1 = _mm256_add_epi8(m1, _mm256_set1_epi8(127));
- m2 = _mm256_add_epi8(m2, _mm256_set1_epi8(127));
- m3 = _mm256_add_epi8(m3, _mm256_set1_epi8(127));
-#endif
+//#ifdef HAVE_FANCY_SIMD
+// m0 = _mm256_add_epi8(m0, _mm256_set1_epi8(127));
+// m1 = _mm256_add_epi8(m1, _mm256_set1_epi8(127));
+// m2 = _mm256_add_epi8(m2, _mm256_set1_epi8(127));
+// m3 = _mm256_add_epi8(m3, _mm256_set1_epi8(127));
+//#endif
_mm256_storeu_si256((__m256i *)y[ib].qs + 0, m0);
_mm256_storeu_si256((__m256i *)y[ib].qs + 1, m1);
_mm256_storeu_si256((__m256i *)y[ib].qs + 2, m2);
@@ -15327,12 +15342,12 @@ struct HelperQ80R8 : public BaseHelper<step> {
m1 = _mm256_unpackhi_epi64(t0, t1);
m2 = _mm256_unpacklo_epi64(t2, t3);
m3 = _mm256_unpackhi_epi64(t2, t3);
-#ifdef HAVE_FANCY_SIMD
- m0 = _mm256_add_epi8(m0, _mm256_set1_epi8(127));
- m1 = _mm256_add_epi8(m1, _mm256_set1_epi8(127));
- m2 = _mm256_add_epi8(m2, _mm256_set1_epi8(127));
- m3 = _mm256_add_epi8(m3, _mm256_set1_epi8(127));
-#endif
+//#ifdef HAVE_FANCY_SIMD
+// m0 = _mm256_add_epi8(m0, _mm256_set1_epi8(127));
+// m1 = _mm256_add_epi8(m1, _mm256_set1_epi8(127));
+// m2 = _mm256_add_epi8(m2, _mm256_set1_epi8(127));
+// m3 = _mm256_add_epi8(m3, _mm256_set1_epi8(127));
+//#endif
_mm256_storeu_si256((__m256i *)y[ib].qs + 4, m0);
_mm256_storeu_si256((__m256i *)y[ib].qs + 5, m1);
_mm256_storeu_si256((__m256i *)y[ib].qs + 6, m2);
@@ -15424,12 +15439,12 @@ struct HelperQ8KVR8 : public BaseHelper<step> {
m1 = _mm256_unpackhi_epi64(t0, t1);
m2 = _mm256_unpacklo_epi64(t2, t3);
m3 = _mm256_unpackhi_epi64(t2, t3);
-#ifdef HAVE_FANCY_SIMD
- m0 = _mm256_add_epi8(m0, _mm256_set1_epi8(127));
- m1 = _mm256_add_epi8(m1, _mm256_set1_epi8(127));
- m2 = _mm256_add_epi8(m2, _mm256_set1_epi8(127));
- m3 = _mm256_add_epi8(m3, _mm256_set1_epi8(127));
-#endif
+//#ifdef HAVE_FANCY_SIMD
+// m0 = _mm256_add_epi8(m0, _mm256_set1_epi8(127));
+// m1 = _mm256_add_epi8(m1, _mm256_set1_epi8(127));
+// m2 = _mm256_add_epi8(m2, _mm256_set1_epi8(127));
+// m3 = _mm256_add_epi8(m3, _mm256_set1_epi8(127));
+//#endif
_mm256_storeu_si256((__m256i *)y[ix].qs + 4*ib+0, m0);
_mm256_storeu_si256((__m256i *)y[ix].qs + 4*ib+1, m1);
_mm256_storeu_si256((__m256i *)y[ix].qs + 4*ib+2, m2);
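
The +127 additions above feed the signed int8 x values to the unsigned-times-signed dot-product instructions (_mm256_maddubs_epi16, and _mm512_dpbusd_epi32 when HAVE_FANCY_SIMD is defined), which need the left operand in [0, 255]; the patch moves that bias from repack/store time (the now commented-out blocks in HelperQ80R8/HelperQ8KVR8) into the dot-product kernels themselves. A minimal scalar sketch of the identity being exploited, assuming the quants stay within [-127, 127] and that the kernels undo the bias elsewhere via the per-block sums of y (not shown in these hunks):

#include <cstdint>
#include <cstdio>

// sum((x + 127) * y) = sum(x * y) + 127 * sum(y), so the true dot product is
// recovered by subtracting 127 * sum(y) from the biased accumulator.
static int32_t dot_with_bias(const int8_t * x, const int8_t * y, int n) {
    int32_t acc = 0, ysum = 0;
    for (int i = 0; i < n; ++i) {
        acc  += (int32_t(x[i]) + 127) * int32_t(y[i]); // what the SIMD kernel accumulates
        ysum += y[i];                                  // available per block as bsums in the q8 y data
    }
    return acc - 127 * ysum;
}

int main() {
    const int8_t x[4] = {-5,  3, 100, -7};
    const int8_t y[4] = { 7, -2,   1,  4};
    int32_t ref = 0;
    for (int i = 0; i < 4; ++i) ref += int32_t(x[i]) * int32_t(y[i]);
    printf("biased+corrected = %d, reference = %d\n", dot_with_bias(x, y, 4), ref);
    return 0;
}
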
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index fb6a5db4..5e657f4a 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -25,6 +25,7 @@
#include <thread>
#include <atomic>
#include <unordered_map>
+#include <string>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -6766,9 +6767,7 @@ struct Modify {
modify_func_t mod_func;
int nrows;
};
-}
-
-bool iqk_modify_tensor(struct ggml_tensor * tensor) {
+const Modify * get_modify_info(ggml_type type) {
static const std::unordered_map<ggml_type, Modify> k_mod_map = {
#ifdef __ARM_NEON
{ GGML_TYPE_Q4_0_R8, {modify_q4_0_r8, 8} },
@@ -6779,10 +6778,31 @@ bool iqk_modify_tensor(struct ggml_tensor * tensor) {
{ GGML_TYPE_Q8_KV_R8, {modify_q8_KV_r8, 8} },
#endif
};
- auto it = k_mod_map.find(tensor->type);
- if (it == k_mod_map.end()) return false;
+ auto it = k_mod_map.find(type);
+ return it != k_mod_map.end() ? &it->second : nullptr;
+}
+bool is_forbidden_tensor(const std::string& name) {
+ static const std::string kTokenEmbd{"token_embd.weight"};
+ if (name == kTokenEmbd) return true;
+ //if (auto pos = name.find("attn_kv_b.weight"); pos != std::string::npos) return true;
+ return false;
+}
+}
- auto& m = it->second;
+bool iqk_should_modify_tensor([[maybe_unused]] const struct ggml_tensor * tensor) {
+ return false;
+ //if (is_forbidden_tensor(tensor->name)) return false;
+ //auto mptr = get_modify_info(tensor->type);
+ //return mptr ? true : false;
+}
+
+bool iqk_modify_tensor(struct ggml_tensor * tensor) {
+ return false;
+ auto mptr = get_modify_info(tensor->type);
+ if (!mptr) return false;
+ if (is_forbidden_tensor(std::string{tensor->name})) return false;
+
+ auto& m = *mptr;
int nrows = ggml_nrows(tensor);
int nchunks = nrows/m.nrows;
int max_thread = std::max(1, int(std::thread::hardware_concurrency()/2));
@@ -6805,12 +6825,8 @@ bool iqk_modify_tensor(struct ggml_tensor * tensor) {
return true;
}
-void iqk_repack_tensor(struct ggml_tensor * tensor) {
- constexpr int kChunk = 8;
- if (!tensor) return;
- if (!ggml_is_contiguous(tensor)) return;
- if (strncmp(tensor->name, "token_embd.weight", GGML_MAX_NAME) == 0) return;
- if (tensor->ne[1] % 4) return;
+namespace {
+const Repack * get_repack_info(ggml_type type) {
static const std::unordered_map<ggml_type, Repack> k_map = {
{ GGML_TYPE_IQ2_K, { GGML_TYPE_IQ2_K_R4, 4, (Repack::repack_func)repack_iq2_k} },
{ GGML_TYPE_IQ3_K, { GGML_TYPE_IQ3_K_R4, 4, (Repack::repack_func)repack_iq3_k} },
@@ -6841,12 +6857,30 @@ void iqk_repack_tensor(struct ggml_tensor * tensor) {
{ GGML_TYPE_F16, { GGML_TYPE_BF16_R16, 16, (Repack::repack_func)repack_bf16<ggml_half>} },
#endif
};
+ auto it = k_map.find(type);
+ return it != k_map.end() ? &it->second : nullptr;
+}
+}
+
+int iqk_repacked_type(const struct ggml_tensor * tensor) {
+ if (!ggml_is_contiguous(tensor)) return (int)tensor->type;
+ if (is_forbidden_tensor(tensor->name)) return (int)tensor->type;
+ auto rptr = get_repack_info(tensor->type);
+ return rptr && tensor->ne[1] % rptr->num_rows == 0 ? (int)rptr->new_type : (int)tensor->type;
+}
+
+void iqk_repack_tensor(struct ggml_tensor * tensor) {
+ constexpr int kChunk = 8;
+ if (!tensor) return;
+ if (!ggml_is_contiguous(tensor)) return;
+ if (is_forbidden_tensor(tensor->name)) return;
+ if (tensor->ne[1] % 4) return;
- auto it = k_map.find(tensor->type);
- if (it == k_map.end()) return;
- if (tensor->ne[1] % it->second.num_rows) return;
+ auto rptr = get_repack_info(tensor->type);
+ if (!rptr) return;
+ if (tensor->ne[1] % rptr->num_rows) return;
- auto& r = it->second;
+ auto& r = *rptr;
auto nrows = ggml_nrows(tensor);
@@ -6871,7 +6905,8 @@ void iqk_repack_tensor(struct ggml_tensor * tensor) {
int last_row = std::min(first_row + chunkSize*r.num_rows, nrows);
for (int row = first_row; row < last_row; row += r.num_rows) {
std::memcpy(qtmp.data(), data + row*row_size, r.num_rows*row_size);
- r.repack(r.num_rows, n_per_row, qtmp.data(), data + row*row_size, true);
+ //r.repack(r.num_rows, n_per_row, qtmp.data(), data + row*row_size, true);
+ r.repack(r.num_rows, n_per_row, qtmp.data(), data + row*row_size, false);
}
}
};
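
A hypothetical caller-side sketch (not part of this patch) of how the new iqk_repacked_type() query could be used to find out, before touching any data, whether run-time repacking will change a tensor's type; the include paths and the assumption that iqk_repack_tensor() updates tensor->type in place are mine:

#include "ggml.h"
#include "iqk/iqk_quantize.h"

static void maybe_repack(struct ggml_tensor * t) {
    // iqk_repacked_type() returns (int)t->type when the tensor is non-contiguous,
    // forbidden (e.g. token_embd.weight), or has no repack rule for its row count.
    enum ggml_type repacked = (enum ggml_type) iqk_repacked_type(t);
    if (repacked == t->type) return;
    iqk_repack_tensor(t); // assumed to repack in place to the *_R4/_R8/_R16 layout
}
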
diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h
index d447705b..dd148f2e 100644
--- a/ggml/src/iqk/iqk_quantize.h
+++ b/ggml/src/iqk/iqk_quantize.h
@@ -245,6 +245,9 @@ void repack_bf16_bf16_r16(const void * GGML_RESTRICT src, void * GGML_RESTRICT d
void iqk_repack_tensor(struct ggml_tensor * tensor);
bool iqk_modify_tensor(struct ggml_tensor * tensor);
+int iqk_repacked_type(const struct ggml_tensor * tensor); // int instead of ggml_type so we don't need to include ggml.h
+bool iqk_should_modify_tensor(const struct ggml_tensor * tensor);
+
// So we can re-pack Microsoft's BitNet I2_S quants
void dequantize_row_ms_i2s(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);