Diffstat (limited to 'ggml/src')
-rw-r--r--  ggml/src/ggml-cuda.cu        | 263
-rw-r--r--  ggml/src/ggml-cuda/unary.cu  |  36
-rw-r--r--  ggml/src/ggml-cuda/unary.cuh |   2
-rw-r--r--  ggml/src/ggml.c              | 157
-rw-r--r--  ggml/src/iqk/iqk_mul_mat.cpp | 260
-rw-r--r--  ggml/src/iqk/iqk_mul_mat.h   |   5
6 files changed, 665 insertions(+), 58 deletions(-)
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index e38e9568..26d06d56 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2195,7 +2195,252 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
}
}
-static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
+static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * next) {
+ const ggml_tensor * src0_1 = dst->src[0];
+ const ggml_tensor * src0_2 = dst->src[1];
+ const ggml_tensor * src0 = src0_1;
+ const ggml_tensor * src1 = dst->src[2];
+ const ggml_tensor * ids = dst->src[3];
+
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+ GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0_1->buffer) && "mul_mat_id does not support split buffers");
+ GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0_2->buffer) && "mul_mat_id does not support split buffers");
+
+ cudaStream_t stream = ctx.stream();
+
+ const int64_t n_as = ne02;
+ const int64_t n_ids = ids->ne[0];
+
+ std::vector<char> ids_host(ggml_nbytes(ids));
+ const char * ids_dev = (const char *) ids->data;
+ CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
+ CUDA_CHECK(cudaStreamSynchronize(stream));
+
+ ggml_tensor src0_1_row = *src0_1;
+ ggml_tensor src0_2_row = *src0_2;
+ ggml_tensor src1_row = *src1;
+ ggml_tensor dst_row = *dst;
+ ggml_tensor final_dst;
+ ggml_tensor final_src;
+
+ char * src0_1_original = (char *) src0_1->data;
+ char * src0_2_original = (char *) src0_2->data;
+ char * src1_original = (char *) src1->data;
+ char * dst_original = (char *) dst->data;
+
+ src0_1_row.ne[2] = 1;
+ src0_1_row.ne[3] = 1;
+ src0_1_row.nb[3] = nb02;
+ src0_2_row.ne[2] = 1;
+ src0_2_row.ne[3] = 1;
+ src0_2_row.nb[3] = nb02;
+
+ src1_row.ne[1] = 1;
+ src1_row.ne[2] = 1;
+ src1_row.ne[3] = 1;
+ src1_row.nb[2] = nb11;
+ src1_row.nb[3] = nb11;
+
+ dst_row.ne[1] = 1;
+ dst_row.ne[2] = 1;
+ dst_row.ne[3] = 1;
+ dst_row.nb[2] = nb1;
+ dst_row.nb[3] = nb1;
+
+ bool fuse_down = false;
+ if (next && next->op == GGML_OP_MUL_MAT_ID) {
+ //printf("Fusing MoE down gemm\n");
+ fuse_down = true;
+ final_dst = *next;
+ final_dst.ne[1] = final_dst.ne[2] = final_dst.ne[3] = 1;
+ final_dst.nb[2] = final_dst.nb[3] = final_dst.nb[1];
+ final_src = *next->src[0];
+ //printf("next->src[0]: %s, %d x %d x %d x %d and %d x %d x %d x %d\n", ggml_type_name(next->src[0]->type),
+ // (int)next->src[0]->ne[0], (int)next->src[0]->ne[1], (int)next->src[0]->ne[2], (int)next->src[0]->ne[3],
+ // (int)next->src[0]->nb[0], (int)next->src[0]->nb[1], (int)next->src[0]->nb[2], (int)next->src[0]->nb[3]);
+ final_src.ne[2] = final_src.ne[3] = 1;
+ final_src.nb[3] = final_src.nb[2];
+ }
+
+ if (ne12 == 1) {
+ ggml_cuda_pool_alloc<char> dst_up_contiguous(ctx.pool(), sizeof(float)*dst_row.ne[0]);
+ ggml_cuda_pool_alloc<char> dst_gate_contiguous(ctx.pool(), sizeof(float)*dst_row.ne[0]);
+ if (fuse_down) {
+ final_dst.src[1] = &dst_row;
+ }
+ for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+ for (int64_t id = 0; id < n_ids; id++) {
+ const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+
+ GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+ const int64_t i11 = id % ne11;
+ const int64_t i12 = iid1;
+
+ const int64_t i1 = id;
+ const int64_t i2 = i12;
+
+ src0_1_row.data = src0_1_original + i02*nb02;
+ src0_2_row.data = src0_2_original + i02*nb02;
+ src1_row.data = src1_original + i11*nb11 + i12*nb12;
+ //dst_row.data = dst_original + i1*nb1 + i2*nb2;
+
+ dst_row.data = dst_up_contiguous.get();
+ ggml_cuda_mul_mat(ctx, &src0_1_row, &src1_row, &dst_row);
+ CUDA_CHECK(cudaGetLastError());
+
+ dst_row.data = dst_gate_contiguous.get();
+ ggml_cuda_mul_mat(ctx, &src0_2_row, &src1_row, &dst_row);
+ CUDA_CHECK(cudaGetLastError());
+
+ if (fuse_down) {
+ ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], dst_row.ne[0],
+ (const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(), (float *)dst_gate_contiguous.get());
+ CUDA_CHECK(cudaGetLastError());
+
+ final_src.data = (char *)next->src[0]->data + i02*next->src[0]->nb[2];
+ final_dst.data = (char *)next->data + i1*next->nb[1] + i2*next->nb[2];
+ ggml_cuda_mul_mat(ctx, &final_src, &dst_row, &final_dst);
+ CUDA_CHECK(cudaGetLastError());
+
+ } else {
+
+ ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], dst_row.ne[0],
+ (const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(), (float *)(dst_original + i1*nb1 + i2*nb2));
+ CUDA_CHECK(cudaGetLastError());
+
+ }
+ }
+ }
+ } else {
+ ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
+ ggml_cuda_pool_alloc<char> dst_up_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
+ ggml_cuda_pool_alloc<char> dst_gate_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
+ ggml_cuda_pool_alloc<char> final_dst_contiguous(ctx.pool());
+ if (fuse_down) {
+ final_dst.data = final_dst_contiguous.alloc(sizeof(float)*ggml_nelements(next));
+ final_dst.src[1] = &dst_row;
+ }
+
+ src1_row.data = src1_contiguous.get();
+
+ bool first = false; //true;
+
+ for (int64_t i02 = 0; i02 < n_as; i02++) {
+ int64_t num_src1_rows = 0;
+
+ for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+ for (int64_t id = 0; id < n_ids; id++) {
+ const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+
+ GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
+
+ if (row_id_i != i02) {
+ continue;
+ }
+
+ num_src1_rows++;
+ }
+ }
+
+ if (num_src1_rows == 0) {
+ continue;
+ }
+
+ ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
+ ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
+ CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
+
+ {
+ dim3 block_dims(std::min((unsigned int)ne10, 768u));
+ dim3 grid_dims(ids->ne[1], n_ids);
+ k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+ src1_original, src1_contiguous.get(),
+ dev_cur_src1_row.get(), dev_row_mapping.get(),
+ ids_dev, i02, ids->nb[1], ids->nb[0],
+ ne11, ne10,
+ nb11, nb12);
+ CUDA_CHECK(cudaGetLastError());
+ }
+
+ src0_1_row.data = src0_1_original + i02*nb02;
+ src0_2_row.data = src0_2_original + i02*nb02;
+
+ GGML_ASSERT(nb11 == sizeof(float)*ne10);
+ GGML_ASSERT(nb1 == sizeof(float)*ne0);
+
+ src1_row.ne[1] = num_src1_rows;
+ src1_row.nb[1] = nb11;
+ src1_row.nb[2] = num_src1_rows*nb11;
+ src1_row.nb[3] = num_src1_rows*nb11;
+
+ dst_row.ne[1] = num_src1_rows;
+ dst_row.nb[1] = nb1;
+ dst_row.nb[2] = num_src1_rows*nb1;
+ dst_row.nb[3] = num_src1_rows*nb1;
+
+ dst_row.data = dst_up_contiguous.get();
+ ggml_cuda_mul_mat(ctx, &src0_1_row, &src1_row, &dst_row);
+ CUDA_CHECK(cudaGetLastError());
+
+ dst_row.data = dst_gate_contiguous.get();
+ ggml_cuda_mul_mat(ctx, &src0_2_row, &src1_row, &dst_row);
+ CUDA_CHECK(cudaGetLastError());
+
+ ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(&dst_row),
+ (const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(), (float *)dst_gate_contiguous.get());
+ CUDA_CHECK(cudaGetLastError());
+
+ if (fuse_down) {
+
+ final_dst.ne[1] = num_src1_rows;
+ final_dst.nb[1] = final_dst.ne[0]*sizeof(float);
+ final_dst.nb[2] = final_dst.nb[3] = num_src1_rows*final_dst.nb[1];
+ final_src.data = (char *)next->src[0]->data + i02*next->src[0]->nb[2];
+ if (first) {
+ printf("Fusing down for %d rows: (%d x %d x %d x %d) = (%d x %d x %d x %d) * (%d x %d x %d x %d)\n", (int)num_src1_rows,
+ (int)next->ne[0], (int)next->ne[1], (int)next->ne[2], (int)next->ne[3],
+ (int)next->src[0]->ne[0], (int)next->src[0]->ne[1], (int)next->src[0]->ne[2], (int)next->src[0]->ne[3],
+ (int)next->src[1]->ne[0], (int)next->src[1]->ne[1], (int)next->src[1]->ne[2], (int)next->src[1]->ne[3]);
+ printf(" using (%d x %d x %d x %d) = (%d x %d x %d x %d) * (%d x %d x %d x %d)\n",
+ (int)final_dst.ne[0], (int)final_dst.ne[1], (int)final_dst.ne[2], (int)final_dst.ne[3],
+ (int)final_src.ne[0], (int)final_src.ne[1], (int)final_src.ne[2], (int)final_src.ne[3],
+ (int)dst_row.ne[0], (int)dst_row.ne[1], (int)dst_row.ne[2], (int)dst_row.ne[3]);
+ first = false;
+ }
+ ggml_cuda_mul_mat(ctx, &final_src, &dst_row, &final_dst);
+ //ggml_cuda_mul_mat(ctx, next->src[0], &dst_row, &final_dst);
+ CUDA_CHECK(cudaGetLastError());
+
+ dim3 block_dims(std::min((unsigned int)next->ne[0], 768u));
+ dim3 grid_dims(num_src1_rows);
+ k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+ (char *)next->data, final_dst_contiguous.get(),
+ dev_row_mapping.get(),
+ next->ne[0],
+ next->nb[1], next->nb[2]);
+ CUDA_CHECK(cudaGetLastError());
+
+ }
+ else {
+
+ dim3 block_dims(std::min((unsigned int)ne0, 768u));
+ dim3 grid_dims(num_src1_rows);
+ k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+ dst_original, dst_gate_contiguous.get(),
+ dev_row_mapping.get(),
+ ne0,
+ nb1, nb2);
+ CUDA_CHECK(cudaGetLastError());
+ }
+ }
+ }
+
+ return fuse_down;
+}
+
+static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst, struct ggml_tensor * next, bool& skip_next) {
// why is this here instead of mul_mat?
if (dst->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(dst->src[0]->buffer)) {
ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
@@ -2309,6 +2554,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_MUL_MAT_ID:
ggml_cuda_mul_mat_id(ctx, dst);
break;
+ case GGML_OP_MOE_FUSED_UP_GATE:
+ skip_next = ggml_cuda_up_gate_unary(ctx, dst, next);
+ break;
case GGML_OP_SCALE:
ggml_cuda_op_scale(ctx, dst);
break;
@@ -2595,7 +2843,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
#endif
}
- if (node->op == GGML_OP_MUL_MAT_ID) {
+ if (node->op == GGML_OP_MUL_MAT_ID || node->op == GGML_OP_MOE_FUSED_UP_GATE) {
use_cuda_graph = false; // This node type is not supported by CUDA graph capture
#ifndef NDEBUG
GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
@@ -2666,6 +2914,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
if (!use_cuda_graph || cuda_graph_update_required) {
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
+ ggml_tensor * next = i < cgraph->n_nodes-1 ? cgraph->nodes[i+1] : nullptr;
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
continue;
@@ -2680,11 +2929,13 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
}
#endif
- bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
+ bool skip_next = false;
+ bool ok = ggml_cuda_compute_forward(*cuda_ctx, node, next, skip_next);
if (!ok) {
GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
}
GGML_ASSERT(ok);
+ if (skip_next) ++i;
}
}
@@ -2809,9 +3060,13 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_FUSED_MUL_UNARY: return ggml_is_contiguous(op->src[0]);
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
+ case GGML_OP_MOE_FUSED_UP_GATE:
{
struct ggml_tensor * a = op->src[0];
- struct ggml_tensor * b = op->src[1];
+ struct ggml_tensor * b = op->op == GGML_OP_MOE_FUSED_UP_GATE ? op->src[2] : op->src[1];
+ if (op->op == GGML_OP_MOE_FUSED_UP_GATE && a->type != op->src[1]->type) {
+ return false;
+ }
if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
return false;
}
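
Note on the new GGML_OP_MOE_FUSED_UP_GATE path above: per token and per selected expert it runs the up and gate GEMMs, applies the unary op from dst->op_params[0] to the gate result, multiplies by the up result, and, when the next node is the down MUL_MAT_ID, feeds that product straight into the down GEMM. A minimal scalar sketch of the per-expert up/gate step, assuming SiLU and row-major [n_ff, n_embd] expert weights (function and variable names here are illustrative, not from the source):

#include <cmath>
#include <cstdint>

static float silu_ref(float x) { return x / (1.0f + std::exp(-x)); }

// y[i] = silu(W_gate[e]·x)[i] * (W_up[e]·x)[i] for one token x and one expert e
static void moe_up_gate_ref(const float * w_up, const float * w_gate, const float * x,
                            float * y, int64_t n_embd, int64_t n_ff) {
    for (int64_t i = 0; i < n_ff; ++i) {
        float up = 0.0f, gate = 0.0f;
        for (int64_t j = 0; j < n_embd; ++j) {
            up   += w_up  [i*n_embd + j] * x[j];
            gate += w_gate[i*n_embd + j] * x[j];
        }
        y[i] = silu_ref(gate) * up;
    }
}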
diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu
index 8ffddd6d..c422abbc 100644
--- a/ggml/src/ggml-cuda/unary.cu
+++ b/ggml/src/ggml-cuda/unary.cu
@@ -297,6 +297,19 @@ void ggml_cuda_op_swiglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
swiglu_f32_cuda(src0_d, dst_d, ggml_nelements(dst), dst->ne[0], src0->nb[1]/sizeof(float), stream);
}
+void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
+ int64_t nelements, const float * src0_d, const float * src1_d, float * dst_d) {
+
+ cudaStream_t stream = ctx.stream();
+
+ switch (op) {
+ case GGML_UNARY_OP_SILU: fused_mul_silu_f32_cuda(src0_d, src1_d, dst_d, nelements, stream); break;
+ case GGML_UNARY_OP_RELU: fused_mul_relu_f32_cuda(src0_d, src1_d, dst_d, nelements, stream); break;
+ case GGML_UNARY_OP_GELU: fused_mul_gelu_f32_cuda(src0_d, src1_d, dst_d, nelements, stream); break;
+ default: GGML_ASSERT(false);
+ }
+}
+
void ggml_cuda_op_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
@@ -304,19 +317,22 @@ void ggml_cuda_op_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_tensor *
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_are_same_shape(src0, src1));
- cudaStream_t stream = ctx.stream();
ggml_unary_op op = (ggml_unary_op)dst->op_params[0];
- const float * src0_d = (const float *)src0->data;
- const float * src1_d = (const float *)src1->data;
- float * dst_d = (float *)dst->data;
+ ggml_fused_mul_unary(ctx, op, ggml_nelements(dst), (const float *)src0->data, (const float *)src1->data, (float *)dst->data);
- switch (op) {
- case GGML_UNARY_OP_SILU: fused_mul_silu_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), stream); break;
- case GGML_UNARY_OP_RELU: fused_mul_relu_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), stream); break;
- case GGML_UNARY_OP_GELU: fused_mul_gelu_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), stream); break;
- default: GGML_ASSERT(false);
- }
+ //cudaStream_t stream = ctx.stream();
+
+ //const float * src0_d = (const float *)src0->data;
+ //const float * src1_d = (const float *)src1->data;
+ //float * dst_d = (float *)dst->data;
+
+ //switch (op) {
+ // case GGML_UNARY_OP_SILU: fused_mul_silu_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), stream); break;
+ // case GGML_UNARY_OP_RELU: fused_mul_relu_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), stream); break;
+ // case GGML_UNARY_OP_GELU: fused_mul_gelu_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), stream); break;
+ // default: GGML_ASSERT(false);
+ //}
}
void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
diff --git a/ggml/src/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh
index 0235a319..e55c4262 100644
--- a/ggml/src/ggml-cuda/unary.cuh
+++ b/ggml/src/ggml-cuda/unary.cuh
@@ -36,5 +36,7 @@ void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_swiglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
+ int64_t nelements, const float * x, const float * y, float * z);
void ggml_cuda_op_multi_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
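
The ggml_fused_mul_unary declared here is the CUDA-side helper (distinct from the graph-building function of the same name in ggml.c); judging from the call sites in ggml-cuda.cu it computes dst[i] = unary(x[i]) * y[i] on the context's stream. A hedged usage sketch, assuming valid float device buffers and a live backend context (the wrapper name is made up):

#include "unary.cuh"  // assumes the ggml-cuda build context

// d_gate, d_up and d_out are float device buffers of n elements.
static void apply_gated_silu(ggml_backend_cuda_context & ctx,
                             const float * d_gate, const float * d_up,
                             float * d_out, int64_t n) {
    // d_out[i] = silu(d_gate[i]) * d_up[i], launched on ctx.stream()
    ggml_fused_mul_unary(ctx, GGML_UNARY_OP_SILU, n, d_gate, d_up, d_out);
}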
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index eb39d574..8efe2653 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3845,6 +3845,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"MUL_MAT",
"MUL_MAT_ID",
"OUT_PROD",
+ "MOE_FUSED_UP_GATE",
"SCALE",
"SET",
@@ -3904,7 +3905,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CROSS_ENTROPY_LOSS_BACK",
};
-static_assert(GGML_OP_COUNT == 79, "GGML_OP_COUNT != 79");
+static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -3938,6 +3939,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"X*Y",
"X[i]*Y",
"X*Y",
+ "X*Y1&X*Y2",
"x*v",
"y-\\>view(x)",
@@ -3997,7 +3999,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"cross_entropy_loss_back(x,y)",
};
-static_assert(GGML_OP_COUNT == 79, "GGML_OP_COUNT != 79");
+static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -6768,6 +6770,51 @@ struct ggml_tensor * ggml_mul_mat_id(
return result;
}
+struct ggml_tensor * ggml_moe_up_gate(
+ struct ggml_context * ctx,
+ struct ggml_tensor * as_up,
+ struct ggml_tensor * as_gate,
+ struct ggml_tensor * b,
+ struct ggml_tensor * ids,
+ enum ggml_unary_op op) {
+ if (as_up->type != as_gate->type || !ggml_are_same_shape(as_up, as_gate)) {
+ struct ggml_tensor * result_up = ggml_mul_mat_id(ctx, as_up, b, ids);
+ struct ggml_tensor * result_gate = ggml_mul_mat_id(ctx, as_gate, b, ids);
+ return ggml_fused_mul_unary(ctx, result_gate, result_up, op);
+ }
+ GGML_ASSERT(!ggml_is_transposed(as_up));
+ GGML_ASSERT(!ggml_is_transposed(as_gate));
+ GGML_ASSERT(ids->type == GGML_TYPE_I32);
+
+ GGML_ASSERT(as_up->ne[3] == 1); // as is 3d (one matrix per expert)
+ GGML_ASSERT(b->ne[3] == 1); // b is 3d
+ GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
+ GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row
+ GGML_ASSERT(as_up->ne[0] == b->ne[0]); // can_mul_mat
+ GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast
+
+ bool is_node = false;
+
+ if (as_up->grad || as_gate->grad || b->grad) {
+ is_node = true;
+ }
+
+ const int64_t ne[4] = { as_up->ne[1], ids->ne[0], b->ne[2], 1 };
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+ result->op = GGML_OP_MOE_FUSED_UP_GATE;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = as_up;
+ result->src[1] = as_gate;
+ result->src[2] = b;
+ result->src[3] = ids;
+
+ ggml_set_op_params_i32(result, 0, (int32_t) op);
+
+ return result;
+}
+
+
// ggml_out_prod
struct ggml_tensor * ggml_out_prod(
@@ -14584,20 +14631,17 @@ IQK_MulMat_Not_Available:;
#if GGML_USE_IQK_MULMAT
static void ggml_compute_forward_mul_mat_id_up_gate(
const struct ggml_compute_params * params,
- struct ggml_tensor * dst1,
- struct ggml_tensor * dst2) {
+ struct ggml_tensor * dst) {
- GGML_ASSERT(dst1->src[1] == dst2->src[1]);
- GGML_ASSERT(dst1->src[2] == dst2->src[2]);
- GGML_ASSERT(dst1->src[0]->type == dst2->src[0]->type);
- GGML_ASSERT(dst1->type == GGML_TYPE_F32 && dst2->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->src[0]->type == dst->src[1]->type);
+ GGML_ASSERT(ggml_are_same_shape(dst->src[0], dst->src[1]));
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
- const struct ggml_tensor * src1 = dst1->src[1];
- const struct ggml_tensor * ids = dst1->src[2];
- const struct ggml_tensor * src0_1 = dst1->src[0];
- const struct ggml_tensor * src0_2 = dst2->src[0];
- const struct ggml_tensor * src0 = src0_1;
- const struct ggml_tensor * dst = dst1; // so GGML_TENSOR_BINARY_OP_LOCALS works
+ const struct ggml_tensor * src1 = dst->src[2];
+ const struct ggml_tensor * ids = dst->src[3];
+ const struct ggml_tensor * src0_1 = dst->src[0];
+ const struct ggml_tensor * src0_2 = dst->src[1];
+ const struct ggml_tensor * src0 = src0_1; // so GGML_TENSOR_BINARY_OP_LOCALS works
GGML_TENSOR_BINARY_OP_LOCALS
@@ -14680,6 +14724,9 @@ static void ggml_compute_forward_mul_mat_id_up_gate(
ggml_barrier(params->shared);
+
+ // so GGML_TENSOR_BINARY_OP_LOCALS works
+
// compute each matrix multiplication in sequence
for (int cur_a = 0; cur_a < n_as; ++cur_a) {
const int64_t cne1 = matrix_row_counts[cur_a];
@@ -14696,28 +14743,34 @@ static void ggml_compute_forward_mul_mat_id_up_gate(
const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = cne1; // src1 rows
-
- if (nth%2 == 0) {
- const char * src0_d = ith%2 == 0 ? src0_1_cur : src0_2_cur;
- void * dst_d = ith%2 == 0 ? dst1->data : dst2->data;
- if (!iqk_mul_mat_moe(nr0, nr1, ne00, ne11,
- type, src0_d, nb01,
- vec_dot_type, (const char *)wdata, row_size,
- (float *)dst_d, nb1, nb2,
- matrix_rows + cur_a*ne12, ith/2, nth/2)) GGML_ABORT("fatal error");
-
- } else {
- if (!iqk_mul_mat_moe(nr0, nr1, ne00, ne11,
- src0_1->type, (const char *)src0_1_cur, nb01,
- vec_dot_type, (const char *)wdata, row_size,
- (float *)dst1->data, nb1, nb2,
- matrix_rows + cur_a*ne12, ith, nth)) GGML_ABORT("fatal error");
- if (!iqk_mul_mat_moe(nr0, nr1, ne00, ne11,
- src0_2->type, (const char *)src0_2_cur, nb01,
- vec_dot_type, (const char *)wdata, row_size,
- (float *)dst2->data, nb1, nb2,
- matrix_rows + cur_a*ne12, ith, nth)) GGML_ABORT("fatal error");
- }
+ //
+ if (!iqk_moe_fused_up_gate(nr0, nr1, ne00, ne11, dst->op_params[0],
+ type, src0_1_cur, src0_2_cur, nb01,
+ vec_dot_type, (const char *)wdata, row_size,
+ (float *)dst->data, nb1, nb2,
+ matrix_rows + cur_a*ne12, ith, nth)) GGML_ABORT("fatal error");
+
+// if (nth%2 == 0) {
+// const char * src0_d = ith%2 == 0 ? src0_1_cur : src0_2_cur;
+// void * dst_d = ith%2 == 0 ? dst1->data : dst2->data;
+// if (!iqk_mul_mat_moe(nr0, nr1, ne00, ne11,
+// type, src0_d, nb01,
+// vec_dot_type, (const char *)wdata, row_size,
+// (float *)dst_d, nb1, nb2,
+// matrix_rows + cur_a*ne12, ith/2, nth/2)) GGML_ABORT("fatal error");
+//
+// } else {
+// if (!iqk_mul_mat_moe(nr0, nr1, ne00, ne11,
+// src0_1->type, (const char *)src0_1_cur, nb01,
+// vec_dot_type, (const char *)wdata, row_size,
+// (float *)dst1->data, nb1, nb2,
+// matrix_rows + cur_a*ne12, ith, nth)) GGML_ABORT("fatal error");
+// if (!iqk_mul_mat_moe(nr0, nr1, ne00, ne11,
+// src0_2->type, (const char *)src0_2_cur, nb01,
+// vec_dot_type, (const char *)wdata, row_size,
+// (float *)dst2->data, nb1, nb2,
+// matrix_rows + cur_a*ne12, ith, nth)) GGML_ABORT("fatal error");
+// }
}
#undef MMID_MATRIX_ROW
@@ -19152,6 +19205,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(
static bool ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_tensor * next) {
GGML_ASSERT(params);
+ GGML_UNUSED(next);
if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
return false;
@@ -19269,16 +19323,12 @@ static bool ggml_compute_forward(struct ggml_compute_params * params, struct ggm
} break;
case GGML_OP_MUL_MAT_ID:
{
-#if GGML_USE_IQK_MULMAT
- if (next && next->op == GGML_OP_MUL_MAT_ID && tensor->src[1] == next->src[1] &&
- tensor->src[0]->type == next->src[0]->type) {
- ggml_compute_forward_mul_mat_id_up_gate(params, tensor, next);
- skip_next = true;
- break;
- }
-#endif
ggml_compute_forward_mul_mat_id(params, tensor);
} break;
+ case GGML_OP_MOE_FUSED_UP_GATE:
+ {
+ ggml_compute_forward_mul_mat_id_up_gate(params, tensor);
+ } break;
case GGML_OP_OUT_PROD:
{
ggml_compute_forward_out_prod(params, tensor);
@@ -20036,6 +20086,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
GGML_ABORT("fatal error"); // TODO: not implemented
}
+ case GGML_OP_MOE_FUSED_UP_GATE:
+ {
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_OUT_PROD:
{
GGML_ABORT("fatal error"); // TODO: not implemented
@@ -21046,6 +21100,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_CONCAT:
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
+ case GGML_OP_MOE_FUSED_UP_GATE:
case GGML_OP_OUT_PROD:
{
n_tasks = n_threads;
@@ -21249,6 +21304,20 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
cur += n_as * sizeof(int64_t); // matrix_row_counts
cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
} break;
+ case GGML_OP_MOE_FUSED_UP_GATE:
+ {
+ cur = 0;
+ const struct ggml_tensor * src0 = node->src[0];
+ const struct ggml_tensor * src2 = node->src[2];
+ const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
+ if (src2->type != vec_dot_type) {
+ cur += ggml_row_size(vec_dot_type, node->src[1]->ne[0]) * ggml_nrows(node->src[1]);
+ }
+ const int n_as = src0->ne[2];
+ cur += GGML_PAD(cur, sizeof(int64_t)); // align
+ cur += n_as * sizeof(int64_t); // matrix_row_counts
+ cur += n_as * src2->ne[2] * sizeof(int64_t); // matrix_rows
+ } break;
case GGML_OP_OUT_PROD:
{
if (ggml_is_quantized(node->src[0]->type)) {
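
Taken together, ggml_moe_up_gate gives graph builders a single node for the up/gate half of a MoE FFN, and falls back to two ggml_mul_mat_id calls plus GGML_OP_FUSED_MUL_UNARY when the expert tensors differ in type or shape. A sketch of how an FFN block could use it, assuming SiLU gating, llama.cpp-style tensor shapes, and a matching declaration in ggml.h (that header lives outside ggml/src and is not shown in this diff); all names below are illustrative:

#include "ggml.h"

// ffn_up_exps / ffn_gate_exps: [n_embd, n_ff, n_expert], ffn_down_exps: [n_ff, n_embd, n_expert]
// cur: [n_embd, 1, n_tokens] (already reshaped), selected_experts: [n_expert_used, n_tokens], I32
static struct ggml_tensor * build_moe_ffn(struct ggml_context * ctx,
                                          struct ggml_tensor * ffn_up_exps,
                                          struct ggml_tensor * ffn_gate_exps,
                                          struct ggml_tensor * ffn_down_exps,
                                          struct ggml_tensor * cur,
                                          struct ggml_tensor * selected_experts) {
    // fused up/gate: par = silu(gate_e·cur) * (up_e·cur) for each selected expert e
    struct ggml_tensor * par = ggml_moe_up_gate(ctx, ffn_up_exps, ffn_gate_exps,
                                                cur, selected_experts, GGML_UNARY_OP_SILU);
    // per-expert down projection (fused with the node above on the CUDA backend)
    return ggml_mul_mat_id(ctx, ffn_down_exps, par, selected_experts);
}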
diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp
index ed7309cd..0f7cd1e5 100644
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -217,6 +217,118 @@ struct MulMat {
funcs[n_left-1](n, vx, bx, info, nrc_x);
}
}
+ inline void gelu(int n, const float * src, float * dst);
+ inline void relu(int n, const float * src, float * dst);
+ inline void silu(int n, const float * src, float * dst);
+ inline void activate(ggml_unary_op op, int n, const float * src, float * dst) {
+ if (op == GGML_UNARY_OP_GELU) gelu(n, src, dst);
+ else if (op == GGML_UNARY_OP_RELU) relu(n, src, dst);
+ else if (op == GGML_UNARY_OP_SILU) silu(n, src, dst);
+ else GGML_ABORT("fatal error");
+ }
+ inline void mul_mat_up_gate_NxM(int n, const void * vx_up, const void * vx_gate, size_t bx, DataInfo& info, int nrc_x, int nrc_y, int unary_op) {
+#ifdef __aarch64__
+ constexpr int k_x_step = 64; //8192; // Tiling does not seem to help on my M2 Max (but difference to tiling is small)
+#else
+ constexpr int k_x_step = 64; // This works best on my Ryzen-7950X (but differences to other tile size are small)
+#endif
+ auto op = ggml_unary_op(unary_op);
+ float tmp[k_x_step*16];
+ if (func16 && nrc_y >= 16) {
+ int n_step = (nrc_y - info.cur_y)/16;
+ for (int ix = 0; ix < nrc_x; ix += k_x_step) {
+ auto this_info = info;
+ this_info.s += ix;
+ int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix;
+ for (int iy = 0; iy < n_step; ++iy) {
+ func16(n, (const void *)((const char *)vx_gate + ix*bx), bx, this_info, this_nrc_x);
+ for (int ky = 0; ky < 16; ++ky) {
+ activate(op, this_nrc_x, this_info.dst_row(ky), tmp + ky*k_x_step);
+ }
+ func16(n, (const void *)((const char *)vx_up + ix*bx), bx, this_info, this_nrc_x);
+ for (int ky = 0; ky < 16; ++ky) {
+ auto result = this_info.dst_row(ky);
+ for (int j = 0; j < this_nrc_x; ++j) result[j] *= tmp[ky*k_x_step + j];
+ }
+ this_info.cur_y += 16;
+ }
+ }
+ info.cur_y += 16 * n_step;
+ if (info.cur_y == nrc_y) return;
+ }
+ int ny = funcs.size();
+ while (!funcs[ny-1] && ny > 0) --ny;
+ int n_left = nrc_y - info.cur_y;
+ int n_step = n_left/ny;
+ if (n_step > 0) {
+ if (n_step*ny != n_left) {
+ ++n_step;
+ int ny1 = n_left/n_step;
+ int ny2 = ny1 + 1;
+ int my1 = n_step*ny2 - n_left;
+ int my2 = n_step - my1;
+ for (int ix = 0; ix < nrc_x; ix += k_x_step) {
+ auto this_info = info;
+ this_info.s += ix;
+ int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix;
+ for (int iy = 0; iy < my1; ++iy) {
+ funcs[ny1-1](n, (const void *)((const char *)vx_gate + ix*bx), bx, this_info, this_nrc_x);
+ for (int ky = 0; ky < ny1; ++ky) activate(op, this_nrc_x, this_info.dst_row(ky), tmp + ky*k_x_step);
+ funcs[ny1-1](n, (const void *)((const char *)vx_up + ix*bx), bx, this_info, this_nrc_x);
+ for (int ky = 0; ky < ny1; ++ky) {
+ auto result = this_info.dst_row(ky);
+ for (int j = 0; j < this_nrc_x; ++j) result[j] *= tmp[ky*k_x_step + j];
+ }
+ this_info.cur_y += ny1;
+ }
+ for (int iy = 0; iy < my2; ++iy) {
+ funcs[ny2-1](n, (const void *)((const char *)vx_gate + ix*bx), bx, this_info, this_nrc_x);
+ for (int ky = 0; ky < ny2; ++ky) activate(op, this_nrc_x, this_info.dst_row(ky), tmp + ky*k_x_step);
+ funcs[ny2-1](n, (const void *)((const char *)vx_up + ix*bx), bx, this_info, this_nrc_x);
+ for (int ky = 0; ky < ny2; ++ky) {
+ auto result = this_info.dst_row(ky);
+ for (int j = 0; j < this_nrc_x; ++j) result[j] *= tmp[ky*k_x_step + j];
+ }
+ this_info.cur_y += ny2;
+ }
+ }
+ info.cur_y += n_left;
+ }
+ else {
+ for (int ix = 0; ix < nrc_x; ix += k_x_step) {
+ auto this_info = info;
+ this_info.s += ix;
+ int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix;
+ for (int iy = 0; iy < n_step; ++iy) {
+ funcs[ny-1](n, (const void *)((const char *)vx_gate + ix*bx), bx, this_info, this_nrc_x);
+ for (int ky = 0; ky < ny; ++ky) activate(op, this_nrc_x, this_info.dst_row(ky), tmp + ky*k_x_step);
+ funcs[ny-1](n, (const void *)((const char *)vx_up + ix*bx), bx, this_info, this_nrc_x);
+ for (int ky = 0; ky < ny; ++ky) {
+ auto result = this_info.dst_row(ky);
+ for (int j = 0; j < this_nrc_x; ++j) result[j] *= tmp[ky*k_x_step + j];
+ }
+ this_info.cur_y += ny;
+ }
+ }
+ info.cur_y += ny * n_step;
+ }
+ }
+ n_left = nrc_y - info.cur_y;
+ if (n_left > 0) {
+ for (int ix = 0; ix < nrc_x; ix += k_x_step) {
+ auto this_info = info;
+ this_info.s += ix;
+ int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix;
+ funcs[n_left-1](n, (const void *)((const char *)vx_gate + ix*bx), bx, this_info, this_nrc_x);
+ for (int ky = 0; ky < n_left; ++ky) activate(op, this_nrc_x, this_info.dst_row(ky), tmp + ky*k_x_step);
+ funcs[n_left-1](n, (const void *)((const char *)vx_up + ix*bx), bx, this_info, this_nrc_x);
+ for (int ky = 0; ky < n_left; ++ky) {
+ auto result = this_info.dst_row(ky);
+ for (int j = 0; j < this_nrc_x; ++j) result[j] *= tmp[ky*k_x_step + j];
+ }
+ }
+ }
+ }
static bool prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny);
static inline int num_rows(ggml_type type) {
#ifdef HAVE_FANCY_SIMD
@@ -414,6 +526,34 @@ bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11,
return true;
}
+bool iqk_moe_fused_up_gate(long Nx, long Ny, long ne00, int ne11, int unary_op,
+ int typeA, const void * Aup, const void * Agate, long strideA,
+ int typeB, const void * B, long strideB,
+ float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth) {
+
+ const mmid_row_mapping * row_mapping = (const mmid_row_mapping *)vrow_mapping;
+ assert(row_mapping != nullptr);
+
+ MulMat mm;
+ if (!MulMat::prepare(typeA, typeB, ne00, mm, Ny)) {
+ return false;
+ }
+ size_t row_size_qx = strideA;
+ size_t row_size_qy = strideB;
+ auto num_rows = MulMat::num_rows(ggml_type(typeA));
+ GGML_ASSERT(Nx%num_rows == 0);
+ auto nrc_x = (Nx/num_rows + nth - 1)/nth;
+ auto first_x = ith*nrc_x;
+ if (first_x + nrc_x > Nx/num_rows) nrc_x = Nx/num_rows - first_x;
+ first_x *= num_rows;
+ nrc_x *= num_rows;
+ DataInfo info{C + first_x, (const char *)B, nb1/sizeof(float),
+ row_size_qy, 0, ne11, row_mapping, nb2/sizeof(float)};
+ mm.mul_mat_up_gate_NxM(ne00, (const char *)Aup + row_size_qx*first_x, (const char *)Agate + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny, unary_op);
+ return true;
+}
+
+
namespace {
inline void make_q4_scales(const uint8_t * scales8, uint32_t * aux32) {
@@ -14660,6 +14800,45 @@ inline float32x4_t v_tanh(float16x8_t x) {
auto val2 = v_tanh(vcvt_f32_f16(vget_high_f16(x)));
return vcombine_f16(vcvt_f16_f32(val1), vcvt_f16_f32(val2));
}
+inline float32x4_t v_silu(float32x4_t x) {
+ const float32x4_t one = vdupq_n_f32(1.0f);
+ const float32x4_t zero = vdupq_n_f32(0.0f);
+ const float32x4_t neg_x = vsubq_f32(zero, x);
+ const float32x4_t exp_neg_x = v_expf(neg_x);
+ const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
+ return vdivq_f32(x, one_plus_exp_neg_x);
+}
+inline float32x4_t v_gelu(float32x4_t x, float32x4_t c1, float32x4_t c2) {
+ const float32x4_t one = vdupq_n_f32(1.0f);
+ float32x4_t arg = vfmaq_f32(one, c1, vmulq_f32(x, x));
+ arg = vmulq_f32(arg, vmulq_f32(x, c2));
+ float32x4_t exp_arg = v_expf(arg);
+ float32x4_t gelu = vmulq_f32(x, vdivq_f32(exp_arg, vaddq_f32(exp_arg, one)));
+ uint32x4_t mask = vcgtq_f32(x, vdupq_n_f32(10.f));
+ return vbslq_f32(mask, x, gelu);
+}
+
+void MulMat::gelu(int n, const float * x, float * y) {
+ constexpr float GELU_COEF_A = 0.044715f;
+ constexpr float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+ int i = 0;
+ auto c1 = vdupq_n_f32(GELU_COEF_A);
+ auto c2 = vdupq_n_f32(2.f*SQRT_2_OVER_PI);
+ for (; i + 3 < n; i += 4) {
+ vst1q_f32(y + i, v_gelu(vld1q_f32(x + i), c1, c2));
+ }
+ for (; i < n; ++i) y[i] = 0.5f*x[i]*(1.0f + tanhf(SQRT_2_OVER_PI*x[i]*(1.0f + GELU_COEF_A*x[i]*x[i])));
+}
+
+void MulMat::silu(int n, const float * x, float * y) {
+ int i = 0;
+ for (; i + 3 < n; i += 4) vst1q_f32(y + i, v_silu(vld1q_f32(x + i)));
+ for (; i < n; ++i) y[i] = x[i]/(1.0f + expf(-x[i]));
+}
+
+void MulMat::relu(int n, const float * x, float * y) {
+ for (int j = 0; j < n; ++j) y[j] = x[j] > 0 ? x[j] : 0;
+}
#endif
#if defined(__AVX512F__) && defined(__AVX512DQ__)
@@ -14702,6 +14881,24 @@ inline __m512 v_tanh(__m512 x) {
const __m512 res = _mm512_div_ps(_mm512_sub_ps(exp_two_x, one), _mm512_add_ps(exp_two_x, one));
return _mm512_mask_blend_ps(mask, res, one);
}
+inline __m512 v_gelu(__m512 x, __m512 c1, __m512 c2) {
+ const __m512 one = _mm512_set1_ps(1.0f);
+ __m512 arg = _mm512_fmadd_ps(x, _mm512_mul_ps(c1, x), one);
+ //__m512 arg = _mm512_add_ps(one, _mm512_mul_ps(_mm512_mul_ps(x, x), c1));
+ arg = _mm512_mul_ps(arg, _mm512_mul_ps(c2, x));
+ const __mmask16 mask = _mm512_cmp_ps_mask(arg, _mm512_set1_ps(30.f), _CMP_GT_OQ);
+ const __m512 exp_arg = v_expf(arg);
+ const __m512 ratio = _mm512_div_ps(exp_arg, _mm512_add_ps(exp_arg, one));
+ return _mm512_mul_ps(x, _mm512_mask_blend_ps(mask, ratio, one));
+}
+inline static __m512 v_silu(__m512 x) {
+ const __m512 one = _mm512_set1_ps(1);
+ const __m512 zero = _mm512_setzero_ps();
+ const __m512 neg_x = _mm512_sub_ps(zero, x);
+ const __m512 exp_neg_x = v_expf(neg_x);
+ const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
+ return _mm512_div_ps(x, one_plus_exp_neg_x);
+}
#endif
#if defined(__AVX2__) && defined(__FMA__)
@@ -14755,6 +14952,61 @@ inline __m256 v_tanh(__m256 x) {
const __m256 mask = _mm256_cmp_ps(x, _mm256_set1_ps(10.f), _CMP_GT_OQ);
return _mm256_or_ps(_mm256_and_ps(mask, one), _mm256_andnot_ps(mask, res));
}
+inline static __m256 v_gelu(__m256 x, __m256 c1, __m256 c2) {
+ const __m256 one = _mm256_set1_ps(1.0f);
+ const __m256 mask = _mm256_cmp_ps(x, _mm256_set1_ps(10.f), _CMP_GT_OQ);
+ __m256 arg = _mm256_add_ps(one, _mm256_mul_ps(_mm256_mul_ps(x, x), c1));
+ arg = _mm256_mul_ps(arg, _mm256_mul_ps(x, c2));
+ __m256 exp_arg = v_expf(arg);
+ __m256 gelu = _mm256_mul_ps(x, _mm256_div_ps(exp_arg, _mm256_add_ps(exp_arg, one)));
+ return _mm256_or_ps(_mm256_and_ps(mask, x), _mm256_andnot_ps(mask, gelu));
+}
+inline static __m256 v_silu(__m256 x) {
+ const __m256 one = _mm256_set1_ps(1);
+ const __m256 zero = _mm256_setzero_ps();
+ const __m256 neg_x = _mm256_sub_ps(zero, x);
+ const __m256 exp_neg_x = v_expf(neg_x);
+ const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
+ return _mm256_div_ps(x, one_plus_exp_neg_x);
+}
+
+void MulMat::gelu(int n, const float * x, float * y) {
+ constexpr float GELU_COEF_A = 0.044715f;
+ constexpr float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+ //GGML_ASSERT(n%8 == 0);
+ int i = 0;
+#if defined __AVX512F__ && defined __AVX512DQ__
+ {
+ __m512 c1 = _mm512_set1_ps(GELU_COEF_A);
+ __m512 c2 = _mm512_set1_ps(2.f*SQRT_2_OVER_PI);
+ for (; i + 15 < n; i += 16) _mm512_storeu_ps(y + i, v_gelu(_mm512_loadu_ps(x + i), c1, c2));
+ }
+#endif
+#if defined __AVX2__ && defined __FMA__
+ if (i + 7 < n) {
+ __m256 c1 = _mm256_set1_ps(GELU_COEF_A);
+ __m256 c2 = _mm256_set1_ps(2.f*SQRT_2_OVER_PI);
+ for (; i + 7 < n; i += 8) _mm256_storeu_ps(y + i, v_gelu(_mm256_loadu_ps(x + i), c1, c2));
+
+ }
+#endif
+ for (; i < n; ++i) y[i] = 0.5f*x[i]*(1.0f + tanhf(SQRT_2_OVER_PI*x[i]*(1.0f + GELU_COEF_A*x[i]*x[i])));
+}
+
+void MulMat::silu(int n, const float * x, float * y) {
+ int i = 0;
+#if defined __AVX512F__ && defined __AVX512DQ__
+ for (; i + 15 < n; i += 16) _mm512_storeu_ps(y + i, v_silu(_mm512_loadu_ps(x + i)));
+#endif
+#if defined __AVX2__ && defined __FMA__
+ for (; i + 7 < n; i += 8) _mm256_storeu_ps(y + i, v_silu(_mm256_loadu_ps(x + i)));
+#endif
+ for (; i < n; ++i) y[i] = x[i]/(1.0f + expf(-x[i]));
+}
+
+void MulMat::relu(int n, const float * x, float * y) {
+ for (int j = 0; j < n; ++j) y[j] = x[j] > 0 ? x[j] : 0;
+}
#endif
} // namespace
@@ -17107,6 +17359,14 @@ bool iqk_mul_mat_moe(long, long, long, int, int, const void *, long, int, const
return false;
}
+bool iqk_moe_fused_up_gate(long /*Nx*/, long /*Ny*/, long /*ne00*/, int /*ne11*/, int /*unary_op*/,
+ int /*typeA*/, const void * /*Aup*/, const void * /*Agate*/, long /*strideA*/,
+ int /*typeB*/, const void * /*B*/, long /*strideB*/,
+ float * /*C*/, long /*nb1*/, long /*nb2*/, const void * /*vrow_mapping*/, int /*ith*/, int /*nth*/) {
+ return false;
+}
+
+
bool iqk_flash_attn_noalibi([[maybe_unused]] int int_type_k, // type of k
[[maybe_unused]] int int_type_v, // type of v
[[maybe_unused]] int D, // head size
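
A detail in the new SIMD activation helpers that is easy to miss: v_gelu scales by c2 = 2*SQRT_2_OVER_PI and evaluates exp(arg)/(exp(arg) + 1), which is the tanh-based GELU approximation rewritten via the identity 0.5*(1 + tanh(z)) = exp(2z)/(exp(2z) + 1). A small standalone check of that identity against the scalar tail used in MulMat::gelu (illustrative only):

#include <cmath>
#include <cstdio>

int main() {
    const float A = 0.044715f;                            // GELU_COEF_A
    const float S = 0.79788456080286535587989211986876f;  // SQRT_2_OVER_PI
    for (float x = -4.0f; x <= 4.0f; x += 1.0f) {
        const float z         = S*x*(1.0f + A*x*x);
        const float tanh_form = 0.5f*x*(1.0f + std::tanh(z));                     // scalar tail
        const float exp_form  = x*std::exp(2.0f*z)/(std::exp(2.0f*z) + 1.0f);     // what v_gelu vectorizes
        std::printf("x = % .1f   tanh form = % .6f   exp form = % .6f\n", x, tanh_form, exp_form);
    }
    return 0;
}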
diff --git a/ggml/src/iqk/iqk_mul_mat.h b/ggml/src/iqk/iqk_mul_mat.h
index d5f340b2..767f89cf 100644
--- a/ggml/src/iqk/iqk_mul_mat.h
+++ b/ggml/src/iqk/iqk_mul_mat.h
@@ -28,6 +28,11 @@ bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11,
int typeB, const void * B, long strideB,
float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth);
+bool iqk_moe_fused_up_gate(long Nx, long Ny, long ne00, int ne11, int unary_op,
+ int typeA, const void * Aup, const void * Agate, long strideA,
+ int typeB, const void * B, long strideB,
+ float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth);
+
bool iqk_flash_attn_noalibi(int type_k, // type of k
int type_v, // type of v
int Dk, // K head size
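
For completeness, iqk_moe_fused_up_gate splits the Nx expert rows over the nth threads in blocks of MulMat::num_rows, as the implementation in iqk_mul_mat.cpp shows. A standalone sketch of that partitioning with made-up sizes (illustrative only; the real num_rows depends on typeA and the build's SIMD level):

#include <cstdio>

int main() {
    const long Nx       = 14336;  // illustrative number of expert rows
    const int  num_rows = 8;      // rows handled together by the kernel
    const int  nth      = 6;      // worker threads
    for (int ith = 0; ith < nth; ++ith) {
        long nrc_x   = (Nx/num_rows + nth - 1)/nth;   // row blocks per thread, rounded up
        long first_x = ith*nrc_x;
        if (first_x + nrc_x > Nx/num_rows) nrc_x = Nx/num_rows - first_x;  // trim the last chunk
        std::printf("thread %d: rows [%ld, %ld)\n", ith, first_x*num_rows, (first_x + nrc_x)*num_rows);
    }
    return 0;
}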