Diffstat (limited to 'ggml.c')
-rw-r--r--  ggml.c | 168
1 file changed, 106 insertions(+), 62 deletions(-)
diff --git a/ggml.c b/ggml.c
index eb7989dc..66658ff4 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4075,17 +4075,18 @@ struct ggml_tensor * ggml_mul_mat(
struct ggml_tensor * ggml_mul_mat_id(
struct ggml_context * ctx,
- struct ggml_tensor * as[],
+ struct ggml_tensor * const as[],
+ int n_as,
struct ggml_tensor * ids,
int id,
struct ggml_tensor * b) {
- int64_t n_as = ids->ne[0];
-
GGML_ASSERT(ids->type == GGML_TYPE_I32);
- GGML_ASSERT(ggml_is_vector(ids));
+ GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
+ GGML_ASSERT(ids->ne[1] == b->ne[1]);
+ GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
- GGML_ASSERT(id >= 0 && id < n_as);
+ GGML_ASSERT(id >= 0 && id < ids->ne[0]);
bool is_node = false;
@@ -4097,13 +4098,14 @@ struct ggml_tensor * ggml_mul_mat_id(
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
ggml_set_op_params_i32(result, 0, id);
+ ggml_set_op_params_i32(result, 1, n_as);
result->op = GGML_OP_MUL_MAT_ID;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = ids;
result->src[1] = b;
- for (int64_t i = 0; i < n_as; i++) {
+ for (int i = 0; i < n_as; i++) {
struct ggml_tensor * a = as[i];
GGML_ASSERT(ggml_are_same_shape(as[0], a));
GGML_ASSERT(ggml_can_mul_mat(a, b));
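For reference, a minimal caller sketch against the new signature (the wrapper name, expert count, and tensor names below are hypothetical, not part of this patch):

    // select the i-th routed expert for every token column of cur
    static struct ggml_tensor * moe_matmul(
            struct ggml_context * ctx,
            struct ggml_tensor  * const w_exp[], // n_expert matrices, all the same shape
            int                   n_expert,
            struct ggml_tensor  * selected,      // I32 [n_expert_used, n_tokens] router output
            int                   i,             // which of the selected experts to apply
            struct ggml_tensor  * cur) {         // F32 [n_embd, n_tokens] activations
        // per the new asserts: selected->ne[1] == cur->ne[1] (one id row per token)
        return ggml_mul_mat_id(ctx, w_exp, n_expert, selected, i, cur);
    }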
@@ -4731,7 +4733,9 @@ struct ggml_tensor * ggml_get_rows(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
- GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
+ GGML_ASSERT(a->ne[2] == b->ne[1]);
+ GGML_ASSERT(b->ne[3] == 1);
+ GGML_ASSERT(b->type == GGML_TYPE_I32);
bool is_node = false;
@@ -4741,7 +4745,7 @@ struct ggml_tensor * ggml_get_rows(
// TODO: implement non F32 return
//struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
- struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0]);
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
result->op = GGML_OP_GET_ROWS;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
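A hedged shape sketch of the batched behaviour (the wrapper and all dims are illustrative): each of the ne[2] matrices stacked in a is indexed by the matching batch column of ids, and the result is now 4-D.

    static struct ggml_tensor * batched_rows(struct ggml_context * ctx,
                                             int64_t n_embd, int64_t n_vocab,
                                             int64_t n_rows, int64_t n_batch) {
        struct ggml_tensor * a   = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_vocab, n_batch);
        struct ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_rows,  n_batch);
        return ggml_get_rows(ctx, a, ids); // ne = { n_embd, n_rows, n_batch, 1 }
    }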
@@ -9504,8 +9508,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
+ // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
+ // all the experts for each batch element and the processing would become incredibly slow
// TODO: find the optimal values for these
- if (ggml_is_contiguous(src0) &&
+ if (dst->op != GGML_OP_MUL_MAT_ID &&
+ ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
//src0->type == GGML_TYPE_F32 &&
src1->type == GGML_TYPE_F32 &&
@@ -9519,11 +9526,16 @@ static bool ggml_compute_forward_mul_mat_use_blas(
}
#endif
+// off1 = offset in i11 and i1
+// cne1 = ne11 and ne1
+// in a normal matrix multiplication, off1 = 0 and cne1 = ne1
+// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
static void ggml_compute_forward_mul_mat(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
+ struct ggml_tensor * dst,
+ int64_t off1, int64_t cne1) {
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
@@ -9591,10 +9603,9 @@ static void ggml_compute_forward_mul_mat(
const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2;
- const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
- const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-
- float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+ const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+ const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
+ float * d = (float *) ((char *) dst->data + off1*nb1 + i12*nb2 + i13*nb3);
if (type != GGML_TYPE_F32) {
float * const wdata = params->wdata;
@@ -9611,10 +9622,10 @@ static void ggml_compute_forward_mul_mat(
}
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
- ne11, ne01, ne10,
- 1.0f, y, ne10,
- x, ne00,
- 0.0f, d, ne01);
+ cne1, ne01, ne10,
+ 1.0f, y, ne10,
+ x, ne00,
+ 0.0f, d, ne01);
}
}
@@ -9630,6 +9641,7 @@ static void ggml_compute_forward_mul_mat(
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
assert(params->wsize >= ne11*ne12*ne13*row_size);
+ assert(src1->type == GGML_TYPE_F32);
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -9652,7 +9664,7 @@ static void ggml_compute_forward_mul_mat(
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
const int64_t nr0 = ne01; // src0 rows
- const int64_t nr1 = ne11*ne12*ne13; // src1 rows
+ const int64_t nr1 = cne1*ne12*ne13; // src1 rows
//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
@@ -9694,9 +9706,9 @@ static void ggml_compute_forward_mul_mat(
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
- const int64_t i13 = (ir1/(ne12*ne11));
- const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
- const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
+ const int64_t i13 = (ir1/(ne12*cne1));
+ const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
+ const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;
// broadcast src0 into src1
const int64_t i03 = i13/r3;
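The index decomposition above is the heart of the off1/cne1 change; the same arithmetic as a standalone helper (a sketch, names illustrative):

    #include <stdint.h>

    // Maps a flattened src1 row index ir1 in [0, cne1*ne12*ne13) back to
    // (i13, i12, i11), with i11 shifted into the window [off1, off1 + cne1).
    // A normal matmul passes off1 = 0, cne1 = ne11; MUL_MAT_ID passes
    // off1 = i01, cne1 = 1 so that only the one routed column is touched.
    static void unravel_row(int64_t ir1, int64_t cne1, int64_t ne12, int64_t off1,
                            int64_t * i13, int64_t * i12, int64_t * i11) {
        *i13 =  ir1/(ne12*cne1);
        *i12 = (ir1 - *i13*ne12*cne1)/cne1;
        *i11 = (ir1 - *i13*ne12*cne1 - *i12*cne1) + off1;
    }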
@@ -9736,20 +9748,28 @@ static void ggml_compute_forward_mul_mat(
static void ggml_compute_forward_mul_mat_id(
const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
- const struct ggml_tensor * ids = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- const int id = ggml_get_op_params_i32(dst, 0);
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ // during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
+ ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
+ return;
+ }
- const int a_id = ((int32_t *)ids->data)[id];
+ const struct ggml_tensor * ids = src0;
+ const int id = ggml_get_op_params_i32(dst, 0);
+ const int n_as = ggml_get_op_params_i32(dst, 1);
- GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+ const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
- const struct ggml_tensor * src0 = dst->src[a_id + 2];
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
- ggml_compute_forward_mul_mat(params, src0, src1, dst);
+ const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+ ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
+ }
}
// ggml_compute_forward_out_prod
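As a sanity check on the dispatch loop above, a plain-C reference of the intended semantics (the contiguous row-major F32 layout and every name here are assumptions, not ggml internals):

    #include <assert.h>
    #include <stdint.h>

    // dst[:,t] = as[ ids[t][id] ] * b[:,t] for every token column t
    static void mul_mat_id_ref(
            const float * const as[], // expert matrices, each m x k, row-major
            int n_as,                 // number of experts (bounds check only)
            const int32_t * ids,      // n_tok rows of n_ids selected expert ids
            int n_ids, int id,        // use the id-th selection per token
            const float * b,          // n_tok columns of length k
            float * dst,              // n_tok columns of length m
            int m, int k, int n_tok) {
        for (int t = 0; t < n_tok; ++t) {
            const int32_t e = ids[t*n_ids + id]; // expert routed to token t
            assert(e >= 0 && e < n_as);          // mirrors the GGML_ASSERT above
            const float * a = as[e];
            for (int i = 0; i < m; ++i) {
                float sum = 0.0f;
                for (int j = 0; j < k; ++j) {
                    sum += a[i*k + j]*b[t*k + j];
                }
                dst[t*m + i] = sum;
            }
        }
    }

The real code instead reuses the sub-range matmul with off1 = i01, cne1 = 1, so each routed column goes through the optimized path.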
@@ -10325,21 +10345,30 @@ static void ggml_compute_forward_get_rows_q(
return;
}
- const int nc = src0->ne[0];
- const int nr = ggml_nelements(src1);
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+ const int64_t nc = ne00;
+ const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
+
const enum ggml_type type = src0->type;
ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
- assert( dst->ne[0] == nc);
- assert( dst->ne[1] == nr);
- assert(src0->nb[0] == ggml_type_size(type));
+ assert(ne0 == nc);
+ assert(ne02 == ne11);
+ assert(nb00 == ggml_type_size(type));
+ assert(ggml_nrows(dst) == nr);
- for (int i = 0; i < nr; ++i) {
- const int r = ((int32_t *) src1->data)[i];
+ // TODO: multi-thread
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
- dequantize_row_q(
- (const void *) ((char *) src0->data + r*src0->nb[1]),
- (float *) ((char *) dst->data + i*dst->nb[1]), nc);
+ dequantize_row_q(
+ (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
+ }
+ }
}
}
@@ -10354,19 +10383,26 @@ static void ggml_compute_forward_get_rows_f16(
return;
}
- const int nc = src0->ne[0];
- const int nr = ggml_nelements(src1);
+ GGML_TENSOR_BINARY_OP_LOCALS
- assert( dst->ne[0] == nc);
- assert( dst->ne[1] == nr);
- assert(src0->nb[0] == sizeof(ggml_fp16_t));
+ const int64_t nc = ne00;
+ const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
- for (int i = 0; i < nr; ++i) {
- const int r = ((int32_t *) src1->data)[i];
+ assert(ne0 == nc);
+ assert(ne02 == ne11);
+ assert(nb00 == sizeof(ggml_fp16_t));
+ assert(ggml_nrows(dst) == nr);
- for (int j = 0; j < nc; ++j) {
- ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];
- ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v);
+ // TODO: multi-thread
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+ ggml_fp16_to_fp32_row(
+ (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
+ }
}
}
}
@@ -10382,19 +10418,27 @@ static void ggml_compute_forward_get_rows_f32(
return;
}
- const int nc = src0->ne[0];
- const int nr = ggml_nelements(src1);
+ GGML_TENSOR_BINARY_OP_LOCALS
- assert( dst->ne[0] == nc);
- assert( dst->ne[1] == nr);
- assert(src0->nb[0] == sizeof(float));
+ const int64_t nc = ne00;
+ const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
- for (int i = 0; i < nr; ++i) {
- const int r = ((int32_t *) src1->data)[i];
+ assert(ne0 == nc);
+ assert(ne02 == ne11);
+ assert(nb00 == sizeof(float));
+ assert(ggml_nrows(dst) == nr);
- ggml_vec_cpy_f32(nc,
- (float *) ((char *) dst->data + i*dst->nb[1]),
- (float *) ((char *) src0->data + r*src0->nb[1]));
+ // TODO: multi-thread
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+ ggml_vec_cpy_f32(nc,
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
+ (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
+ }
+ }
}
}
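The three get_rows variants share the same batched indexing; a contiguous plain-C sketch of the F32 case (layout and names are assumptions, not the ggml stride-based code):

    #include <stdint.h>
    #include <string.h>

    // dst[i12][i11][i10][:] = src0[i12][i11][ src1[i12][i11][i10] ][:]
    static void get_rows_f32_ref(
            const float   * src0, // [ne12][ne11][ne01][nc]
            const int32_t * src1, // [ne12][ne11][ne10], values in [0, ne01)
            float         * dst,  // [ne12][ne11][ne10][nc]
            int64_t nc, int64_t ne01, int64_t ne10, int64_t ne11, int64_t ne12) {
        for (int64_t i12 = 0; i12 < ne12; ++i12) {
            for (int64_t i11 = 0; i11 < ne11; ++i11) {
                for (int64_t i10 = 0; i10 < ne10; ++i10) {
                    const int64_t i01 = src1[(i12*ne11 + i11)*ne10 + i10];
                    memcpy(dst  + ((i12*ne11 + i11)*ne10 + i10)*nc,
                           src0 + ((i12*ne11 + i11)*ne01 + i01)*nc,
                           (size_t)nc*sizeof(float));
                }
            }
        }
    }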
@@ -14037,11 +14081,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
} break;
case GGML_OP_MUL_MAT:
{
- ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
} break;
case GGML_OP_MUL_MAT_ID:
{
- ggml_compute_forward_mul_mat_id(params, tensor);
+ ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
} break;
case GGML_OP_OUT_PROD:
{