Diffstat (limited to 'ggml.c')
-rw-r--r--  ggml.c  168
1 file changed, 106 insertions(+), 62 deletions(-)
@@ -4075,17 +4075,18 @@ struct ggml_tensor * ggml_mul_mat(
 
 struct ggml_tensor * ggml_mul_mat_id(
         struct ggml_context * ctx,
-        struct ggml_tensor  * as[],
+        struct ggml_tensor  * const as[],
+        int                   n_as,
         struct ggml_tensor  * ids,
         int                   id,
         struct ggml_tensor  * b) {
 
-    int64_t n_as = ids->ne[0];
-
     GGML_ASSERT(ids->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_is_vector(ids));
+    GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
+    GGML_ASSERT(ids->ne[1] == b->ne[1]);
+    GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
     GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
-    GGML_ASSERT(id >= 0 && id < n_as);
+    GGML_ASSERT(id >= 0 && id < ids->ne[0]);
 
     bool is_node = false;
 
@@ -4097,13 +4098,14 @@ struct ggml_tensor * ggml_mul_mat_id(
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
 
     ggml_set_op_params_i32(result, 0, id);
+    ggml_set_op_params_i32(result, 1, n_as);
 
     result->op   = GGML_OP_MUL_MAT_ID;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = ids;
     result->src[1] = b;
 
-    for (int64_t i = 0; i < n_as; i++) {
+    for (int i = 0; i < n_as; i++) {
         struct ggml_tensor * a = as[i];
         GGML_ASSERT(ggml_are_same_shape(as[0], a));
         GGML_ASSERT(ggml_can_mul_mat(a, b));
@@ -4731,7 +4733,9 @@ struct ggml_tensor * ggml_get_rows(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b) {
-    GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
+    GGML_ASSERT(a->ne[2] == b->ne[1]);
+    GGML_ASSERT(b->ne[3] == 1);
+    GGML_ASSERT(b->type == GGML_TYPE_I32);
 
     bool is_node = false;
 
@@ -4741,7 +4745,7 @@ struct ggml_tensor * ggml_get_rows(
     // TODO: implement non F32 return
     //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
-    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0]);
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
 
     result->op   = GGML_OP_GET_ROWS;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
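The graph-API change above makes the caller pass the expert count explicitly and allows a 2D ids tensor (one selection per column of b). A minimal usage sketch under the new signature, assuming the usual ggml.h declarations; the helper name and tensor shapes here are hypothetical, not part of this commit:

// Hypothetical wrapper: one expert-indexed matmul per selected-expert slot.
// as[]  : n_as same-shaped expert matrices
// ids   : I32 tensor, ne = [n_slots, n_tokens] -- one row id per token and slot
// slot  : which selection to use (the `id` argument, 0 <= slot < ids->ne[0])
// cur   : F32 activations; cur->ne[1] must equal ids->ne[1] per the new asserts
static struct ggml_tensor * moe_matmul_slot(
        struct ggml_context * ctx,
        struct ggml_tensor  * const as[], int n_as,
        struct ggml_tensor  * ids, int slot,
        struct ggml_tensor  * cur) {
    return ggml_mul_mat_id(ctx, as, n_as, ids, slot, cur);
}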
@@ -9504,8 +9508,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];
 
+    // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
+    //       all the experts for each batch element and the processing would become incredibly slow
     // TODO: find the optimal values for these
-    if (ggml_is_contiguous(src0) &&
+    if (dst->op != GGML_OP_MUL_MAT_ID &&
+        ggml_is_contiguous(src0) &&
         ggml_is_contiguous(src1) &&
         //src0->type == GGML_TYPE_F32 &&
         src1->type == GGML_TYPE_F32 &&
@@ -9519,11 +9526,16 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 }
 #endif
 
+// off1 = offset in i11 and i1
+// cne1 = ne11 and ne1
+// in a normal matrix multiplication, off1 = 0 and cne1 = ne1
+// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
 static void ggml_compute_forward_mul_mat(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
+              struct ggml_tensor * dst,
+              int64_t off1, int64_t cne1) {
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
@@ -9591,10 +9603,9 @@ static void ggml_compute_forward_mul_mat(
                 const int64_t i03 = i13/r3;
                 const int64_t i02 = i12/r2;
 
-                const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
-                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-
-                float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                const void  * x = (char *) src0->data + i02*nb02 + i03*nb03;
+                const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
+                float       * d = (float *) ((char *)  dst->data + off1*nb1  + i12*nb2  + i13*nb3);
 
                 if (type != GGML_TYPE_F32) {
                     float * const wdata = params->wdata;
@@ -9611,10 +9622,10 @@ static void ggml_compute_forward_mul_mat(
                 }
 
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                        ne11, ne01, ne10,
-                        1.0f,    y, ne10,
-                                 x, ne00,
-                        0.0f,    d, ne01);
+                        cne1, ne01, ne10,
+                        1.0f,    y, ne10,
+                                 x, ne00,
+                        0.0f,    d, ne01);
             }
         }
 
@@ -9630,6 +9641,7 @@ static void ggml_compute_forward_mul_mat(
             const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
 
             assert(params->wsize >= ne11*ne12*ne13*row_size);
+            assert(src1->type == GGML_TYPE_F32);
 
             for (int64_t i13 = 0; i13 < ne13; ++i13) {
                 for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -9652,7 +9664,7 @@ static void ggml_compute_forward_mul_mat(
     const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
 
     const int64_t nr0 = ne01;           // src0 rows
-    const int64_t nr1 = ne11*ne12*ne13; // src1 rows
+    const int64_t nr1 = cne1*ne12*ne13; // src1 rows
 
     //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
 
@@ -9694,9 +9706,9 @@ static void ggml_compute_forward_mul_mat(
     for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
         for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
             for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                const int64_t i13 = (ir1/(ne12*ne11));
-                const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
-                const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
+                const int64_t i13 = (ir1/(ne12*cne1));
+                const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
+                const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;
 
                 // broadcast src0 into src1
                 const int64_t i03 = i13/r3;
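The off1/cne1 comments above are the crux of the change: the flat src1-row index ir1 now enumerates only the cne1-wide slice of dim 1 and is shifted back by off1 at the end. A standalone sketch of that decomposition, compilable on its own; the extents are made-up toy values, not from this commit:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    // assumed toy extents: 2x2 batch dims, a 3-row slice of src1 starting at row 4
    const int64_t ne13 = 2, ne12 = 2, cne1 = 3, off1 = 4;
    const int64_t nr1 = cne1*ne12*ne13; // matches the new nr1 in the hunk above
    for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
        const int64_t i13 = (ir1/(ne12*cne1));
        const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
        const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1; // actual src1/dst row
        printf("ir1=%2lld -> i13=%lld i12=%lld i11=%lld\n",
               (long long) ir1, (long long) i13, (long long) i12, (long long) i11);
    }
    return 0;
}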
@@ -9736,20 +9748,28 @@ static void ggml_compute_forward_mul_mat(
 
 static void ggml_compute_forward_mul_mat_id(
         const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
               struct ggml_tensor * dst) {
 
-    const struct ggml_tensor * ids = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    const int id = ggml_get_op_params_i32(dst, 0);
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        // during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
+        ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
+        return;
+    }
 
-    const int a_id = ((int32_t *)ids->data)[id];
+    const struct ggml_tensor * ids = src0;
+    const int id   = ggml_get_op_params_i32(dst, 0);
+    const int n_as = ggml_get_op_params_i32(dst, 1);
 
-    GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
+    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+        const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
 
-    const struct ggml_tensor * src0 = dst->src[a_id + 2];
+        GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
-    ggml_compute_forward_mul_mat(params, src0, src1, dst);
+        const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+        ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
+    }
 }
 
 // ggml_compute_forward_out_prod
@@ -10325,21 +10345,30 @@ static void ggml_compute_forward_get_rows_q(
         return;
     }
 
-    const int nc = src0->ne[0];
-    const int nr = ggml_nelements(src1);
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t nc = ne00;
+    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
+
     const enum ggml_type type = src0->type;
     ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
 
-    assert( dst->ne[0] == nc);
-    assert( dst->ne[1] == nr);
-    assert(src0->nb[0] == ggml_type_size(type));
+    assert(ne0  == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == ggml_type_size(type));
+    assert(ggml_nrows(dst) == nr);
 
-    for (int i = 0; i < nr; ++i) {
-        const int r = ((int32_t *) src1->data)[i];
+    // TODO: multi-thread
+    for (int64_t i12 = 0; i12 < ne12; ++i12) {
+        for (int64_t i11 = 0; i11 < ne11; ++i11) {
+            for (int64_t i10 = 0; i10 < ne10; ++i10) {
+                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
-        dequantize_row_q(
-                (const void *) ((char *) src0->data + r*src0->nb[1]),
-                     (float *) ((char *)  dst->data + i*dst->nb[1]), nc);
+                dequantize_row_q(
+                        (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                             (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
+            }
+        }
     }
 }
 
@@ -10354,19 +10383,26 @@ static void ggml_compute_forward_get_rows_f16(
         return;
     }
 
-    const int nc = src0->ne[0];
-    const int nr = ggml_nelements(src1);
+    GGML_TENSOR_BINARY_OP_LOCALS
 
-    assert( dst->ne[0] == nc);
-    assert( dst->ne[1] == nr);
-    assert(src0->nb[0] == sizeof(ggml_fp16_t));
+    const int64_t nc = ne00;
+    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
 
-    for (int i = 0; i < nr; ++i) {
-        const int r = ((int32_t *) src1->data)[i];
+    assert(ne0  == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == sizeof(ggml_fp16_t));
+    assert(ggml_nrows(dst) == nr);
 
-        for (int j = 0; j < nc; ++j) {
-            ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];
-            ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v);
+    // TODO: multi-thread
+    for (int64_t i12 = 0; i12 < ne12; ++i12) {
+        for (int64_t i11 = 0; i11 < ne11; ++i11) {
+            for (int64_t i10 = 0; i10 < ne10; ++i10) {
+                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+                ggml_fp16_to_fp32_row(
+                        (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                             (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
+            }
         }
     }
 }
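The get_rows kernels above (and the f32 variant that follows) all implement the same gather: dst[:, i10, i11, i12] = src0[:, src1[i10, i11, i12], i11, i12]. A standalone plain-array reference of that semantics, assuming contiguous layouts; the function name and sizes are illustrative only:

#include <stdint.h>

/* src0: [nc, ne01, ne11, ne12], src1: [ne10, ne11, ne12] (row ids), dst: [nc, ne10, ne11, ne12] */
void get_rows_f32_ref(const float * src0, const int32_t * src1, float * dst,
                      int64_t nc, int64_t ne01, int64_t ne10, int64_t ne11, int64_t ne12) {
    for (int64_t i12 = 0; i12 < ne12; ++i12) {
        for (int64_t i11 = 0; i11 < ne11; ++i11) {
            for (int64_t i10 = 0; i10 < ne10; ++i10) {
                const int64_t i01 = src1[i10 + i11*ne10 + i12*ne10*ne11]; // row id to gather
                for (int64_t c = 0; c < nc; ++c) {
                    dst [c + i10*nc + i11*nc*ne10 + i12*nc*ne10*ne11] =
                    src0[c + i01*nc + i11*nc*ne01 + i12*nc*ne01*ne11];
                }
            }
        }
    }
}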
@@ -10382,19 +10418,27 @@ static void ggml_compute_forward_get_rows_f32(
         return;
     }
 
-    const int nc = src0->ne[0];
-    const int nr = ggml_nelements(src1);
+    GGML_TENSOR_BINARY_OP_LOCALS
 
-    assert( dst->ne[0] == nc);
-    assert( dst->ne[1] == nr);
-    assert(src0->nb[0] == sizeof(float));
+    const int64_t nc = ne00;
+    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
 
-    for (int i = 0; i < nr; ++i) {
-        const int r = ((int32_t *) src1->data)[i];
+    assert(ne0  == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == sizeof(float));
+    assert(ggml_nrows(dst) == nr);
 
-        ggml_vec_cpy_f32(nc,
-                (float *) ((char *) dst->data + i*dst->nb[1]),
-                (float *) ((char *) src0->data + r*src0->nb[1]));
+    // TODO: multi-thread
+    for (int64_t i12 = 0; i12 < ne12; ++i12) {
+        for (int64_t i11 = 0; i11 < ne11; ++i11) {
+            for (int64_t i10 = 0; i10 < ne10; ++i10) {
+                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+                ggml_vec_cpy_f32(nc,
+                        (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3),
+                        (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
+            }
+        }
     }
 }
 
@@ -14037,11 +14081,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             } break;
         case GGML_OP_MUL_MAT:
             {
-                ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
+                ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
-                ggml_compute_forward_mul_mat_id(params, tensor);
+                ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
            } break;
         case GGML_OP_OUT_PROD:
            {
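One way to sanity-check the dispatch change at the end: GGML_OP_MUL_MAT now runs the full range (off1 = 0, cne1 = ne1), while GGML_OP_MUL_MAT_ID issues one cne1 = 1 call per ids row. A toy standalone program, plain arrays and made-up sizes rather than ggml tensors, showing that the column-by-column slicing covers the same output as one full call:

#include <stdio.h>

/* d[j*m + i] = dot(a row i, b row j) for columns j in [off1, off1 + cne1);
   b stores each logical column contiguously, like ggml's src1 rows */
static void matmul_slice(const float * a, const float * b, float * d,
                         int m, int k, int n, int off1, int cne1) {
    (void) n; // full extent, unused in the slice loop itself
    for (int j = off1; j < off1 + cne1; ++j)
        for (int i = 0; i < m; ++i) {
            float sum = 0.0f;
            for (int l = 0; l < k; ++l)
                sum += a[i*k + l]*b[j*k + l];
            d[j*m + i] = sum;
        }
}

int main(void) {
    const float a[2*3] = {1, 2, 3, 4, 5, 6}; // m = 2 rows of length k = 3
    const float b[2*3] = {1, 0, 1, 0, 1, 0}; // n = 2 columns of length k = 3
    float full[4], sliced[4];
    matmul_slice(a, b, full, 2, 3, 2, 0, 2);       // one full call: off1 = 0, cne1 = n
    for (int j = 0; j < 2; ++j)
        matmul_slice(a, b, sliced, 2, 3, 2, j, 1); // per-column, as mul_mat_id does
    for (int i = 0; i < 4; ++i)
        printf("%g %g\n", full[i], sliced[i]);     // pairs are identical
    return 0;
}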