diff options
author | slaren <slarengh@gmail.com> | 2024-04-18 15:18:48 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-18 15:18:48 +0200 |
commit | 0d56246f4b9764158525d894b96606f6163c53a8 (patch) | |
tree | 43e57dfbbde67b701020fc3e2ac885e846925d26 /ggml.c | |
parent | 03c0946d73c63ea73e1d85015b7088298443d438 (diff) |
ggml : group all experts in a single ggml_mul_mat_id (#6505)
* ggml : group all experts in a single ggml_mul_mat_id
cuda : improve mmid row copy
* cuda : fix bin bcast with non-cont src0
* test-backend-ops : only run all mul mat tests for base types
* llama : disable moe offloading with SYCL
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'ggml.c')
-rw-r--r-- | ggml.c | 123 |
1 files changed, 62 insertions, 61 deletions
@@ -4578,21 +4578,32 @@ void ggml_mul_mat_set_prec( // ggml_mul_mat_id -// NOTE: id will be removed in the future and instead all the experts listed in ids will be computed -// this will allow computing all the used experts in a single matrix multiplication +/* + c = ggml_mul_mat_id(ctx, as, b, ids); + + as -> [cols, rows, n_expert] + ids -> [n_experts_used, n_tokens] (i32) + b -> [cols, n_expert_used, n_tokens] + c -> [cols, n_expert_used, n_tokens] + + in b, n_experts_used can be broadcasted to match the n_expert_used of ids + + c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids +*/ struct ggml_tensor * ggml_mul_mat_id( struct ggml_context * ctx, struct ggml_tensor * as, - struct ggml_tensor * ids, - int id, - struct ggml_tensor * b) { - + struct ggml_tensor * b, + struct ggml_tensor * ids) { + GGML_ASSERT(!ggml_is_transposed(as)); GGML_ASSERT(ids->type == GGML_TYPE_I32); + + GGML_ASSERT(as->ne[3] == 1); // as is 3d (one matrix per expert) + GGML_ASSERT(b->ne[3] == 1); // b is 3d GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d - GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row - GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]); - GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id + GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat + GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast bool is_node = false; @@ -4600,11 +4611,9 @@ struct ggml_tensor * ggml_mul_mat_id( is_node = true; } - const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] }; + const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - ggml_set_op_params_i32(result, 0, id); - result->op = GGML_OP_MUL_MAT_ID; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = as; @@ -11009,11 +11018,6 @@ static void ggml_compute_forward_mul_mat_id( enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == ggml_type_size(type)); GGML_ASSERT(nb10 == ggml_type_size(src1->type)); @@ -11024,22 +11028,21 @@ static void ggml_compute_forward_mul_mat_id( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - // broadcast is not supported with mmid - assert(ne12 == 1); - assert(ne13 == 1); - // row groups - const int id = ggml_get_op_params_i32(dst, 0); - const int n_as = src0->ne[2]; + const int n_ids = ids->ne[0]; // n_expert_used + const int n_as = ne02; // n_expert char * wdata_src1_end = (src1->type == vec_dot_type) ? (char *) params->wdata : (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t)); - int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] - int64_t * matrix_rows = matrix_row_counts + n_as; // [n_as][ne11] + struct mmid_row_mapping { + int32_t i1; + int32_t i2; + }; - #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] + int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] + struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11] if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { @@ -11065,13 +11068,18 @@ static void ggml_compute_forward_mul_mat_id( // initialize matrix_row_counts memset(matrix_row_counts, 0, n_as*sizeof(int64_t)); +#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)] + // group rows by src0 matrix - for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { - const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]); + for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) { + for (int id = 0; id < n_ids; ++id) { + const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]); + + assert(i02 >= 0 && i02 < n_as); - GGML_ASSERT(row_id >= 0 && row_id < n_as); - MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01; - matrix_row_counts[row_id] += 1; + MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1}; + matrix_row_counts[i02] += 1; + } } return; @@ -11089,15 +11097,13 @@ static void ggml_compute_forward_mul_mat_id( continue; } - size_t src0_offset = cur_a*src0->nb[2]; + const char * src0_cur = (const char *) src0->data + cur_a*nb02; const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const size_t row_size = ggml_row_size(vec_dot_type, ne10); - const int64_t nr0 = ne01; // src0 rows - const int64_t nr1 = cne1*ne12*ne13; // src1 rows - - //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); + const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = cne1; // src1 rows // distribute the thread work across the inner or outer loop based on which one is larger @@ -11116,13 +11122,11 @@ static void ggml_compute_forward_mul_mat_id( const int64_t ir110 = dr1*ith1; const int64_t ir111 = MIN(ir110 + dr1, nr1); - //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111); - // threads with no work simply yield (not sure if it helps) - if (ir010 >= ir011 || ir110 >= ir111) { - sched_yield(); - continue; - } + //if (ir010 >= ir011 || ir110 >= ir111) { + // sched_yield(); + // continue; + //} // block-tiling attempt const int64_t blck_0 = 16; @@ -11134,20 +11138,16 @@ static void ggml_compute_forward_mul_mat_id( for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { - const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix - const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1; - const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1); - const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11); + const int64_t _i12 = ir1; // logical row index for this expert - // broadcast src0 into src1 - //const int64_t i03 = i13/r3; - //const int64_t i02 = i12/r2; + struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12); + const int id = row_mapping.i1; // selected expert index - const int64_t i1 = i11; - const int64_t i2 = i12; - const int64_t i3 = i13; + const int64_t i11 = id % ne11; + const int64_t i12 = row_mapping.i2; // row index in src1 - const char * src0_row = (const char *) src0->data + src0_offset; + const int64_t i1 = id; // selected expert index + const int64_t i2 = i12; // row // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using @@ -11155,25 +11155,26 @@ static void ggml_compute_forward_mul_mat_id( // TODO: this is a bit of a hack, we should probably have a better way to handle this const char * src1_col = (const char *) wdata + (src1_cont || src1->type != vec_dot_type - ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size - : (i11*nb11 + i12*nb12 + i13*nb13)); + ? (i11 + i12*ne11)*row_size + : (i11*nb11 + i12*nb12)); - float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); + float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2)); //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); //} for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { - vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1); + vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1); } + memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); } } } } - #undef MMID_MATRIX_ROW +#undef MMID_MATRIX_ROW } // ggml_compute_forward_out_prod @@ -18512,7 +18513,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa const int n_as = src0->ne[2]; cur += GGML_PAD(cur, sizeof(int64_t)); // align cur += n_as * sizeof(int64_t); // matrix_row_counts - cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows + cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows } break; case GGML_OP_OUT_PROD: { @@ -20938,12 +20939,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ok = ok && cur != NULL; - ggml_set_name(cur, ctx->infos[i].name.data); - if (!ok) { break; } + ggml_set_name(cur, ctx->infos[i].name.data); + // point the data member to the appropriate location in the binary blob using the tensor infos if (!params.no_alloc) { //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file |