author     slaren <slarengh@gmail.com>  2023-12-13 13:04:25 +0100
committer  GitHub <noreply@github.com>  2023-12-13 14:04:25 +0200
commit     799a1cb13b0b1b560ab0ceff485caed68faa8f1f (patch)
tree       18073e9ffe1f7b69ca584a9cd2cc039e79e6dc63 /ggml.c
parent     fecac45658a99eddc4d6e36ba0310ca8f87a77f0 (diff)
llama : add Mixtral support (#4406)
* convert : support Mixtral as LLAMA arch
* convert : fix n_ff typo
* llama : model loading
* ggml : sync latest ggml_mul_mat_id
* llama : update graph to support MoE
* llama : fix cur -> cur_expert
* llama : first working version
* llama : fix expert weighting in the FFN
* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)
* ggml : add n_as argument to ggml_mul_mat_id
* ggml : fix ggml_get_rows to take into account ne02 / ne11
* metal : add more general support for ggml_get_rows + tests
* llama : add basic support for offloading moe with CUDA
* metal : add/mul/div use general kernel when src1 not cont
* metal : reduce the kernel launches for ggml_mul_mat_id
* ggml : get_rows : support non-contiguous tensors with gaps, generalize up to 3D
* ggml : update get_rows f16 and q
* cuda : support non-contiguous src1 in get_rows
* llama : offload missing ffn_moe_silu
* metal : fix ggml_get_rows to work with non-cont src1
* metal : add indirect mat-vec kernels for all quantization types
* llama : do not quantize expert gating tensors
* llama : add n_expert and n_expert_used to hparams + change quants
* test-backend-ops : add moe test
* cuda : fix get_rows when ncols is odd
* convert : determine n_ctx correctly
* metal : fix ggml_mul_mat_id for F32
* test-backend-ops : make experts more evenly probable (test_moe)
* test-backend-ops : cleanup, add moe test for batches
* test-backend-ops : add cpy from f32 -> all types test
* test-backend-ops : fix dequantize block offset
* llama : fix hard-coded number of experts
* test-backend-ops : simplify and disable slow tests to avoid CI timeout
* test-backend-ops : disable MOE test with thread sanitizer
* cuda : fix mul_mat_id with multi gpu
* convert : use 1e6 rope_freq_base for mixtral
* convert : fix style
* convert : support safetensors format
* gguf-py : bump version
* metal : add cpy f16 -> f32 kernel
* metal : fix binary ops for ne10 % 4 != 0
* test-backend-ops : add one more sum_rows test
* ggml : do not use BLAS with ggml_mul_mat_id
* convert-hf : support for mixtral-instruct (#4428)
* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct
* convert : use sentencepiece tokenizer for Mixtral-instruct
* convert : make flake8 happy
* metal : fix soft_max kernels
  ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92
* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
Diffstat (limited to 'ggml.c')
-rw-r--r--  ggml.c  168
1 file changed, 106 insertions(+), 62 deletions(-)
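
Before the diff itself, a hedged usage sketch of the reworked operator: the wrapper function, tensor names, and shapes below are illustrative placeholders and are not part of this patch; only the ggml_mul_mat_id() call reflects the new signature and the shape asserts visible in the first hunk.

// Hypothetical usage sketch, not part of the patch: names and shapes are
// placeholders; only the ggml_mul_mat_id() call follows the new signature.
#include "ggml.h"

static struct ggml_tensor * build_moe_slot(
        struct ggml_context * ctx,
        struct ggml_tensor  * const experts[], // n_as expert matrices, all with the same shape
        int                   n_as,            // how many experts are passed to the op
        struct ggml_tensor  * selected,        // I32, ne = [n_expert_used, n_tokens], e.g. from a top-k over router logits
        int                   slot,            // which selection to use, 0 <= slot < selected->ne[0]
        struct ggml_tensor  * cur) {           // activations, cur->ne[1] == n_tokens == selected->ne[1]
    // produces one output column per token; for token i, the rows of
    // experts[selected[slot, i]] are multiplied against column i of cur
    return ggml_mul_mat_id(ctx, experts, n_as, selected, slot, cur);
}

In an MoE feed-forward block, one such call would presumably be made per used expert slot, with the per-slot results weighted by the gating probabilities and summed, consistent with the "update graph to support MoE" and "fix expert weighting in the FFN" entries above.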
diff --git a/ggml.c b/ggml.c
index eb7989dc..66658ff4 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4075,17 +4075,18 @@ struct ggml_tensor * ggml_mul_mat(
struct ggml_tensor * ggml_mul_mat_id(
struct ggml_context * ctx,
- struct ggml_tensor * as[],
+ struct ggml_tensor * const as[],
+ int n_as,
struct ggml_tensor * ids,
int id,
struct ggml_tensor * b) {
- int64_t n_as = ids->ne[0];
-
GGML_ASSERT(ids->type == GGML_TYPE_I32);
- GGML_ASSERT(ggml_is_vector(ids));
+ GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
+ GGML_ASSERT(ids->ne[1] == b->ne[1]);
+ GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
- GGML_ASSERT(id >= 0 && id < n_as);
+ GGML_ASSERT(id >= 0 && id < ids->ne[0]);
bool is_node = false;
@@ -4097,13 +4098,14 @@ struct ggml_tensor * ggml_mul_mat_id(
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
ggml_set_op_params_i32(result, 0, id);
+ ggml_set_op_params_i32(result, 1, n_as);
result->op = GGML_OP_MUL_MAT_ID;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = ids;
result->src[1] = b;
- for (int64_t i = 0; i < n_as; i++) {
+ for (int i = 0; i < n_as; i++) {
struct ggml_tensor * a = as[i];
GGML_ASSERT(ggml_are_same_shape(as[0], a));
GGML_ASSERT(ggml_can_mul_mat(a, b));
@@ -4731,7 +4733,9 @@ struct ggml_tensor * ggml_get_rows(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
- GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
+ GGML_ASSERT(a->ne[2] == b->ne[1]);
+ GGML_ASSERT(b->ne[3] == 1);
+ GGML_ASSERT(b->type == GGML_TYPE_I32);
bool is_node = false;
@@ -4741,7 +4745,7 @@ struct ggml_tensor * ggml_get_rows(
// TODO: implement non F32 return
//struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
- struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0]);
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
result->op = GGML_OP_GET_ROWS;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -9504,8 +9508,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
+ // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
+ // all the experts for each batch element and the processing would become incredibly slow
// TODO: find the optimal values for these
- if (ggml_is_contiguous(src0) &&
+ if (dst->op != GGML_OP_MUL_MAT_ID &&
+ ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
//src0->type == GGML_TYPE_F32 &&
src1->type == GGML_TYPE_F32 &&
@@ -9519,11 +9526,16 @@ static bool ggml_compute_forward_mul_mat_use_blas(
}
#endif
+// off1 = offset in i11 and i1
+// cne1 = ne11 and ne1
+// in a normal matrix multiplication, off1 = 0 and cne1 = ne1
+// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
static void ggml_compute_forward_mul_mat(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
+ struct ggml_tensor * dst,
+ int64_t off1, int64_t cne1) {
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
@@ -9591,10 +9603,9 @@ static void ggml_compute_forward_mul_mat(
const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2;
- const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
- const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-
- float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+ const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+ const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
+ float * d = (float *) ((char *) dst->data + off1*nb1 + i12*nb2 + i13*nb3);
if (type != GGML_TYPE_F32) {
float * const wdata = params->wdata;
@@ -9611,10 +9622,10 @@ static void ggml_compute_forward_mul_mat(
}
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
- ne11, ne01, ne10,
- 1.0f, y, ne10,
- x, ne00,
- 0.0f, d, ne01);
+ cne1, ne01, ne10,
+ 1.0f, y, ne10,
+ x, ne00,
+ 0.0f, d, ne01);
}
}
@@ -9630,6 +9641,7 @@ static void ggml_compute_forward_mul_mat(
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
assert(params->wsize >= ne11*ne12*ne13*row_size);
+ assert(src1->type == GGML_TYPE_F32);
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -9652,7 +9664,7 @@ static void ggml_compute_forward_mul_mat(
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
const int64_t nr0 = ne01; // src0 rows
- const int64_t nr1 = ne11*ne12*ne13; // src1 rows
+ const int64_t nr1 = cne1*ne12*ne13; // src1 rows
//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
@@ -9694,9 +9706,9 @@ static void ggml_compute_forward_mul_mat(
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
- const int64_t i13 = (ir1/(ne12*ne11));
- const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
- const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
+ const int64_t i13 = (ir1/(ne12*cne1));
+ const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
+ const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;
// broadcast src0 into src1
const int64_t i03 = i13/r3;
@@ -9736,20 +9748,28 @@ static void ggml_compute_forward_mul_mat(
static void ggml_compute_forward_mul_mat_id(
const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
- const struct ggml_tensor * ids = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- const int id = ggml_get_op_params_i32(dst, 0);
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ // during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
+ ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
+ return;
+ }
- const int a_id = ((int32_t *)ids->data)[id];
+ const struct ggml_tensor * ids = src0;
+ const int id = ggml_get_op_params_i32(dst, 0);
+ const int n_as = ggml_get_op_params_i32(dst, 1);
- GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+ const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
- const struct ggml_tensor * src0 = dst->src[a_id + 2];
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
- ggml_compute_forward_mul_mat(params, src0, src1, dst);
+ const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+ ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
+ }
}
// ggml_compute_forward_out_prod
@@ -10325,21 +10345,30 @@ static void ggml_compute_forward_get_rows_q(
return;
}
- const int nc = src0->ne[0];
- const int nr = ggml_nelements(src1);
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+ const int64_t nc = ne00;
+ const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
+
const enum ggml_type type = src0->type;
ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
- assert( dst->ne[0] == nc);
- assert( dst->ne[1] == nr);
- assert(src0->nb[0] == ggml_type_size(type));
+ assert(ne0 == nc);
+ assert(ne02 == ne11);
+ assert(nb00 == ggml_type_size(type));
+ assert(ggml_nrows(dst) == nr);
- for (int i = 0; i < nr; ++i) {
- const int r = ((int32_t *) src1->data)[i];
+ // TODO: multi-thread
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
- dequantize_row_q(
- (const void *) ((char *) src0->data + r*src0->nb[1]),
- (float *) ((char *) dst->data + i*dst->nb[1]), nc);
+ dequantize_row_q(
+ (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
+ }
+ }
}
}
@@ -10354,19 +10383,26 @@ static void ggml_compute_forward_get_rows_f16(
return;
}
- const int nc = src0->ne[0];
- const int nr = ggml_nelements(src1);
+ GGML_TENSOR_BINARY_OP_LOCALS
- assert( dst->ne[0] == nc);
- assert( dst->ne[1] == nr);
- assert(src0->nb[0] == sizeof(ggml_fp16_t));
+ const int64_t nc = ne00;
+ const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
- for (int i = 0; i < nr; ++i) {
- const int r = ((int32_t *) src1->data)[i];
+ assert(ne0 == nc);
+ assert(ne02 == ne11);
+ assert(nb00 == sizeof(ggml_fp16_t));
+ assert(ggml_nrows(dst) == nr);
- for (int j = 0; j < nc; ++j) {
- ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];
- ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v);
+ // TODO: multi-thread
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+ ggml_fp16_to_fp32_row(
+ (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
+ }
}
}
}
@@ -10382,19 +10418,27 @@ static void ggml_compute_forward_get_rows_f32(
return;
}
- const int nc = src0->ne[0];
- const int nr = ggml_nelements(src1);
+ GGML_TENSOR_BINARY_OP_LOCALS
- assert( dst->ne[0] == nc);
- assert( dst->ne[1] == nr);
- assert(src0->nb[0] == sizeof(float));
+ const int64_t nc = ne00;
+ const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
- for (int i = 0; i < nr; ++i) {
- const int r = ((int32_t *) src1->data)[i];
+ assert(ne0 == nc);
+ assert(ne02 == ne11);
+ assert(nb00 == sizeof(float));
+ assert(ggml_nrows(dst) == nr);
- ggml_vec_cpy_f32(nc,
- (float *) ((char *) dst->data + i*dst->nb[1]),
- (float *) ((char *) src0->data + r*src0->nb[1]));
+ // TODO: multi-thread
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+ ggml_vec_cpy_f32(nc,
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
+ (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
+ }
+ }
}
}
@@ -14037,11 +14081,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
} break;
case GGML_OP_MUL_MAT:
{
- ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
} break;
case GGML_OP_MUL_MAT_ID:
{
- ggml_compute_forward_mul_mat_id(params, tensor);
+ ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
} break;
case GGML_OP_OUT_PROD:
{
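
A standalone reference sketch (plain C, independent of ggml's striding, threading, and quantization paths; all sizes are made up for illustration) of the off1/cne1 convention added to ggml_compute_forward_mul_mat: only dst/src1 columns [off1, off1 + cne1) are touched, which is what lets ggml_compute_forward_mul_mat_id compute a single token's column against the expert selected for that token.

// Standalone illustration of the off1/cne1 slicing (not ggml code).
#include <stdio.h>

// a: ne01 x ne00 row-major; b: ne1 columns, each a contiguous run of ne00 floats
// (as ggml stores src1); d: ne1 columns of ne01 floats. Only columns
// [off1, off1 + cne1) of b and d are used; a normal matmul is off1 = 0, cne1 = ne1.
static void mul_mat_slice(const float * a, const float * b, float * d,
                          int ne00, int ne01, int ne1, int off1, int cne1) {
    (void) ne1; // kept only to mirror the ggml parameter names
    for (int i1 = off1; i1 < off1 + cne1; ++i1) {
        for (int i0 = 0; i0 < ne01; ++i0) {
            float sum = 0.0f;
            for (int k = 0; k < ne00; ++k) {
                sum += a[i0*ne00 + k] * b[i1*ne00 + k];
            }
            d[i1*ne01 + i0] = sum;
        }
    }
}

int main(void) {
    const float a[4] = { 1, 2, 3, 4 };          // 2x2 "expert" matrix
    const float b[6] = { 1, 0,  0, 1,  1, 1 };  // 3 columns of length 2
    float d[6] = { 0 };
    mul_mat_slice(a, b, d, 2, 2, 3, /*off1=*/1, /*cne1=*/1); // compute only column 1
    printf("%g %g\n", d[2], d[3]);              // prints: 2 4
    return 0;
}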
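
In the same spirit, a standalone sketch (again not ggml code, sizes invented for the example) of the generalized get_rows indexing: for every (i12, i11, i10), the row ids[i12][i11][i10] is read from the i11-th 2D slab of src0, matching the new assert ne02 == ne11 in the hunks above.

// Standalone illustration of the generalized get_rows indexing (not ggml code).
#include <stdint.h>
#include <stdio.h>

// src0: [ne00, ne01, ne02] floats, ids: [ne10, ne11, ne12] int32,
// dst: [ne00, ne10, ne11, ne12] floats, with ne02 == ne11.
static void get_rows_ref(const float * src0, const int32_t * ids, float * dst,
                         int64_t ne00, int64_t ne01, int64_t ne02,
                         int64_t ne10, int64_t ne11, int64_t ne12) {
    (void) ne02; // must equal ne11
    for (int64_t i12 = 0; i12 < ne12; ++i12) {
        for (int64_t i11 = 0; i11 < ne11; ++i11) {
            for (int64_t i10 = 0; i10 < ne10; ++i10) {
                const int64_t i01 = ids[(i12*ne11 + i11)*ne10 + i10];          // row index into slab i11
                const float * row = src0 + (i11*ne01 + i01)*ne00;              // slab i11, row i01
                float       * out = dst  + ((i12*ne11 + i11)*ne10 + i10)*ne00;
                for (int64_t i00 = 0; i00 < ne00; ++i00) {
                    out[i00] = row[i00];
                }
            }
        }
    }
}

int main(void) {
    // 2 slabs of 2 rows x 3 floats; pick 2 rows per slab
    const float   src0[12] = { 0, 1, 2,  3, 4, 5,   6, 7, 8,  9, 10, 11 };
    const int32_t ids[4]   = { 1, 0,  0, 1 };
    float dst[12];
    get_rows_ref(src0, ids, dst, 3, 2, 2, 2, 2, 1);
    printf("%g %g %g\n", dst[0], dst[1], dst[2]); // prints: 3 4 5 (slab 0, row 1)
    return 0;
}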