summary refs log tree commit diff
path: root/ggml.c
diff options
context:
space:
mode:
Diffstat (limited to 'ggml.c')
-rw-r--r--  ggml.c  309
1 files changed, 40 insertions, 269 deletions
diff --git a/ggml.c b/ggml.c
index 093d38d0..4ee5d24a 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2185,7 +2185,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"SOFT_MAX_BACK",
"ROPE",
"ROPE_BACK",
- "ALIBI",
"CLAMP",
"CONV_TRANSPOSE_1D",
"IM2COL",
@@ -2227,7 +2226,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CROSS_ENTROPY_LOSS_BACK",
};
-static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
+static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -2276,7 +2275,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"soft_max_back(x)",
"rope(x)",
"rope_back(x)",
- "alibi(x)",
"clamp(x)",
"conv_transpose_1d(x)",
"im2col(x)",
@@ -2318,7 +2316,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"cross_entropy_loss_back(x,y)",
};
-static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
+static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -5646,7 +5644,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * mask,
- struct ggml_tensor * pos,
float scale,
float max_bias,
bool inplace) {
@@ -5660,18 +5657,8 @@ static struct ggml_tensor * ggml_soft_max_impl(
GGML_ASSERT(mask->ne[1] >= a->ne[1]);
}
- if (pos) {
- GGML_ASSERT(ggml_is_vector(pos));
- GGML_ASSERT(pos->type == GGML_TYPE_F16 || pos->type == GGML_TYPE_F32);
- GGML_ASSERT(pos->ne[0] == a->ne[0]);
- }
-
- if (pos && mask) {
- GGML_ASSERT(pos->type == mask->type);
- }
-
if (max_bias > 0.0f) {
- GGML_ASSERT(pos);
+ GGML_ASSERT(mask);
}
bool is_node = false;
@@ -5689,7 +5676,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = mask;
- result->src[2] = pos;
return result;
}
@@ -5697,23 +5683,22 @@ static struct ggml_tensor * ggml_soft_max_impl(
struct ggml_tensor * ggml_soft_max(
struct ggml_context * ctx,
struct ggml_tensor * a) {
- return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false);
+ return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, false);
}
struct ggml_tensor * ggml_soft_max_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a) {
- return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true);
+ return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, true);
}
struct ggml_tensor * ggml_soft_max_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * mask,
- struct ggml_tensor * pos,
float scale,
float max_bias) {
- return ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false);
+ return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
}
// ggml_soft_max_back
@@ -5928,37 +5913,6 @@ struct ggml_tensor * ggml_rope_back(
return result;
}
-// ggml_alibi
-
-struct ggml_tensor * ggml_alibi(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- int n_past,
- int n_head,
- float bias_max) {
- GGML_ASSERT(n_past >= 0);
- bool is_node = false;
-
- if (a->grad) {
- GGML_ASSERT(false); // TODO: implement backward
- is_node = true;
- }
-
- // TODO: when implement backward, fix this:
- //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
- struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-
- int32_t op_params[3] = { n_past, n_head };
- memcpy(op_params + 2, &bias_max, sizeof(float));
- ggml_set_op_params(result, op_params, sizeof(op_params));
-
- result->op = GGML_OP_ALIBI;
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
- result->src[0] = a;
-
- return result;
-}
-
// ggml_clamp
struct ggml_tensor * ggml_clamp(
@@ -6486,9 +6440,11 @@ struct ggml_tensor * ggml_flash_attn_ext(
struct ggml_tensor * k,
struct ggml_tensor * v,
struct ggml_tensor * mask,
- float scale) {
+ float scale,
+ float max_bias) {
GGML_ASSERT(ggml_can_mul_mat(k, q));
// TODO: check if vT can be multiplied by (k*qT)
+
if (mask) {
GGML_ASSERT(ggml_is_contiguous(mask));
GGML_ASSERT(mask->ne[2] == 1);
@@ -6498,6 +6454,10 @@ struct ggml_tensor * ggml_flash_attn_ext(
//GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
}
+ if (max_bias > 0.0f) {
+ GGML_ASSERT(mask);
+ }
+
bool is_node = false;
if (q->grad || k->grad || v->grad) {
@@ -6508,7 +6468,7 @@ struct ggml_tensor * ggml_flash_attn_ext(
int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
- float params[] = { scale };
+ float params[] = { scale, max_bias };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_FLASH_ATTN_EXT;
@@ -6528,7 +6488,7 @@ void ggml_flash_attn_ext_set_prec(
const int32_t prec_i32 = (int32_t) prec;
- ggml_set_op_params_i32(a, 1, prec_i32); // scale is on first pos
+ ggml_set_op_params_i32(a, 2, prec_i32); // scale is on first pos, max_bias on second
}
// ggml_flash_ff
@@ -13333,7 +13293,6 @@ static void ggml_compute_forward_soft_max_f32(
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
- const struct ggml_tensor * src2 = dst->src[2];
assert(ggml_is_contiguous(dst));
assert(ggml_are_same_shape(src0, dst));
@@ -13359,8 +13318,8 @@ static void ggml_compute_forward_soft_max_f32(
// TODO: is this supposed to be ceil instead of floor?
// https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
- const uint32_t n_head_kv = ne02;
- const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv));
+ const uint32_t n_head = ne02;
+ const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
@@ -13377,13 +13336,13 @@ static void ggml_compute_forward_soft_max_f32(
float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
- // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
- ggml_fp16_t * pos_f16 = src2 ? (ggml_fp16_t *) src2->data : src0->data;
- float * pos_f32 = src2 ? (float *) src2->data : src0->data;
-
- const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
+ const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
for (int i1 = ir0; i1 < ir1; i1++) {
+ // ALiBi
+ const uint32_t h = (i1/ne01)%ne02; // head
+ const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
@@ -13396,27 +13355,11 @@ static void ggml_compute_forward_soft_max_f32(
if (mp_f32) {
if (use_f16) {
for (int i = 0; i < nc; ++i) {
- wp[i] += GGML_FP16_TO_FP32(mp_f16[i]);
+ wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
}
} else {
for (int i = 0; i < nc; ++i) {
- wp[i] += mp_f32[i];
- }
- }
- }
-
- // ALiBi bias
- if (max_bias > 0.0f) {
- const uint32_t h = (i1/ne01)%ne02; // head
- const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
-
- if (use_f16) {
- for (int i = 0; i < nc; ++i) {
- wp[i] += slope*GGML_FP16_TO_FP32(pos_f16[i]);
- }
- } else {
- for (int i = 0; i < nc; ++i) {
- wp[i] += slope*pos_f32[i];
+ wp[i] += slope*mp_f32[i];
}
}
}
@@ -13578,178 +13521,6 @@ static void ggml_compute_forward_soft_max_back(
}
}
-// ggml_compute_forward_alibi
-
-static void ggml_compute_forward_alibi_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- assert(params->ith == 0);
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
- //const int n_past = ((int32_t *) dst->op_params)[0];
- const int n_head = ((int32_t *) dst->op_params)[1];
- float max_bias;
- memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
- const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
- const int64_t ne1 = src0->ne[1]; // seq_len_without_past
- const int64_t ne2 = src0->ne[2]; // n_head -> this is k
- //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
-
- const int64_t n = ggml_nrows(src0);
- const int64_t ne2_ne3 = n/ne1; // ne2*ne3
-
- const size_t nb0 = src0->nb[0];
- const size_t nb1 = src0->nb[1];
- const size_t nb2 = src0->nb[2];
- //const int nb3 = src0->nb[3];
-
- GGML_ASSERT(nb0 == sizeof(float));
- GGML_ASSERT(n_head == ne2);
-
- // add alibi to src0 (KQ_scaled)
- const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
- const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
- const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
- for (int64_t k = 0; k < ne2_ne3; k++) {
- // TODO: k*nb2 or k*nb3
- float m_k;
-
- if (k < n_heads_log2_floor) {
- m_k = powf(m0, k + 1);
- } else {
- m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
- }
-
- for (int64_t i = 0; i < ne0; i++) {
- for (int64_t j = 0; j < ne1; j++) {
- float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
- float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
- pdst[0] = i * m_k + src[0];
- }
- }
- }
-}
-
-static void ggml_compute_forward_alibi_f16(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- assert(params->ith == 0);
-
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
- //const int n_past = ((int32_t *) dst->op_params)[0];
- const int n_head = ((int32_t *) dst->op_params)[1];
- float max_bias;
- memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
- const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
- const int ne1 = src0->ne[1]; // seq_len_without_past
- const int ne2 = src0->ne[2]; // n_head -> this is k
- //const int ne3 = src0->ne[3]; // 1 -> bsz
-
- const int n = ggml_nrows(src0);
- const int ne2_ne3 = n/ne1; // ne2*ne3
-
- const int nb0 = src0->nb[0];
- const int nb1 = src0->nb[1];
- const int nb2 = src0->nb[2];
- //const int nb3 = src0->nb[3];
-
- GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
- //GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
- GGML_ASSERT(n_head == ne2);
-
- // add alibi to src0 (KQ_scaled)
- const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
- const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
- const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
- for (int k = 0; k < ne2_ne3; k++) {
- // TODO: k*nb2 or k*nb3
- float m_k;
-
- if (k < n_heads_log2_floor) {
- m_k = powf(m0, k + 1);
- } else {
- m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
- }
-
- for (int i = 0; i < ne0; i++) {
- for (int j = 0; j < ne1; j++) {
- ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
- float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
-
- // we return F32
- pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
- }
- }
- }
-}
-
-static void ggml_compute_forward_alibi(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
-
- switch (src0->type) {
- case GGML_TYPE_F16:
- {
- ggml_compute_forward_alibi_f16(params, dst);
- } break;
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_alibi_f32(params, dst);
- } break;
- case GGML_TYPE_BF16:
- case GGML_TYPE_Q4_0:
- case GGML_TYPE_Q4_1:
- case GGML_TYPE_Q5_0:
- case GGML_TYPE_Q5_1:
- case GGML_TYPE_Q8_0:
- case GGML_TYPE_Q8_1:
- case GGML_TYPE_Q2_K:
- case GGML_TYPE_Q3_K:
- case GGML_TYPE_Q4_K:
- case GGML_TYPE_Q5_K:
- case GGML_TYPE_Q6_K:
- case GGML_TYPE_IQ2_XXS:
- case GGML_TYPE_IQ2_XS:
- case GGML_TYPE_IQ3_XXS:
- case GGML_TYPE_IQ1_S:
- case GGML_TYPE_IQ1_M:
- case GGML_TYPE_IQ4_NL:
- case GGML_TYPE_IQ4_XS:
- case GGML_TYPE_IQ3_S:
- case GGML_TYPE_IQ2_S:
- case GGML_TYPE_Q8_K:
- case GGML_TYPE_I8:
- case GGML_TYPE_I16:
- case GGML_TYPE_I32:
- case GGML_TYPE_I64:
- case GGML_TYPE_F64:
- case GGML_TYPE_COUNT:
- {
- GGML_ASSERT(false);
- } break;
- }
-}
-
// ggml_compute_forward_clamp
static void ggml_compute_forward_clamp_f32(
@@ -15763,8 +15534,17 @@ static void ggml_compute_forward_flash_attn_ext_f16(
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
- float scale = 1.0f;
- memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+ float scale = 1.0f;
+ float max_bias = 0.0f;
+
+ memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+ memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
+
+ const uint32_t n_head = neq2;
+ const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
+
+ const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
// loop over n_batch and n_head
for (int ir = ir0; ir < ir1; ++ir) {
@@ -15773,6 +15553,9 @@ static void ggml_compute_forward_flash_attn_ext_f16(
const int iq2 = (ir - iq3*neq2*neq1)/neq1;
const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
+ const uint32_t h = iq2; // head
+ const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
float S = 0.0f;
float M = -INFINITY;
@@ -15796,7 +15579,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
// loop over n_kv and n_head_kv
// ref: https://arxiv.org/pdf/2112.05682.pdf
for (int64_t ic = 0; ic < nek1; ++ic) {
- const float mv = mp ? GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
+ const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
if (mv == -INFINITY) {
continue;
}
@@ -15867,7 +15650,7 @@ static void ggml_compute_forward_flash_attn_ext(
const struct ggml_tensor * v,
const struct ggml_tensor * mask,
struct ggml_tensor * dst) {
- switch (dst->op_params[1]) {
+ switch (dst->op_params[2]) {
case GGML_PREC_DEFAULT:
case GGML_PREC_F32:
{
@@ -17630,10 +17413,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_rope_back(params, tensor);
} break;
- case GGML_OP_ALIBI:
- {
- ggml_compute_forward_alibi(params, tensor);
- } break;
case GGML_OP_CLAMP:
{
ggml_compute_forward_clamp(params, tensor);
@@ -18652,10 +18431,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
zero_table);
}
} break;
- case GGML_OP_ALIBI:
- {
- GGML_ASSERT(false); // TODO: not implemented
- } break;
case GGML_OP_CLAMP:
{
GGML_ASSERT(false); // TODO: not implemented
@@ -19428,10 +19203,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
{
n_tasks = n_threads;
} break;
- case GGML_OP_ALIBI:
- {
- n_tasks = 1; //TODO
- } break;
case GGML_OP_CLAMP:
{
n_tasks = 1; //TODO