Diffstat (limited to 'ggml.c')
 ggml.c | 335 +++++++++++++++++++++++++++++----------------------------
 1 file changed, 170 insertions(+), 165 deletions(-)
diff --git a/ggml.c b/ggml.c
index dadb3075..9a787863 100644
--- a/ggml.c
+++ b/ggml.c
@@ -123,6 +123,8 @@ typedef void * thread_ret_t;
#define GGML_GELU_FP16
#define GGML_GELU_QUICK_FP16
#define GGML_SILU_FP16
+// #define GGML_CROSS_ENTROPY_EXP_FP16
+// #define GGML_FLASH_ATTN_EXP_FP16
#define GGML_SOFT_MAX_UNROLL 4
#define GGML_VEC_DOT_UNROLL 2
@@ -186,8 +188,8 @@ typedef void * thread_ret_t;
//
#if defined(_MSC_VER) || defined(__MINGW32__)
-#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
-#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
+#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
+#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
#else
inline static void * ggml_aligned_malloc(size_t size) {
void * aligned_memory = NULL;
@@ -212,8 +214,8 @@ inline static void * ggml_aligned_malloc(size_t size) {
}
return aligned_memory;
}
-#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
-#define GGML_ALIGNED_FREE(ptr) free(ptr)
+#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+#define GGML_ALIGNED_FREE(ptr) free(ptr)
#endif
#define UNUSED GGML_UNUSED
@@ -5857,7 +5859,8 @@ struct ggml_tensor * ggml_rms_norm_inplace(
struct ggml_tensor * ggml_rms_norm_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
- struct ggml_tensor * b) {
+ struct ggml_tensor * b,
+ float eps) {
bool is_node = false;
if (a->grad) {
@@ -5867,6 +5870,8 @@ struct ggml_tensor * ggml_rms_norm_back(
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+ ggml_set_op_params(result, &eps, sizeof(eps));
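+ // stash eps in op_params so ggml_compute_forward_rms_norm_back_f32 can read it back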
+
result->op = GGML_OP_RMS_NORM_BACK;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
@@ -9443,6 +9448,8 @@ static void ggml_compute_forward_div_f32(
#ifdef GGML_USE_ACCELERATE
+ UNUSED(ggml_vec_div_f32);
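+ // the Accelerate path uses vDSP_vdiv below, so silence the unused-function warning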
+
vDSP_vdiv(
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
@@ -10749,7 +10756,8 @@ static void ggml_compute_forward_rms_norm_back_f32(
GGML_TENSOR_BINARY_OP_LOCALS;
- const float eps = 1e-6f; // TODO: make this a parameter
+ float eps;
+ memcpy(&eps, dst->op_params, sizeof(float));
// TODO: optimize
for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -12139,6 +12147,7 @@ static void ggml_compute_forward_soft_max_back_f32(
// dx = J * dy
// dxk = sum_i(Jki * dyi)
// dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
+ // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
// dxk = sum_i(-yk*yi * dyi) + yk*dyk
// dxk = -yk * sum_i(yi * dyi) + yk*dyk
// dxk = -yk * dot(y, dy) + yk*dyk
@@ -13929,7 +13938,7 @@ static void ggml_compute_forward_flash_attn_f32(
vvexpf(S, S, &Mup);
ggml_vec_sum_f32(Mup, &sum, S);
#else
- uint16_t scvt[GGML_SOFT_MAX_UNROLL];
+ uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -13939,9 +13948,13 @@ static void ggml_compute_forward_flash_attn_f32(
if (SS[j] == -INFINITY) {
SS[j] = 0.0f;
} else {
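+ // fp32 expf is the default; define GGML_FLASH_ATTN_EXP_FP16 to use the
+ // fp16 lookup table (table_exp_f16) instead, trading precision for speed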
+#ifndef GGML_FLASH_ATTN_EXP_FP16
+ const float val = expf(SS[j] - max);
+#else
ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
memcpy(&scvt[j], &s, sizeof(uint16_t));
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+#endif
sump[j] += (ggml_float)val;
SS[j] = val;
}
@@ -14519,7 +14532,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
vvexpf(SM, SM, &Mup);
ggml_vec_sum_f32(Mup, &sum, SM);
#else
- uint16_t scvt[GGML_SOFT_MAX_UNROLL];
+ uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -14530,9 +14543,13 @@ static void ggml_compute_forward_flash_attn_back_f32(
if (SR[j] == -INFINITY) {
SW[j] = 0.0f;
} else {
+#ifndef GGML_FLASH_ATTN_EXP_FP16
+ const float val = expf(SR[j] - max);
+#else
ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
memcpy(&scvt[j], &s, sizeof(uint16_t));
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+#endif
sump[j] += (ggml_float)val;
SW[j] = val;
}
@@ -15270,6 +15287,8 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
const int nc = src0->ne[0];
const int nr = ggml_nrows(src0);
+ GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
+
if (params->type == GGML_TASK_INIT) {
if (ith == 0) {
memset(sums, 0, sizeof(float) * (nth + nth * nc));
@@ -15281,7 +15300,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
if (ith == 0) {
float * dp = (float *) dst->data;
ggml_vec_sum_f32(nth, dp, sums);
- dp[0] *= -1.0f;
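+ // negate and average over the nr rows so the loss does not scale with batch size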
+ dp[0] *= -1.0f / (float) nr;
}
return;
}
@@ -15298,7 +15317,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
for (int i1 = ir0; i1 < ir1; i1++) {
float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
- float * st = (float *) params->wdata + nth + ith*nc;
+ float * st = ((float *) params->wdata) + nth + ith*nc;
#ifndef NDEBUG
for (int i = 0; i < nc; ++i) {
@@ -15313,15 +15332,19 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
float max = -INFINITY;
ggml_vec_max_f32(nc, &max, s0);
- uint16_t scvt;
+ uint16_t scvt; UNUSED(scvt);
for (int i = 0; i < nc; i++) {
if (s0[i] == -INFINITY) {
st[i] = 0.0f;
} else {
- // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max);
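+ // fp32 expf is the default; GGML_CROSS_ENTROPY_EXP_FP16 switches to the fp16 table lookup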
+#ifndef GGML_CROSS_ENTROPY_EXP_FP16
+ const float s = s0[i] - max;
+ const float val = expf(s);
+#else
ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
memcpy(&scvt, &s, sizeof(scvt));
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
+#endif
sum += (ggml_float)val;
st[i] = val;
}
@@ -15337,7 +15360,9 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
ggml_vec_log_f32(nc, st, st);
ggml_vec_mul_f32(nc, st, st, s1);
- ggml_vec_sum_f32(nc, sums + ith, st);
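+ // accumulate into the per-thread slot; summing directly into sums[ith] would overwrite previous rows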
+ float st_sum = 0;
+ ggml_vec_sum_f32(nc, &st_sum, st);
+ sums[ith] += st_sum;
#ifndef NDEBUG
for (int i = 0; i < nc; ++i) {
@@ -15387,7 +15412,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
return;
}
- const float eps = 1e-9f;
+ const double eps = 1e-9;
// TODO: handle transposed/permuted matrices
const int64_t nc = src0->ne[0];
@@ -15406,7 +15431,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
- float * sm = (float *) params->wdata + ith*nc;
#ifndef NDEBUG
for (int i = 0; i < nc; ++i) {
@@ -15415,54 +15439,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
assert(!isnan(s1[i]));
}
#endif
- // step by step explanation:
- {
- //float * sums = (float *) params->wdata;
-
- // forward pass with annotated gradients from backward pass
- // (built by going in reverse operation order, adding to gradients of current operation args)
- // st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum
- // from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
- // ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps)
- // ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3]
- // ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3
- // ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1
- // ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]]
- // ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel]
-
- // substitute into grad[st1], because we can reuse softmax_back from this point on
- // grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps))
- // postorder:
- // grad[st1] := softmax(s0)
- // grad[st1] := grad[st1]*(1.0 - eps)
- // grad[st1] := grad[st1] + eps
- // grad[st1] := s1 / grad[st1]
- // grad[st1] := grad[st1]*(1.0-eps)*-grad[cel]
-
- // src0 gradients by going through softmax_back
- // grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
- // from softmax_back:
- // dxk = yk * (dyk - dot(y, dy))
- // dot_y_dy := dot(y, dy)
- // dx := dy
- // dx := dx - dot_y_dy
- // dx := dx * y
- // postorder:
- // dot_st1_dst1 := dot(st1, grad[st1])
- // grad[s0] := grad[st1]
- // grad[s0] := grad[s0] - dot_st1_dst1
- // grad[s0] := grad[s0] * st1
-
- // prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1]
- // sm := softmax(s0)
- // grad[s0] := sm*(1.0 - eps)
- // grad[s0] := grad[s0] + eps
- // grad[s0] := s1 / grad[s0]
- // grad[s0] := grad[s0]*(1.0-eps)*-grad[cel]
- // dot_st1_dst1 := dot(sm, grad[s0])
- // grad[s0] := grad[s0] - dot_st1_dst1
- // grad[s0] := grad[s0] * sm
- }
// soft_max
ggml_float sum = 0.0;
@@ -15470,39 +15446,37 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
float max = -INFINITY;
ggml_vec_max_f32(nc, &max, s0);
- uint16_t scvt;
+ uint16_t scvt; UNUSED(scvt);
for (int i = 0; i < nc; i++) {
if (s0[i] == -INFINITY) {
- sm[i] = 0.0f;
+ ds0[i] = 0.0f;
} else {
- // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max);
+#ifndef GGML_CROSS_ENTROPY_EXP_FP16
+ const float s = s0[i] - max;
+ const float val = expf(s);
+#else
ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
memcpy(&scvt, &s, sizeof(scvt));
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
+#endif
sum += (ggml_float)val;
- sm[i] = val;
+ ds0[i] = val;
}
}
assert(sum > 0.0);
- sum = 1.0/sum;
+ sum = (1.0 - eps)/sum;
}
- float dot_st1_dst1 = 0;
- ggml_vec_scale_f32(nc, sm, sum);
- ggml_vec_cpy_f32 (nc, ds0, sm);
- ggml_vec_scale_f32(nc, ds0, (1.0f - eps));
- ggml_vec_add1_f32 (nc, ds0, ds0, eps);
- ggml_vec_div_f32 (nc, ds0, s1, ds0);
- ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]);
- ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0);
- ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1);
- ggml_vec_mul_f32 (nc, ds0, ds0, sm);
+ // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
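+ // standard softmax + cross-entropy identity: d/dx_k (-sum_i y_i log p_i) = p_k - y_k with p = softmax(x),
+ // assuming the target row s1 sums to 1; here p is smoothed to (1-eps)*p + eps before subtracting s1,
+ // and the /nr matches the averaged forward loss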
+ ggml_vec_scale_f32(nc, ds0, sum);
+ ggml_vec_add1_f32(nc, ds0, ds0, eps);
+ ggml_vec_sub_f32(nc, ds0, ds0, s1);
+ ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
+
#ifndef NDEBUG
for (int i = 0; i < nc; ++i) {
- assert(!isnan(sm[i]));
- assert(!isinf(sm[i]));
assert(!isnan(ds0[i]));
assert(!isinf(ds0[i]));
}
@@ -16057,9 +16031,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
// necessary for llama
if (src0->grad) {
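+ // read eps back from the forward op's parameters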
+ float eps;
+ memcpy(&eps, tensor->op_params, sizeof(float));
+
src0->grad = ggml_add_impl(ctx,
src0->grad,
- ggml_rms_norm_back(ctx, src0, tensor->grad),
+ ggml_rms_norm_back(ctx, src0, tensor->grad, eps),
inplace);
}
} break;
@@ -16827,9 +16804,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
return result;
}
-struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
- struct ggml_cgraph result = *gf;
-
+void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
GGML_ASSERT(gf->n_nodes > 0);
// if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
@@ -16853,15 +16828,19 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
}
}
- for (int i = gf->n_nodes - 1; i >= 0; i--) {
+ for (int i = 0; i < gf->n_nodes; i++) {
struct ggml_tensor * node = gf->nodes[i];
if (node->is_param) {
GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
- ggml_build_forward_expand(&result, node->grad);
+ ggml_build_forward_expand(gb, node->grad);
}
}
+}
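+
+// convenience wrapper: copies the forward graph, then expands the copy with the backward pass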
+struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
+ struct ggml_cgraph result = *gf;
+ ggml_build_backward_expand(ctx, gf, &result, keep);
return result;
}
@@ -17537,10 +17516,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
{
n_tasks = n_threads;
-
- size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks;
-
- work_size = MAX(work_size, cur);
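+ // no extra work buffer needed: the backward pass now writes the softmax directly into ds0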
} break;
case GGML_OP_NONE:
{
@@ -18418,14 +18393,16 @@ static enum ggml_opt_result ggml_opt_adam(
struct ggml_opt_params params,
struct ggml_tensor * f,
struct ggml_cgraph * gf,
- struct ggml_cgraph * gb) {
+ struct ggml_cgraph * gb,
+ ggml_opt_callback callback,
+ void * callback_data) {
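+ // callback (may be NULL) runs before each graph evaluation and may adjust sched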
GGML_ASSERT(ggml_is_scalar(f));
// these will store the parameters we want to optimize
struct ggml_tensor * ps[GGML_MAX_PARAMS];
int np = 0;
- int nx = 0;
+ int64_t nx = 0;
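+ // int64_t: the total parameter count can exceed the range of int for large models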
for (int i = 0; i < gf->n_nodes; ++i) {
if (gf->nodes[i]->is_param) {
GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
@@ -18444,31 +18421,32 @@ static enum ggml_opt_result ggml_opt_adam(
}
// constants
- const float sched = params.adam.sched;
- const float decay = params.adam.decay * sched;
- const float alpha = params.adam.alpha * sched;
+ float sched = params.adam.sched;
+ const float alpha = params.adam.alpha;
+ const float decay = params.adam.decay * alpha;
const float beta1 = params.adam.beta1;
const float beta2 = params.adam.beta2;
const float eps = params.adam.eps;
+ const float gclip = params.adam.gclip;
+ const int decay_min_ndim = params.adam.decay_min_ndim;
- float * x = opt->adam.x->data; // view of the parameters
- float * g1 = opt->adam.g1->data; // gradient
- float * g2 = opt->adam.g2->data; // gradient squared
float * m = opt->adam.m->data; // first moment
float * v = opt->adam.v->data; // second moment
- float * mh = opt->adam.mh->data; // first moment hat
- float * vh = opt->adam.vh->data; // second moment hat
float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
- // update view
- ggml_opt_get_params(np, ps, x);
+ if (callback) {
+ callback(callback_data, &sched);
+ }
// compute the function value
ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f);
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
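+ // plan once and allocate the work buffer in the context; the plan is reused for every iteration below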
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
+ ggml_graph_compute(gb, &cplan);
opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
opt->adam.fx_best = opt->adam.fx_prev;
@@ -18476,6 +18454,9 @@ static enum ggml_opt_result ggml_opt_adam(
pf[opt->iter % params.past] = opt->adam.fx_prev;
}
+ opt->loss_before = opt->adam.fx_prev;
+ opt->loss_after = opt->adam.fx_prev;
+
// initialize
if (opt->just_initialized) {
opt->adam.n_no_improvement = 0;
@@ -18508,50 +18489,55 @@ static enum ggml_opt_result ggml_opt_adam(
UNUSED(t_start_cpu);
{
- // update the gradient
- ggml_opt_get_grad(np, ps, g1);
-
- // m_t = beta1*m_t-1 + (1 - beta1)*g_t
- ggml_vec_scale_f32(nx, m, beta1);
- ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1);
-
- // g2 = g1^2
- ggml_vec_sqr_f32 (nx, g2, g1);
-
- // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2
- ggml_vec_scale_f32(nx, v, beta2);
- ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2);
-
- // m^hat = m_t / (1 - beta1^t)
- // v^hat = v_t / (1 - beta2^t)
- // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1)
- // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1
- // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps)
- // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps)
- // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay)
- ggml_vec_cpy_f32 (nx, mh, m);
- ggml_vec_cpy_f32 (nx, vh, v);
-
- ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter)));
- ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter)));
-
- ggml_vec_sqrt_f32 (nx, vh, vh);
- ggml_vec_acc1_f32 (nx, vh, eps);
-
- ggml_vec_div_f32 (nx, mh, mh, vh);
- ggml_vec_scale_f32(nx, x, 1.0f - decay);
- ggml_vec_sub_f32 (nx, x, x, mh);
+ float gnorm = 1.0f;
+ if (gclip > 0.0f) {
+ // gradient clipping
+ ggml_float sum = 0.0;
+ for (int p = 0; p < np; ++p) {
+ const int64_t ne = ggml_nelements(ps[p]);
+ for (int64_t j = 0; j < ne; ++j) {
+ float g = ggml_get_f32_1d(ps[p]->grad, j);
+ sum += (ggml_float)(g*g);
+ }
+ }
+ ggml_float norm = sqrt(sum);
+ if (norm > (ggml_float) gclip) {
+ gnorm = (float) ((ggml_float) gclip / norm);
+ }
+ }
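+ // fold the Adam bias corrections m/(1-beta1^t), v/(1-beta2^t) and the learning rate into two scalars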
+ const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter));
+ const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter));
+ int64_t i = 0;
+ for (int p = 0; p < np; ++p) {
+ const int64_t ne = ggml_nelements(ps[p]);
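+ // only tensors with at least decay_min_ndim dimensions get weight decay (typically skips 1d biases/norms)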
+ const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
+ for (int64_t j = 0; j < ne; ++j) {
+ float x = ggml_get_f32_1d(ps[p], j);
+ float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm;
+ m[i] = m[i]*beta1 + g*(1.0f - beta1);
+ v[i] = v[i]*beta2 + g*g*(1.0f - beta2);
+ float mh = m[i]*beta1h;
+ float vh = v[i]*beta2h;
+ vh = sqrtf(vh) + eps;
+ x = x*(1.0f - p_decay) - mh/vh;
+ ggml_set_f32_1d(ps[p], j, x);
+ ++i;
+ }
+ }
+ }
- // update the parameters
- ggml_opt_set_params(np, ps, x);
+ if (callback) {
+ callback(callback_data, &sched);
}
ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f);
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
+ ggml_graph_compute(gb, &cplan);
const float fx = ggml_get_f32_1d(f, 0);
+ opt->loss_after = fx;
+
// check convergence
if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
@@ -18620,7 +18606,6 @@ struct ggml_lbfgs_iteration_data {
};
static enum ggml_opt_result linesearch_backtracking(
- struct ggml_context * ctx,
const struct ggml_opt_params * params,
int nx,
float * x,
@@ -18632,8 +18617,11 @@ static enum ggml_opt_result linesearch_backtracking(
struct ggml_tensor * f,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
+ struct ggml_cplan * cplan,
const int np,
- struct ggml_tensor * ps[]) {
+ struct ggml_tensor * ps[],
+ ggml_opt_callback callback,
+ void * callback_data) {
int count = 0;
float width = 0.0f;
@@ -18662,6 +18650,12 @@ static enum ggml_opt_result linesearch_backtracking(
dgtest = params->lbfgs.ftol*dginit;
while (true) {
+ if (callback) {
+ // L-BFGS does not support a learning rate -> ignore the learning schedule
+ float sched = 0;
+ callback(callback_data, &sched);
+ }
+
ggml_vec_cpy_f32(nx, x, xp);
ggml_vec_mad_f32(nx, x, d, *step);
@@ -18672,7 +18666,7 @@ static enum ggml_opt_result linesearch_backtracking(
ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f);
- ggml_graph_compute_with_ctx(ctx, gb, params->n_threads);
+ ggml_graph_compute(gb, cplan);
ggml_opt_get_grad(np, ps, g);
@@ -18732,7 +18726,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
struct ggml_opt_params params,
struct ggml_tensor * f,
struct ggml_cgraph * gf,
- struct ggml_cgraph * gb) {
+ struct ggml_cgraph * gb,
+ ggml_opt_callback callback,
+ void * callback_data) {
if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
@@ -18764,6 +18760,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
opt->iter = iter;
}
+ struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
+
float * x = opt->lbfgs.x->data; // current parameters
float * xp = opt->lbfgs.xp->data; // previous parameters
float * g = opt->lbfgs.g->data; // current gradient
@@ -18785,6 +18785,12 @@ static enum ggml_opt_result ggml_opt_lbfgs(
float * lm_s = opt->lbfgs.lms->data;
float * lm_y = opt->lbfgs.lmy->data;
+ if (callback) {
+ // L-BFGS does not support a learning rate -> ignore the learning schedule
+ float sched = 0;
+ callback(callback_data, &sched);
+ }
+
// evaluate the function value and its gradient
{
ggml_opt_set_params(np, ps, x);
@@ -18792,11 +18798,14 @@ static enum ggml_opt_result ggml_opt_lbfgs(
ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f);
- ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
+ ggml_graph_compute(gb, &cplan);
ggml_opt_get_grad(np, ps, g);
fx = ggml_get_f32_1d(f, 0);
+
+ opt->loss_before = fx;
+ opt->loss_after = fx;
}
// search direction = -gradient
@@ -18851,7 +18860,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
ggml_vec_cpy_f32(nx, xp, x);
ggml_vec_cpy_f32(nx, gp, g);
- ls = linesearch_backtracking(ctx, &params, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps);
+ ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data);
if (ls < 0) {
// linesearch failed - go back to the previous point and return
@@ -18861,6 +18870,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
return ls;
}
+ opt->loss_after = fx;
+
ggml_vec_norm_f32(nx, &xnorm, x);
ggml_vec_norm_f32(nx, &gnorm, g);
@@ -18918,7 +18929,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
// ys = y^t \cdot s -> 1 / \rho.
// yy = y^t \cdot y.
//
- ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0] *nx]);
+ ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
lm_ys[end[0]] = ys;
@@ -18981,13 +18992,15 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
.adam = {
.n_iter = 10000,
.sched = 1.000f,
- .decay = 0.001f,
+ .decay = 0.0f,
+ .decay_min_ndim = 2,
.alpha = 0.001f,
.beta1 = 0.9f,
.beta2 = 0.999f,
.eps = 1e-8f,
.eps_f = 1e-5f,
.eps_g = 1e-3f,
+ .gclip = 0.0f,
},
};
} break;
@@ -19037,23 +19050,13 @@ GGML_API void ggml_opt_init(
switch (opt->params.type) {
case GGML_OPT_ADAM:
{
- opt->adam.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
- opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
- opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
- opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
- opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
opt->adam.pf = params.past > 0
? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)
: NULL;
- ggml_set_zero(opt->adam.x);
- ggml_set_zero(opt->adam.g1);
- ggml_set_zero(opt->adam.g2);
ggml_set_zero(opt->adam.m);
ggml_set_zero(opt->adam.v);
- ggml_set_zero(opt->adam.mh);
- ggml_set_zero(opt->adam.vh);
if (opt->adam.pf) {
ggml_set_zero(opt->adam.pf);
}
@@ -19137,7 +19140,7 @@ enum ggml_opt_result ggml_opt_resume(
*gf = ggml_build_forward (f);
*gb = ggml_build_backward(ctx, gf, true);
- return ggml_opt_resume_g(ctx, opt, f, gf, gb);
+ return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
}
enum ggml_opt_result ggml_opt_resume_g(
@@ -19145,7 +19148,9 @@ enum ggml_opt_result ggml_opt_resume_g(
struct ggml_opt_context * opt,
struct ggml_tensor * f,
struct ggml_cgraph * gf,
- struct ggml_cgraph * gb) {
+ struct ggml_cgraph * gb,
+ ggml_opt_callback callback,
+ void * callback_data) {
// build forward + backward compute graphs
enum ggml_opt_result result = GGML_OPT_OK;
@@ -19153,11 +19158,11 @@ enum ggml_opt_result ggml_opt_resume_g(
switch (opt->params.type) {
case GGML_OPT_ADAM:
{
- result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb);
+ result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
} break;
case GGML_OPT_LBFGS:
{
- result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb);
+ result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
} break;
}
@@ -19612,7 +19617,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// read the kv pairs
{
- ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
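+ // plain malloc is enough here: gguf_free releases kv/infos with free()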
+ ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
struct gguf_kv * kv = &ctx->kv[i];
@@ -19695,7 +19700,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// read the tensor infos
{
- ctx->infos = GGML_ALIGNED_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
+ ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
@@ -19896,7 +19901,7 @@ void gguf_free(struct gguf_context * ctx) {
}
}
- GGML_ALIGNED_FREE(ctx->kv);
+ free(ctx->kv);
}
if (ctx->infos) {
@@ -19908,7 +19913,7 @@ void gguf_free(struct gguf_context * ctx) {
}
}
- GGML_ALIGNED_FREE(ctx->infos);
+ free(ctx->infos);
}
GGML_ALIGNED_FREE(ctx);