author     Georgi Gerganov <ggerganov@gmail.com>    2023-12-21 23:20:49 +0200
committer  GitHub <noreply@github.com>              2023-12-21 23:20:49 +0200
commit     afefa319f1f59b002dfa0d1ef407a2c74bd9770b (patch)
tree       a6923e0a6214293d88957cd11e25943f2c0fb80a /llama.cpp
parent     769a7bc85eaa44e3d7eadf39abfeff7bb0b9cc2f (diff)
ggml : change ggml_scale to take a float instead of tensor (#4573)

* ggml : change ggml_scale to take a float instead of tensor

* ggml : fix CPU implementation

* tests : fix test-grad0

ggml-ci
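The API change is mechanical but touches every call site: callers no longer allocate a 1-element F32 tensor to carry the scale factor, they pass the float directly (the LoRA hunk at the end of this diff shows the pattern). A minimal before/after sketch from a caller's point of view; the tensor names and the 0.125f value are illustrative, not taken from the diff:

    // before: the scale factor had to be wrapped in a 1-element F32 tensor
    struct ggml_tensor * scale_tensor = ggml_new_f32(ctx, 0.125f);
    cur = ggml_scale(ctx, cur, scale_tensor);

    // after: ggml_scale takes the float directly, so no extra tensor
    // needs to be created, named, and filled at graph-build time
    cur = ggml_scale(ctx, cur, 0.125f);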
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp | 119
1 file changed, 17 insertions(+), 102 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index ba970ce8..d6c19244 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4032,13 +4032,12 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * wo,
struct ggml_tensor * wo_b,
struct ggml_tensor * q_cur,
- struct ggml_tensor * kq_scale,
struct ggml_tensor * kq_mask,
int64_t n_ctx,
int32_t n_tokens,
int32_t n_kv,
float max_alibi_bias,
- float scale,
+ float kq_scale,
const llm_build_cb & cb,
int il) {
const int64_t n_embd = hparams.n_embd;
@@ -4086,7 +4085,7 @@ static struct ggml_tensor * llm_build_kqv(
kq = ggml_soft_max(ctx, kq);
cb(kq, "kq_soft_max", il);
} else {
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, scale);
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
cb(kq, "kq_soft_max_ext", il);
}
@@ -4231,10 +4230,6 @@ struct llm_build_context {
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
cb(inp_pos, "inp_pos", -1);
- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
cb(KQ_mask, "KQ_mask", -1);
@@ -4295,7 +4290,7 @@ struct llm_build_context {
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, model.layers[il].bo,
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -4416,10 +4411,6 @@ struct llm_build_context {
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
cb(inp_pos, "inp_pos", -1);
- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
cb(KQ_mask, "KQ_mask", -1);
@@ -4478,7 +4469,7 @@ struct llm_build_context {
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, NULL,
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -4536,10 +4527,6 @@ struct llm_build_context {
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
cb(inp_pos, "inp_pos", -1);
- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
cb(KQ_mask, "KQ_mask", -1);
@@ -4602,7 +4589,7 @@ struct llm_build_context {
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, NULL,
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -4659,10 +4646,6 @@ struct llm_build_context {
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
cb(inp_pos, "inp_pos", -1);
- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
cb(KQ_mask, "KQ_mask", -1);
@@ -4702,7 +4685,7 @@ struct llm_build_context {
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, model.layers[il].bo,
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -4759,10 +4742,6 @@ struct llm_build_context {
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
cb(inp_pos, "inp_pos", -1);
- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
cb(KQ_mask, "KQ_mask", -1);
@@ -4911,7 +4890,7 @@ struct llm_build_context {
// TODO: not tested, could be broken
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, model.layers[il].bo,
- Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -4965,10 +4944,6 @@ struct llm_build_context {
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
cb(inpL, "inp_embd", -1);
- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
cb(KQ_mask, "KQ_mask", -1);
@@ -5002,7 +4977,7 @@ struct llm_build_context {
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, NULL,
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -5056,10 +5031,6 @@ struct llm_build_context {
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
cb(inpL, "inp_embd", -1);
- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
cb(KQ_mask, "KQ_mask", -1);
@@ -5099,7 +5070,7 @@ struct llm_build_context {
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, model.layers[il].bo,
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -5150,10 +5121,6 @@ struct llm_build_context {
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
cb(inpL, "inp_embd", -1);
- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
cb(KQ_mask, "KQ_mask", -1);
@@ -5193,7 +5160,7 @@ struct llm_build_context {
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, NULL,
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -5253,10 +5220,6 @@ struct llm_build_context {
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
cb(inp_pos, "inp_pos", -1);
- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
cb(KQ_mask, "KQ_mask", -1);
@@ -5306,7 +5269,7 @@ struct llm_build_context {
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, NULL,
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -5366,10 +5329,6 @@ struct llm_build_context {
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
cb(inp_pos, "inp_pos", -1);
- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
cb(KQ_mask, "KQ_mask", -1);
@@ -5423,7 +5382,7 @@ struct llm_build_context {
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, NULL,
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@@ -5482,14 +5441,6 @@ struct llm_build_context {
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
cb(inp_pos, "inp_pos", -1);
- // Q_scale
- struct ggml_tensor * Q_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(Q_scale, "Q_scale", -1);
-
- // KQ_scale
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
- cb(KQ_scale, "KQ_scale", -1);
-
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
cb(KQ_mask, "KQ_mask", -1);
@@ -5531,7 +5482,9 @@ struct llm_build_context {
);
cb(Qcur, "Qcur", il);
- Qcur = ggml_scale(ctx0, Qcur, Q_scale);
+ // with phi2, we scale the Q to avoid precision issues
+ // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom(
@@ -5544,7 +5497,7 @@ struct llm_build_context {
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, model.layers[il].bo,
- Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
cb(cur, "kqv_out", il);
}
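A note on the phi2 hunks above: phi2 is the one architecture that scales Q up front rather than letting ggml_soft_max_ext apply the 1/sqrt(n_embd_head) factor to KQ, which is why llm_build_kqv receives kq_scale = 1.0f here. The two placements are mathematically equivalent, since a scalar commutes with the matrix product:

    softmax((s*Q) * K^T) = softmax(s * (Q * K^T)),   s = 1/sqrt(n_embd_head)

but applying s to Q first keeps the intermediate KQ values small, avoiding the precision issues the in-code comment references. The commit itself only changes how s is delivered: as a float constant at the call site, instead of through the Q_scale/KQ_scale input tensors that llama_build_graph used to fill in (removed further down).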
@@ -5681,8 +5634,6 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
{ "pos_embd", OFFLOAD_FUNC_NR },
{ "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
- { "Q_scale", OFFLOAD_FUNC_NOP },
- { "KQ_scale", OFFLOAD_FUNC_NOP },
{ "KQ_mask", OFFLOAD_FUNC_FRC },
{ "K_shift", OFFLOAD_FUNC_FRC },
@@ -5784,8 +5735,6 @@ static struct ggml_cgraph * llama_build_graph(
bool alloc_inp_tokens = false;
bool alloc_inp_embd = false;
bool alloc_inp_pos = false;
- bool alloc_inp_Q_scale = false;
- bool alloc_inp_KQ_scale = false;
bool alloc_inp_KQ_mask = false;
bool alloc_inp_K_shift = false;
@@ -5849,37 +5798,6 @@ static struct ggml_cgraph * llama_build_graph(
alloc_inp_pos = true;
}
- if (!alloc_inp_Q_scale && strcmp(name, "Q_scale") == 0) {
- ggml_allocr_alloc(lctx.alloc, cur);
-
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- const int64_t n_embd_head = model.hparams.n_embd_head();
- float f = 1.0f/sqrtf(float(n_embd_head));
- ggml_backend_tensor_set(cur, &f, 0, sizeof(f));
- }
-
- alloc_inp_Q_scale = true;
- }
-
- if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) {
- ggml_allocr_alloc(lctx.alloc, cur);
-
- if (!ggml_allocr_is_measure(lctx.alloc)) {
- const int64_t n_embd_head = model.hparams.n_embd_head();
- float f;
- if (model.arch == LLM_ARCH_PHI2) {
- // with phi2, we scale the Q to avoid precision issues
- // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
- f = 1.0f;
- } else {
- f = 1.0f/sqrtf(float(n_embd_head));
- }
- ggml_backend_tensor_set(cur, &f, 0, sizeof(f));
- }
-
- alloc_inp_KQ_scale = true;
- }
-
if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
ggml_allocr_alloc(lctx.alloc, cur);
@@ -9054,10 +8972,7 @@ static int llama_apply_lora_from_file_internal(
ggml_set_name(BA, "BA");
if (scaling != 1.0f) {
- ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx.get(), scaling);
- ggml_set_name(scale_tensor, "scale_tensor");
-
- BA = ggml_scale_inplace(lora_ctx.get(), BA, scale_tensor);
+ BA = ggml_scale_inplace(lora_ctx.get(), BA, scaling);
offload_func(BA);
ggml_set_name(BA, "BA_scaled");
}