Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  33  ++++++++++++++++++++-------------
1 file changed, 20 insertions(+), 13 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index 1e00ea4a..e74fd723 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3704,22 +3704,28 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
     cb(kq, "kq", il);
 
-    kq = ggml_scale(ctx, kq, kq_scale);
-    cb(kq, "kq_scaled", il);
-
     if (max_alibi_bias > 0.0f) {
-        // TODO: n_head or n_head_kv
-        // TODO: K-shift is likely not working
-        // TODO: change to ggml_add
-        kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
-        cb(kq, "kq_scaled_alibi", il);
-    }
+        // temporary branch until we figure out how to handle ggml_alibi through ggml_add
+        kq = ggml_scale(ctx, kq, kq_scale);
+        cb(kq, "kq_scaled", il);
+
+        if (max_alibi_bias > 0.0f) {
+            // TODO: n_head or n_head_kv
+            // TODO: K-shift is likely not working
+            // TODO: change to ggml_add
+            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
+            cb(kq, "kq_scaled_alibi", il);
+        }
 
-    kq = ggml_add(ctx, kq, kq_mask);
-    cb(kq, "kq_masked", il);
+        kq = ggml_add(ctx, kq, kq_mask);
+        cb(kq, "kq_masked", il);
 
-    kq = ggml_soft_max(ctx, kq);
-    cb(kq, "kq_soft_max", il);
+        kq = ggml_soft_max(ctx, kq);
+        cb(kq, "kq_soft_max", il);
+    } else {
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));
+        cb(kq, "kq_soft_max_ext", il);
+    }
 
     // split cached v into n_head heads
     struct ggml_tensor * v =
@@ -5041,6 +5047,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
     { "kq_scaled_alibi",            OFFLOAD_FUNC_KQ  },
    { "kq_masked",                  OFFLOAD_FUNC_KQ  },
    { "kq_soft_max",                OFFLOAD_FUNC_V   },
+    { "kq_soft_max_ext",            OFFLOAD_FUNC_V   },
    { "v",                          OFFLOAD_FUNC_V   },
    { "kqv",                        OFFLOAD_FUNC_V   },
    { "kqv_merged",                 OFFLOAD_FUNC_V   },