summary | refs | log | tree | commit | diff
path: root/llama.cpp
diff options
context:
space:
mode:
authorGeorgi Gerganov <ggerganov@gmail.com>2023-12-01 10:51:24 +0200
committerGitHub <noreply@github.com>2023-12-01 10:51:24 +0200
commitef47ec18da469423c276b683dd9b5741cee7023e (patch)
treeec3b4780dbe8f629425de499b298e8eadfd1aa4d /llama.cpp
parent1d144112c0fbbb4ecc07dbcf4f05a380148bd6de (diff)
ggml : add ggml_soft_max_ext (#4256)
* metal : implement soft_max_ext
* cuda : implement soft_max_ext
* ggml : implement soft_max_ext (CPU)
* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32
* ggml : update soft max cpu
* cuda : do warp-based block reduce
* cuda : increase max block size to 1024
* cuda : fix warp reduction initialization of shared mem
* metal : warp-based reduction for soft max kernel
* metal : warp-based reduce for rms_norm
* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp | 33
1 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index 1e00ea4a..e74fd723 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3704,22 +3704,28 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
cb(kq, "kq", il);
- kq = ggml_scale(ctx, kq, kq_scale);
- cb(kq, "kq_scaled", il);
-
if (max_alibi_bias > 0.0f) {
- // TODO: n_head or n_head_kv
- // TODO: K-shift is likely not working
- // TODO: change to ggml_add
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
- cb(kq, "kq_scaled_alibi", il);
- }
+ // temporary branch until we figure out how to handle ggml_alibi through ggml_add
+ kq = ggml_scale(ctx, kq, kq_scale);
+ cb(kq, "kq_scaled", il);
+
+ if (max_alibi_bias > 0.0f) {
+ // TODO: n_head or n_head_kv
+ // TODO: K-shift is likely not working
+ // TODO: change to ggml_add
+ kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
+ cb(kq, "kq_scaled_alibi", il);
+ }
- kq = ggml_add(ctx, kq, kq_mask);
- cb(kq, "kq_masked", il);
+ kq = ggml_add(ctx, kq, kq_mask);
+ cb(kq, "kq_masked", il);
- kq = ggml_soft_max(ctx, kq);
- cb(kq, "kq_soft_max", il);
+ kq = ggml_soft_max(ctx, kq);
+ cb(kq, "kq_soft_max", il);
+ } else {
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));
+ cb(kq, "kq_soft_max_ext", il);
+ }
// split cached v into n_head heads
struct ggml_tensor * v =
@@ -5041,6 +5047,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
{ "kq_scaled_alibi", OFFLOAD_FUNC_KQ },
{ "kq_masked", OFFLOAD_FUNC_KQ },
{ "kq_soft_max", OFFLOAD_FUNC_V },
+ { "kq_soft_max_ext", OFFLOAD_FUNC_V },
{ "v", OFFLOAD_FUNC_V },
{ "kqv", OFFLOAD_FUNC_V },
{ "kqv_merged", OFFLOAD_FUNC_V },