author | Georgi Gerganov <ggerganov@gmail.com> | 2024-01-08 11:18:32 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-01-08 11:18:32 +0200 |
commit | 52531fdff88764282c1b233174721aab8347252d (patch) | |
tree | d4aeec20b4b634f5de3bd9839df507dee85c9e1f /common/common.h | |
parent | b0034d93ce2949ce7d9c098ca02e56f66cd484e2 (diff) | |
main : add self-extend support (#4815)
* examples : add passkey test
* passkey : better prints
* passkey : select pass key pos from CLI
* passkey : simplify n_past logic
* llama : "self-extend"-like context extension
* passkey : add comment
* main : add Self-Extend support
* llama : add comment about llama_kv_cache_seq_div
Diffstat (limited to 'common/common.h')
-rw-r--r-- | common/common.h | 2 |
1 file changed, 2 insertions, 0 deletions
```diff
diff --git a/common/common.h b/common/common.h
index 9659aa04..e2bbfc25 100644
--- a/common/common.h
+++ b/common/common.h
@@ -62,6 +62,8 @@ struct gpt_params {
     int32_t main_gpu                        = 0;     // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0};   // how split tensors should be distributed across GPUs
     int32_t n_beams                         = 0;     // if non-zero then use beam search of given width.
+    int32_t grp_attn_n                      = 1;     // group-attention factor
+    int32_t grp_attn_w                      = 512;   // group-attention width
     float   rope_freq_base                  = 0.0f;  // RoPE base frequency
     float   rope_freq_scale                 = 0.0f;  // RoPE frequency scaling factor
     float   yarn_ext_factor                 = -1.0f; // YaRN extrapolation mix factor
```
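For context, here is a minimal sketch of how these two parameters are consumed. With `grp_attn_n > 1`, the generation loop in `examples/main` periodically compresses a `grp_attn_w`-wide window of the KV cache by the group-attention factor, using `llama_kv_cache_seq_shift` together with the `llama_kv_cache_seq_div` call that the commit message mentions. This is an illustrative reconstruction of that loop, not a verbatim copy of the PR; the helper function and the names `ga_n`/`ga_w`/`ga_i` are assumptions following the example code's conventions.

```cpp
#include "llama.h"

// Sketch (assumption: mirrors the self-extend loop added to examples/main
// in #4815; not copied verbatim from the PR).
//
// ga_n = grp_attn_n (group-attention factor), ga_w = grp_attn_w (width),
// ga_i = start of the not-yet-grouped tail of the cache.
// Note: the example code expects ga_w to be a multiple of ga_n so the
// position arithmetic below stays integral.
static void self_extend_kv(struct llama_context * ctx, int & n_past, int & ga_i,
                           const int ga_n, const int ga_w) {
    if (ga_n == 1) {
        return; // self-extend disabled (the default, per the diff above)
    }

    while (n_past >= ga_i + ga_w) {
        const int ib = (ga_n*ga_i)/ga_w;             // windows already grouped
        const int bd = (ga_w/ga_n)*(ga_n - 1);       // shift applied per grouped window
        const int dd = (ga_w/ga_n) - ib*bd - ga_w;   // correction for the remaining tail

        // shift the window [ga_i, n_past) forward by ib*bd positions
        llama_kv_cache_seq_shift(ctx, 0, ga_i, n_past, ib*bd);
        // divide positions inside the window by ga_n ("grouped" attention)
        llama_kv_cache_seq_div  (ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
        // shift the tail back so cache positions stay contiguous
        llama_kv_cache_seq_shift(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);

        n_past -= bd;
        ga_i   += ga_w/ga_n;
    }
}
```

The net effect is that tokens beyond the trained context are mapped onto a smaller range of RoPE positions (each group of `ga_n` tokens shares a position), which is what lets `main` keep generating past the model's native context length without fine-tuning.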