author | Georgi Gerganov <ggerganov@gmail.com> | 2024-01-08 11:18:32 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-01-08 11:18:32 +0200 |
commit | 52531fdff88764282c1b233174721aab8347252d (patch) | |
tree | d4aeec20b4b634f5de3bd9839df507dee85c9e1f /common/common.h | |
parent | b0034d93ce2949ce7d9c098ca02e56f66cd484e2 (diff) | |
main : add self-extend support (#4815)
* examples : add passkey test
* passkey : better prints
* passkey : select pass key pos from CLI
* passkey : simplify n_past logic
* llama : "self-extend"-like context extension
* passkey : add comment
* main : add Self-Extend support
* llama : add comment about llama_kv_cache_seq_div
Diffstat (limited to 'common/common.h')
-rw-r--r-- | common/common.h | 2 |
1 file changed, 2 insertions, 0 deletions
```diff
diff --git a/common/common.h b/common/common.h
index 9659aa04..e2bbfc25 100644
--- a/common/common.h
+++ b/common/common.h
@@ -62,6 +62,8 @@ struct gpt_params {
     int32_t main_gpu                        = 0;     // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0};   // how split tensors should be distributed across GPUs
     int32_t n_beams                         = 0;     // if non-zero then use beam search of given width.
+    int32_t grp_attn_n                      = 1;     // group-attention factor
+    int32_t grp_attn_w                      = 512;   // group-attention width
     float   rope_freq_base                  = 0.0f;  // RoPE base frequency
     float   rope_freq_scale                 = 0.0f;  // RoPE frequency scaling factor
     float   yarn_ext_factor                 = -1.0f; // YaRN extrapolation mix factor
```
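For context, here is a minimal sketch of how these two parameters are consumed. With `grp_attn_n > 1`, the generation loop in `examples/main` periodically compresses a `grp_attn_w`-wide window of the KV cache by the group-attention factor, using `llama_kv_cache_seq_shift` together with the `llama_kv_cache_seq_div` call that the commit message mentions. This is an illustrative reconstruction of that loop, not a verbatim copy of the PR; the helper function and the names `ga_n`/`ga_w`/`ga_i` are assumptions following the example code's conventions.

```cpp
#include "llama.h"

// Sketch (assumption: mirrors the self-extend loop added to examples/main
// in #4815; not copied verbatim from the PR).
//
// ga_n = grp_attn_n (group-attention factor), ga_w = grp_attn_w (width),
// ga_i = start of the not-yet-grouped tail of the cache.
// Note: the example code expects ga_w to be a multiple of ga_n so the
// position arithmetic below stays integral.
static void self_extend_kv(struct llama_context * ctx, int & n_past, int & ga_i,
                           const int ga_n, const int ga_w) {
    if (ga_n == 1) {
        return; // self-extend disabled (the default, per the diff above)
    }

    while (n_past >= ga_i + ga_w) {
        const int ib = (ga_n*ga_i)/ga_w;             // windows already grouped
        const int bd = (ga_w/ga_n)*(ga_n - 1);       // shift applied per grouped window
        const int dd = (ga_w/ga_n) - ib*bd - ga_w;   // correction for the remaining tail

        // shift the window [ga_i, n_past) forward by ib*bd positions
        llama_kv_cache_seq_shift(ctx, 0, ga_i, n_past, ib*bd);
        // divide positions inside the window by ga_n ("grouped" attention)
        llama_kv_cache_seq_div  (ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
        // shift the tail back so cache positions stay contiguous
        llama_kv_cache_seq_shift(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);

        n_past -= bd;
        ga_i   += ga_w/ga_n;
    }
}
```

The net effect is that tokens beyond the trained context are mapped onto a smaller range of RoPE positions (each group of `ga_n` tokens shares a position), which is what lets `main` keep generating past the model's native context length without fine-tuning.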