path: root/llama.h
Diffstat (limited to 'llama.h')

 llama.h | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)
diff --git a/llama.h b/llama.h
index 75fe391e..3f1becd7 100644
--- a/llama.h
+++ b/llama.h
@@ -106,6 +106,14 @@ extern "C" {
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };

+    enum llama_rope_scaling_type {
+        LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
+        LLAMA_ROPE_SCALING_NONE        =  0,
+        LLAMA_ROPE_SCALING_LINEAR      =  1,
+        LLAMA_ROPE_SCALING_YARN        =  2,
+        LLAMA_ROPE_SCALING_MAX_VALUE   = LLAMA_ROPE_SCALING_YARN,
+    };
+
     typedef struct llama_token_data {
         llama_token id;    // token id
         float       logit; // log-odds of the token
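The enum above is plain C, so a front end can map user input onto it directly. A minimal sketch of translating a command-line string into the enum; the helper is hypothetical and not part of this patch:

    #include <stdint.h>
    #include <string.h>
    #include "llama.h"

    // Hypothetical helper: map a user-facing name to the new enum.
    // Unknown or missing names fall back to the model's own setting.
    static int8_t rope_scaling_type_from_name(const char * name) {
        if (name == NULL)                return LLAMA_ROPE_SCALING_UNSPECIFIED;
        if (strcmp(name, "none")   == 0) return LLAMA_ROPE_SCALING_NONE;
        if (strcmp(name, "linear") == 0) return LLAMA_ROPE_SCALING_LINEAR;
        if (strcmp(name, "yarn")   == 0) return LLAMA_ROPE_SCALING_YARN;
        return LLAMA_ROPE_SCALING_UNSPECIFIED;
    }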
@@ -172,10 +180,16 @@ extern "C" {
         uint32_t n_batch;         // prompt processing maximum batch size
         uint32_t n_threads;       // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
+        int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`

         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float rope_freq_base;  // RoPE base frequency, 0 = from model
-        float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
+        float    rope_freq_base;   // RoPE base frequency, 0 = from model
+        float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
+        float    yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
+        float    yarn_attn_factor; // YaRN magnitude scaling factor
+        float    yarn_beta_fast;   // YaRN low correction dim
+        float    yarn_beta_slow;   // YaRN high correction dim
+        uint32_t yarn_orig_ctx;    // YaRN original context size

         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
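A hedged usage sketch of the new fields, assuming only what the header above declares plus the existing llama_context_default_params(); make_yarn_params is a hypothetical helper and the values are illustrative, not recommendations:

    #include <math.h>
    #include "llama.h"

    // Hypothetical helper: build context params with YaRN scaling enabled.
    static struct llama_context_params make_yarn_params(void) {
        struct llama_context_params p = llama_context_default_params();
        p.rope_scaling_type = LLAMA_ROPE_SCALING_YARN;
        p.rope_freq_base    = 0.0f;  // 0 = take the base frequency from the model
        p.rope_freq_scale   = 0.0f;  // 0 = take the scaling factor from the model
        p.yarn_ext_factor   = NAN;   // NaN = take the extrapolation mix from the model
        p.yarn_attn_factor  = 1.0f;  // magnitude scaling factor (illustrative)
        p.yarn_beta_fast    = 32.0f; // low correction dim (illustrative)
        p.yarn_beta_slow    = 1.0f;  // high correction dim (illustrative)
        p.yarn_orig_ctx     = 0;     // original context size (illustrative)
        return p;
    }

Note that the YaRN fields carry over the sentinel convention already used by rope_freq_base and rope_freq_scale: a zero (or, for yarn_ext_factor, NaN) value defers to whatever is stored in the model file.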