diff options
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 60 |
1 files changed, 24 insertions, 36 deletions
@@ -929,23 +929,22 @@ static const size_t kB = 1024; static const size_t MB = kB*kB; static const size_t GB = kB*kB*kB; -// default hparams (LLaMA 7B) struct llama_hparams { - uint32_t n_vocab = 32000; - uint32_t n_ctx_train = 2048; // the context size used during training - uint32_t n_ctx = 512; // the context size used during inference - uint32_t n_embd = 4096; - uint32_t n_head = 32; - uint32_t n_head_kv = 32; - uint32_t n_layer = 32; - uint32_t n_rot = 64; - uint32_t n_ff = 11008; - - float f_norm_eps = 1e-5; - float f_norm_rms_eps = 1e-5; - - float rope_freq_base = 10000.0f; - float rope_freq_scale = 1.0f; + uint32_t n_vocab; + uint32_t n_ctx_train; // context size the model was trained on + uint32_t n_ctx; // context size used during inference + uint32_t n_embd; + uint32_t n_head; + uint32_t n_head_kv; + uint32_t n_layer; + uint32_t n_rot; + uint32_t n_ff; + + float f_norm_eps; + float f_norm_rms_eps; + + float rope_freq_base; + float rope_freq_scale; bool operator!=(const llama_hparams & other) const { return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT @@ -1076,7 +1075,7 @@ struct llama_model { std::string name = "n/a"; - llama_hparams hparams; + llama_hparams hparams = {}; llama_vocab vocab; struct ggml_tensor * tok_embeddings; @@ -1674,28 +1673,17 @@ static void llm_load_hparams( hparams.n_head_kv = hparams.n_head; GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); - // TODO: manually setting rope freq base and scale should override this - // FIXME: partial fix when the param specified is not the default value, but - // will not work for overriding the model value to the params default - - llama_context_params defaults = llama_context_default_params(); - - // rope_freq_base - { - float ropebase = 10000.0f; - GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); - if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) { - rope_freq_base = ropebase; - } + // rope_freq_base (optional) + if (rope_freq_base == 0.0f) { + rope_freq_base = 10000.0f; + GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); } // rope_freq_scale (inverse of the kv) is optional - { + if (rope_freq_scale == 0.0f) { float ropescale = 1.0f; GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); - if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) { - rope_freq_scale = 1.0f/ropescale; - } + rope_freq_scale = 1.0f/ropescale; } // sanity check for n_rot (optional) @@ -6188,8 +6176,8 @@ struct llama_context_params llama_context_default_params() { /*.n_gpu_layers =*/ 0, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, - /*.rope_freq_base =*/ 10000.0f, - /*.rope_freq_scale =*/ 1.0f, + /*.rope_freq_base =*/ 0.0f, + /*.rope_freq_scale =*/ 0.0f, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.low_vram =*/ false, |