| author | Kawrakow <48489457+ikawrakow@users.noreply.github.com> | 2024-03-26 13:09:30 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-03-26 14:09:30 +0200 |
| commit | d25b1c31b07c3675443a55a828dd58cfef5a241c (patch) | |
| tree | 3ea7b6c04accf513b7493a893cd95ae42e760e4b /llama.cpp | |
| parent | deb7240100da99555b9ab9dc635021e591fceaf5 (diff) | |
quantize : be able to override metadata by key (#6321)
* quantize: be able to override metadata by key
* minor : spacing
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
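
The new `kv_overrides` member of `llama_model_quantize_params` is an opaque pointer: callers hand in a `std::vector<llama_model_kv_override>`, and the quantizer both forwards it to `llama_model_loader` and writes each override into the output GGUF. The list is terminated by an entry whose key is an empty string (the `o.key[0] == 0` check in the diff below). Here is a minimal caller-side sketch, assuming the `llama_model_kv_override` struct from the llama.h of this era (a fixed-size `key` char buffer plus the `tag` and `int_value`/`float_value`/`bool_value` members used in the diff); the file names and override key are illustrative only:

```cpp
#include <cstring>
#include <vector>

#include "llama.h"

int main() {
    std::vector<llama_model_kv_override> kv_overrides;

    // Force an integer metadata value in the quantized output file.
    // The key shown here is only an example; any GGUF KV key works.
    llama_model_kv_override o{};
    o.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
    std::strncpy(o.key, "llama.expert_used_count", sizeof(o.key) - 1);
    o.int_value = 3;
    kv_overrides.push_back(o);

    // The override loop stops at the first entry with an empty key,
    // so the vector must end with a zeroed sentinel.
    kv_overrides.push_back(llama_model_kv_override{});

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype        = LLAMA_FTYPE_MOSTLY_Q4_K_M;
    params.kv_overrides = &kv_overrides; // stored as void *, cast back internally

    return llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params);
}
```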
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 38
1 file changed, 28 insertions, 10 deletions
```diff
@@ -12776,7 +12776,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     constexpr bool use_mmap = false;
 #endif
 
-    llama_model_loader ml(fname_inp, use_mmap, NULL);
+    llama_model_kv_override * kv_overrides = nullptr;
+    if (params->kv_overrides) {
+        auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
+        kv_overrides = v->data();
+    }
+    llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
     ml.init_mappings(false); // no prefetching?
 
     llama_model model;
@@ -12805,6 +12810,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
+    if (params->kv_overrides) {
+        const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
+        for (auto & o : overrides) {
+            if (o.key[0] == 0) break;
+            if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
+                gguf_set_val_f32(ctx_out, o.key, o.float_value);
+            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
+                gguf_set_val_i32(ctx_out, o.key, o.int_value);
+            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
+                gguf_set_val_bool(ctx_out, o.key, o.bool_value);
+            } else {
+                LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
+            }
+        }
+    }
+
     for (int i = 0; i < ml.n_tensors; ++i) {
         const struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
@@ -12813,21 +12834,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
             ++qs.n_attention_wv;
-        }
-        else if (name.find("ffn_down") != std::string::npos) {
+        } else if (name.find("ffn_down") != std::string::npos) {
             ++qs.n_ffn_down;
-        }
-        else if (name.find("ffn_gate") != std::string::npos) {
+        } else if (name.find("ffn_gate") != std::string::npos) {
             ++qs.n_ffn_gate;
-        }
-        else if (name.find("ffn_up") != std::string::npos) {
+        } else if (name.find("ffn_up") != std::string::npos) {
             ++qs.n_ffn_up;
-        }
-        else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
+        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
     }
 
-    if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
+    if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t) qs.n_attention_wv != model.hparams.n_layer) {
         LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n", __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
     }
 
@@ -13363,6 +13380,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.only_copy                   =*/ false,
         /*.pure                        =*/ false,
        /*.imatrix                     =*/ nullptr,
+        /*.kv_overrides                =*/ nullptr,
     };
 
     return result;
```
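
Two details of the patch are worth noting. First, `kv_overrides` travels through the public API as a `void *`, presumably to keep the C header free of C++ types, and is cast back to `std::vector<llama_model_kv_override>` at both use sites. Second, the same sentinel-terminated list serves double duty: it is passed to `llama_model_loader`, so overrides take effect when the input model's metadata is read, and it is replayed into `ctx_out`, so they are also baked into the quantized output file.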