Diffstat (limited to 'src/llama.cpp')
-rw-r--r--  src/llama.cpp  109
1 file changed, 104 insertions, 5 deletions
diff --git a/src/llama.cpp b/src/llama.cpp
index 03139e41..a459cb00 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8194,7 +8194,8 @@ static bool llm_load_tensors(
             auto wk_b_f32_t = ggml_cont(ctx, wk_b_f32_tview);
             wk_b_f32_t->data = (char *)wk_b_f32->data + ggml_nbytes(wk_b_f32);
-            auto new_type = ggml_is_quantized(wkv_b.type) ? GGML_TYPE_Q8_0 : wkv_b.type;
+            auto new_type = ggml_is_quantized(wkv_b.type) ?
+                wkv_b.type >= GGML_TYPE_Q4_0_R8 && wkv_b.type <= GGML_TYPE_Q8_K_R8 ? GGML_TYPE_Q8_0_R8 : GGML_TYPE_Q8_0 : wkv_b.type;
             auto wk_b = ggml_cast(ctx, wk_b_f32_t, new_type);
             wk_b->data = (char *)wk_b_f32_t->data + ggml_nbytes(wk_b_f32_t);
@@ -8218,6 +8219,9 @@ static bool llm_load_tensors(
             ggml_set_name(l.computed_wk_b.get(), name.c_str());
             ggml_backend_buffer_set_usage(l.computed_wk_b->buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
             ggml_backend_tensor_set(l.computed_wk_b.get(), wk_b->data, 0, ggml_nbytes(wk_b));
+            if (ggml_backend_buffer_is_host(l.computed_wk_b->buffer)) {
+                iqk_modify_tensor(l.computed_wk_b.get());
+            }
             l.wk_b = l.computed_wk_b.get();
@@ -8243,6 +8247,9 @@ static bool llm_load_tensors(
             ggml_set_name(l.computed_wv_b.get(), name.c_str());
             ggml_backend_buffer_set_usage(l.computed_wv_b->buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
             ggml_backend_tensor_set(l.computed_wv_b.get(), wv_b->data, 0, ggml_nbytes(wv_b));
+            if (ggml_backend_buffer_is_host(l.computed_wv_b->buffer)) {
+                iqk_modify_tensor(l.computed_wv_b.get());
+            }
             l.wv_b = l.computed_wv_b.get();
@@ -17140,11 +17147,48 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
     return new_size;
 }
 
+static llama_ftype repacked_ftype(llama_ftype ftype) {
+    static std::unordered_map<llama_ftype, llama_ftype> k_map = {
+        { LLAMA_FTYPE_MOSTLY_Q4_0,    LLAMA_FTYPE_MOSTLY_Q4_0_R8    },
+        { LLAMA_FTYPE_MOSTLY_Q8_0,    LLAMA_FTYPE_MOSTLY_Q8_0_R8    },
+        { LLAMA_FTYPE_MOSTLY_Q5_0,    LLAMA_FTYPE_MOSTLY_Q5_0_R4    },
+        { LLAMA_FTYPE_MOSTLY_Q2_K,    LLAMA_FTYPE_MOSTLY_Q2_K_R4    },
+        { LLAMA_FTYPE_MOSTLY_Q3_K_S,  LLAMA_FTYPE_MOSTLY_Q3_K_R4    },
+        { LLAMA_FTYPE_MOSTLY_Q3_K_M,  LLAMA_FTYPE_MOSTLY_Q3_K_R4    },
+        { LLAMA_FTYPE_MOSTLY_Q3_K_L,  LLAMA_FTYPE_MOSTLY_Q3_K_R4    },
+        { LLAMA_FTYPE_MOSTLY_Q4_K_S,  LLAMA_FTYPE_MOSTLY_Q4_K_R4    },
+        { LLAMA_FTYPE_MOSTLY_Q4_K_M,  LLAMA_FTYPE_MOSTLY_Q4_K_R4    },
+        { LLAMA_FTYPE_MOSTLY_Q5_K_S,  LLAMA_FTYPE_MOSTLY_Q5_K_R4    },
+        { LLAMA_FTYPE_MOSTLY_Q5_K_M,  LLAMA_FTYPE_MOSTLY_Q5_K_R4    },
+        { LLAMA_FTYPE_MOSTLY_Q6_K,    LLAMA_FTYPE_MOSTLY_Q6_K_R4    },
+        { LLAMA_FTYPE_MOSTLY_IQ2_XXS, LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 },
+        { LLAMA_FTYPE_MOSTLY_IQ2_XS,  LLAMA_FTYPE_MOSTLY_IQ2_XS_R4  },
+        { LLAMA_FTYPE_MOSTLY_IQ3_XXS, LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 },
+        { LLAMA_FTYPE_MOSTLY_IQ1_S,   LLAMA_FTYPE_MOSTLY_IQ1_S_R4   },
+        { LLAMA_FTYPE_MOSTLY_IQ4_NL,  LLAMA_FTYPE_MOSTLY_IQ4_NL_R4  },
+        { LLAMA_FTYPE_MOSTLY_IQ3_S,   LLAMA_FTYPE_MOSTLY_IQ3_S_R4   },
+        { LLAMA_FTYPE_MOSTLY_IQ2_M,   LLAMA_FTYPE_MOSTLY_IQ2_M_R4   },
+        { LLAMA_FTYPE_MOSTLY_IQ4_XS,  LLAMA_FTYPE_MOSTLY_IQ4_XS_R8  },
+        { LLAMA_FTYPE_MOSTLY_IQ1_M,   LLAMA_FTYPE_MOSTLY_IQ1_M_R4   },
+        { LLAMA_FTYPE_MOSTLY_Q6_0,    LLAMA_FTYPE_MOSTLY_Q6_0_R4    },
+        { LLAMA_FTYPE_MOSTLY_BF16,    LLAMA_FTYPE_MOSTLY_BF16_R16   },
+        { LLAMA_FTYPE_MOSTLY_IQ2_BN,  LLAMA_FTYPE_MOSTLY_IQ2_BN_R4  },
+        { LLAMA_FTYPE_MOSTLY_IQ2_K,   LLAMA_FTYPE_MOSTLY_IQ2_K_R4   },
+        { LLAMA_FTYPE_MOSTLY_IQ3_K,   LLAMA_FTYPE_MOSTLY_IQ3_K_R4   },
+        { LLAMA_FTYPE_MOSTLY_IQ4_K,   LLAMA_FTYPE_MOSTLY_IQ4_K_R4   },
+        { LLAMA_FTYPE_MOSTLY_IQ5_K,   LLAMA_FTYPE_MOSTLY_IQ5_K_R4   },
+        { LLAMA_FTYPE_MOSTLY_IQ4_KS,  LLAMA_FTYPE_MOSTLY_IQ4_KS_R4  },
+        { LLAMA_FTYPE_MOSTLY_Q8_KV,   LLAMA_FTYPE_MOSTLY_Q8_KV_R8   },
+    };
+    if (auto it = k_map.find(ftype); it != k_map.end()) return it->second;
+    return ftype;
+}
+
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type default_type;
     llama_ftype ftype = params->ftype;
 
-    switch (params->ftype) {
+    switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
@@ -17256,7 +17300,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         ftype = model.ftype;
     }
     const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
-    if (params->imatrix) {
+    if (!params->only_repack && params->imatrix) {
         imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
         if (imatrix_data) {
             LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
@@ -17278,7 +17322,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // copy the KV pairs from the input file
     gguf_set_kv     (ctx_out, ml.meta);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
-    gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV
 
     // Remove split metadata
     gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
@@ -17303,9 +17346,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
     }
 
+    bool is_repacked = ml.ftype >= LLAMA_FTYPE_MOSTLY_Q4_0_R8 && ml.ftype <= LLAMA_FTYPE_MOSTLY_Q8_K_R8;
+    int n_to_repack = 0, n_to_modify = 0;
     for (int i = 0; i < ml.n_tensors; ++i) {
         const struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
+        if (params->only_repack) {
+            auto repacked_type = (ggml_type)iqk_repacked_type(meta);
+            if (repacked_type != meta->type) {
+                ++n_to_repack;
+            } else if (!is_repacked) {
+                if (iqk_should_modify_tensor(meta)) ++n_to_modify;
+            }
+        }
+
         const std::string name = ggml_get_name(meta);
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -17317,6 +17371,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
     }
 
+    if (params->only_repack) {
+        if (n_to_repack == 0 && n_to_modify == 0) {
+            printf("=========================== %s: nothing to do for only_repack option\n", __func__);
+            return;
+        }
+        ftype = repacked_ftype(model.ftype);
+        printf("===================== Model ftype: %s: Repacked ftype: %s\n", llama_model_ftype_name(model.ftype).c_str(),
+                llama_model_ftype_name(ftype).c_str());
+    }
+
+    gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV
+
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
     // sanity checks
@@ -17457,6 +17523,36 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         void * new_data;
         size_t new_size;
 
+        if (params->only_repack) {
+            ggml_type repacked_type = (ggml_type)iqk_repacked_type(tensor);
+            bool modify = !is_repacked && iqk_should_modify_tensor(tensor);
+            if (modify || repacked_type != tensor->type) {
+                new_type = repacked_type;
+                new_size = ggml_nbytes(tensor);
+                if ((int)work.size() < new_size) work.resize(new_size);
+                new_data = work.data();
+
+                auto aux_tensor = *tensor;
+                aux_tensor.data = work.data();
+                std::memcpy(aux_tensor.data, tensor->data, new_size);
+
+                if (repacked_type != tensor->type) {
+                    iqk_repack_tensor(&aux_tensor);
+                    GGML_ASSERT(aux_tensor.type == repacked_type);
+                } else {
+                    bool did_modify = iqk_modify_tensor(&aux_tensor);
+                    GGML_ASSERT(did_modify);
+                }
+            }
+            else {
+                new_type = tensor->type;
+                new_size = ggml_nbytes(tensor);
+                new_data = tensor->data;
+            }
+            LLAMA_LOG_INFO("size = %8.3f MB, type = %s\n", new_size/1024.0/1024.0, ggml_type_name(new_type));
+            goto QuantizationDone;
+        }
+
         if (quantize) {
             new_type = default_type;
             if (new_type == GGML_TYPE_BF16_R16 && strcmp(tensor->name, "token_embd.weight") == 0) {
@@ -17562,7 +17658,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     new_type == GGML_TYPE_IQ1_S_R4|| new_type == GGML_TYPE_IQ1_M_R4||
                     (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
-                    (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0))) {
+                    (new_type == GGML_TYPE_Q2_K && ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0))) {
                 LLAMA_LOG_ERROR("\n\n============================================================\n");
                 LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
                 LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
@@ -17727,6 +17823,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
             LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
         }
+
+QuantizationDone:;
         total_size_org += ggml_nbytes(tensor);
         total_size_new += new_size;
@@ -18051,6 +18149,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.pure                  =*/ false,
         /*.keep_split            =*/ false,
         /*.ignore_imatrix_rules  =*/ false,
+        /*.only_repack           =*/ false,
         /*.imatrix               =*/ nullptr,
         /*.kv_overrides          =*/ nullptr,
         /*.custom_quants         =*/ nullptr,
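
For context, the patch threads the new only_repack flag through llama_model_quantize_params, so a repack-only conversion can be driven through the existing public quantize API. The sketch below is illustrative only and not part of this commit; it assumes the only_repack field added above is also declared in llama.h next to the other params fields (nthread, ftype, etc.), and that llama_model_quantize returns 0 on success as in upstream llama.cpp.

// Hypothetical usage sketch: run a repack-only pass over an existing GGUF file.
#include <cstdio>
#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 3) {
        fprintf(stderr, "usage: %s <input.gguf> <output.gguf>\n", argv[0]);
        return 1;
    }

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.nthread     = 4;     // any thread count; tensors are processed one by one
    params.only_repack = true;  // new flag from this patch: repack/modify tensors in place,
                                // skip requantization and ignore any imatrix

    // With only_repack set, tensors whose type has a row-interleaved variant are
    // repacked (e.g. Q4_K -> Q4_K_R4), general.file_type is rewritten via
    // repacked_ftype(), and everything else is copied through unchanged.
    return llama_model_quantize(argv[1], argv[2], &params) == 0 ? 0 : 1;
}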