author | Nexes the Elder <124105151+Nexesenex@users.noreply.github.com> | 2025-05-24 10:49:10 +0200
---|---|---
committer | GitHub <noreply@github.com> | 2025-05-24 11:49:10 +0300
commit | c7ecd4e23acb42f1150abf0b118e0a2c7b8dc959 (patch) | |
tree | 6c619eb2d01abd3435f53bb092209935b252c8bb | |
parent | a2c42f9985a96abc8b1b4104b0524ea4b2da9363 (diff) | |
Legacy quants conversion schemes in convert_hf_to_gguf.py (#449)
* Legacy quants conversion schemes in convert_hf_to_gguf.py
This is notably intended to produce smaller conversions from which to generate an iMatrix file.
`Q4_0` and `Q4_1` conversions here store the embeddings, output, attn_k and attn_v tensors in q5_0.
`Q5_0` and `Q5_1` conversions here store the embeddings, output, attn_k and attn_v tensors in q8_0. A sketch of this scheme follows the commit message.
Adapted from the following llama.cpp mainline PR: https://github.com/ggml-org/llama.cpp/pull/9022
Original author: @chentyjpm
Also adds 2 forgotten mentions of FTYPE IQ3_KL in the llama.cpp file.
* forgotten IQ5_KS case mention
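Below is a minimal sketch of the override scheme described above, for illustration only; it is not the actual convert_hf_to_gguf.py code. The substring-based tensor matching and plain-string quant types are simplifications introduced here for readability: the real script matches gguf.MODEL_TENSOR keys and uses the gguf.LlamaFileType / gguf.GGMLQuantizationType enums, as shown in the diff further down.

```python
# Illustrative sketch only, not the actual convert_hf_to_gguf.py logic.
# Tensor matching is reduced to substring checks and quant types to plain strings;
# the real script matches gguf.MODEL_TENSOR keys and uses the gguf enums.
SENSITIVE_TENSORS = ("token_embd", "output", "attn_k", "attn_v")

def legacy_quant_for_tensor(tensor_name: str, outtype: str) -> str:
    """Return the quant type used for one tensor under the legacy conversion schemes."""
    if any(key in tensor_name for key in SENSITIVE_TENSORS):
        if outtype in ("q4_0", "q4_1"):
            return "q5_0"   # embeddings, output, attn_k, attn_v bumped to Q5_0
        if outtype in ("q5_0", "q5_1"):
            return "q8_0"   # embeddings, output, attn_k, attn_v bumped to Q8_0
    return outtype          # all other tensors use the requested legacy quant

# Example: under a Q4_0 conversion, attn_v ends up in Q5_0, other tensors stay Q4_0
assert legacy_quant_for_tensor("blk.0.attn_v.weight", "q4_0") == "q5_0"
assert legacy_quant_for_tensor("blk.0.ffn_down.weight", "q4_0") == "q4_0"
```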
-rw-r--r-- (was -rwxr-xr-x) | convert_hf_to_gguf.py | 48
-rw-r--r-- | ggml/src/ggml-cuda/mmvq.cu | 1
-rw-r--r-- | src/llama.cpp | 7
3 files changed, 50 insertions, 6 deletions
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 966cfcd3..01c9e34f 100755..100644
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -306,6 +306,27 @@ class Model:
                 ):
                     data_qtype = gguf.GGMLQuantizationType.F32
 
+                if data_qtype is False and any(
+                    self.match_model_tensor_name(new_name, key, bid)
+                    for key in (
+                        gguf.MODEL_TENSOR.TOKEN_EMBD,
+                        gguf.MODEL_TENSOR.OUTPUT,
+                        gguf.MODEL_TENSOR.ATTN_V,
+                        gguf.MODEL_TENSOR.ATTN_K,
+                    )
+                ):
+                    if self.ftype in (
+                        gguf.LlamaFileType.MOSTLY_Q4_0,
+                        gguf.LlamaFileType.MOSTLY_Q4_1,
+                    ):
+                        data_qtype = gguf.GGMLQuantizationType.Q5_0
+                    elif self.ftype in (
+                        gguf.LlamaFileType.MOSTLY_Q5_0,
+                        gguf.LlamaFileType.MOSTLY_Q5_1,
+                        # gguf.LlamaFileType.MOSTLY_Q6_0,
+                    ):
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+
                 # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                 if isinstance(data_qtype, bool):
                     if self.ftype == gguf.LlamaFileType.ALL_F32:
@@ -314,6 +335,16 @@ class Model:
                         data_qtype = gguf.GGMLQuantizationType.F16
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
                         data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q4_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_1:
+                        data_qtype = gguf.GGMLQuantizationType.Q4_1
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q5_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_1:
+                        data_qtype = gguf.GGMLQuantizationType.Q5_1
+                    # elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_0: // To be implemented?
+                    #     data_qtype = gguf.GGMLQuantizationType.Q6_0
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                         data_qtype = gguf.GGMLQuantizationType.Q8_0
                     else:
@@ -387,6 +418,13 @@ class Model:
 
         logger.info("Set model quantization version")
         self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
+
+        logger.info("****************************************************************************************")
+        logger.info("** quantizing to `Q4_0`,`Q4_1`,`Q5_0`, or `Q5_1`is not equiv to using `llama-quantize`")
+        logger.info("** `Q4_0`,`Q4_1` are here using embeddings, output, attn_k and attn_v in q5_0")
+        logger.info("** `Q5_0`,`Q5_1` are here using embeddings, output, attn_k and attn_v in q8_0")
+        logger.info("** This, in order to generate a small but reliable conversion to create an iMatrix file.")
+        logger.info("****************************************************************************************")
 
     def write(self):
         self.prepare_tensors()
@@ -3375,7 +3413,6 @@ class DeepseekV2Model(Model):
         if match and int(match.group(1)) >= block_count:
             return []
 
-        # process the experts separately
         if name.find("mlp.experts") != -1:
             n_experts = self.hparams["n_routed_experts"]
@@ -4076,8 +4113,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, q4_0, q4_1, q5_0, q5_1 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -4163,6 +4200,11 @@ def main() -> None:
         "f32": gguf.LlamaFileType.ALL_F32,
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
+        "q4_0": gguf.LlamaFileType.MOSTLY_Q4_0,
+        "q4_1": gguf.LlamaFileType.MOSTLY_Q4_1,
+        "q5_0": gguf.LlamaFileType.MOSTLY_Q5_0,
+        "q5_1": gguf.LlamaFileType.MOSTLY_Q5_1,
+        # "q6_0": gguf.LlamaFileType.MOSTLY_Q6_0,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
         "auto": gguf.LlamaFileType.GUESSED,
     }
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index 30a6a58b..89b74f4b 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -652,6 +652,7 @@ bool ggml_cuda_mmvq_type_supported(ggml_type src0_type) {
         case GGML_TYPE_IQ4_KSS:
         case GGML_TYPE_IQ2_KS:
         case GGML_TYPE_IQ5_K:
+        case GGML_TYPE_IQ5_KS:
         case GGML_TYPE_IQ6_K:
         case GGML_TYPE_IQ3_S:
             return true;
diff --git a/src/llama.cpp b/src/llama.cpp
index 48d7214d..0a620164 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -18803,7 +18803,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4) {
             new_type = !qs.has_output ? GGML_TYPE_IQ4_K_R4 : GGML_TYPE_Q5_K_R4;
         }
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S_R4 ||
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_KL ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S_R4 ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KSS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS_R4) && !qs.has_output) {
             new_type = GGML_TYPE_IQ5_K;
         }
@@ -19165,8 +19166,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
            ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_K ||
            ftype == LLAMA_FTYPE_MOSTLY_IQ4_KSS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS_R4 ||
            ftype == LLAMA_FTYPE_MOSTLY_IQ5_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ5_KS_R4 ||
-           ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_R4 ||
-           ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS_R8 ||
+           ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_KL ||
+           ftype == LLAMA_FTYPE_MOSTLY_Q4_K_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS_R8 ||
            ftype == LLAMA_FTYPE_MOSTLY_Q3_K_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ3_KT || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_R4||
           ftype == LLAMA_FTYPE_MOSTLY_IQ4_K_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_K_R4 ||
           ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S_R4) {