author     Nexes the Elder <124105151+Nexesenex@users.noreply.github.com>   2025-06-03 08:30:30 +0200
committer  GitHub <noreply@github.com>                                      2025-06-03 09:30:30 +0300
commit     4f8b05a0d76e6c5e47fe1f6c7bd079e0fe95dbba (patch)
tree       d744e55678ff6ab4d743d71e53156d1e512cef41 /convert_hf_to_gguf.py
parent     7a8abe29f745cff95896095bf19cf247bdf2c661 (diff)
convert_hf_to_gguf.py: conversion from HF weights to Q6_0 (#483)
* Direct conversion from fp16 to Q6_0
* forgotten comma
* More precise info
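
For context, after this change a Q6_0 conversion can be requested directly at convert time. A hypothetical invocation is shown below; the model directory and output path are placeholders, and only the --outtype and --outfile options are taken from the patch itself:

    python convert_hf_to_gguf.py /path/to/hf-model --outtype q6_0 --outfile model-Q6_0.gguf
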
Diffstat (limited to 'convert_hf_to_gguf.py')
-rw-r--r--   convert_hf_to_gguf.py   26
1 file changed, 13 insertions, 13 deletions
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 01c9e34f..1754e486 100644
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -313,6 +313,7 @@ class Model:
                     gguf.MODEL_TENSOR.OUTPUT,
                     gguf.MODEL_TENSOR.ATTN_V,
                     gguf.MODEL_TENSOR.ATTN_K,
+                    gguf.MODEL_TENSOR.ATTN_QKV,
                 )
             ):
                 if self.ftype in (
@@ -323,9 +324,8 @@ class Model:
                 elif self.ftype in (
                     gguf.LlamaFileType.MOSTLY_Q5_0,
                     gguf.LlamaFileType.MOSTLY_Q5_1,
-                    # gguf.LlamaFileType.MOSTLY_Q6_0,
                 ):
-                    data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    data_qtype = gguf.GGMLQuantizationType.Q6_0
 
             # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
             if isinstance(data_qtype, bool):
@@ -343,8 +343,8 @@ class Model:
                     data_qtype = gguf.GGMLQuantizationType.Q5_0
                 elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_1:
                     data_qtype = gguf.GGMLQuantizationType.Q5_1
-                # elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_0: // To be implemented?
-                #     data_qtype = gguf.GGMLQuantizationType.Q6_0
+                elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_0:
+                    data_qtype = gguf.GGMLQuantizationType.Q6_0
                 elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                     data_qtype = gguf.GGMLQuantizationType.Q8_0
                 else:
@@ -419,12 +419,12 @@ class Model:
         logger.info("Set model quantization version")
         self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
 
-        logger.info("****************************************************************************************")
-        logger.info("** quantizing to `Q4_0`,`Q4_1`,`Q5_0`, or `Q5_1`is not equiv to using `llama-quantize`")
-        logger.info("** `Q4_0`,`Q4_1` are here using embeddings, output, attn_k and attn_v in q5_0")
-        logger.info("** `Q5_0`,`Q5_1` are here using embeddings, output, attn_k and attn_v in q8_0")
-        logger.info("** This, in order to generate a small but reliable conversion to create an iMatrix file.")
-        logger.info("****************************************************************************************")
+        logger.info("***********************************************************************************************")
+        logger.info("** Converting to `q4_0`,`q4_1`,`q5_0`, `q5_1` or `q6_0` is not equiv to using `llama-quantize`!")
+        logger.info("** Ftype `q4_0`,`q4_1` are here converting embeddings, output, attn_k and attn_v/qkv in q5_0.")
+        logger.info("** Ftype `q5_0`,`q5_1` are here converting embeddings, output, attn_k and attn_v/qkv in q6_0.")
+        logger.info("** This, in order to create a small but viable conv. to then for example make an iMatrix file.")
+        logger.info("***********************************************************************************************")
 
     def write(self):
         self.prepare_tensors()
@@ -4113,8 +4113,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, q4_0, q4_1, q5_0, q5_1 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "q6_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, q4_0, q4_1, q5_0, q5_1, q6_0 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -4204,7 +4204,7 @@ def main() -> None:
         "q4_1": gguf.LlamaFileType.MOSTLY_Q4_1,
         "q5_0": gguf.LlamaFileType.MOSTLY_Q5_0,
         "q5_1": gguf.LlamaFileType.MOSTLY_Q5_1,
-        # "q6_0": gguf.LlamaFileType.MOSTLY_Q6_0,
+        "q6_0": gguf.LlamaFileType.MOSTLY_Q6_0,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
         "auto": gguf.LlamaFileType.GUESSED,
     }
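
To make the override logic above easier to follow, here is a minimal, self-contained sketch of the per-tensor type selection as it behaves after this patch. The names used here (choose_tensor_type, SENSITIVE_TENSORS, plain strings for types) are illustrative only and do not exist in convert_hf_to_gguf.py, which uses the gguf.LlamaFileType / gguf.GGMLQuantizationType enums and Model.match_model_tensor_name() instead:

    # Illustrative sketch, not the script's actual code.
    # Tensor kinds that get a higher-precision type than the requested --outtype.
    SENSITIVE_TENSORS = {"token_embd", "output", "attn_k", "attn_v", "attn_qkv"}

    def choose_tensor_type(ftype: str, tensor_kind: str) -> str:
        """Return the storage type for one tensor given the requested --outtype."""
        if tensor_kind in SENSITIVE_TENSORS:
            if ftype in ("q4_0", "q4_1"):
                return "q5_0"
            if ftype in ("q5_0", "q5_1"):
                return "q6_0"  # before this patch these were bumped to q8_0
        # Everything else follows the requested output type directly;
        # q6_0 is now accepted here instead of being commented out.
        if ftype in ("q4_0", "q4_1", "q5_0", "q5_1", "q6_0", "q8_0"):
            return ftype
        return "f16"

    if __name__ == "__main__":
        print(choose_tensor_type("q5_0", "attn_qkv"))  # -> q6_0
        print(choose_tensor_type("q5_0", "ffn_down"))  # -> q5_0

The practical effect: with --outtype q5_0 or q5_1, token embeddings, output, attn_k and attn_v/attn_qkv are now stored in q6_0 rather than q8_0, keeping the intermediate conversion small while holding the sensitive tensors above the bulk quantization level.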