author:    slaren <slarengh@gmail.com>  2023-12-13 13:04:25 +0100
committer: GitHub <noreply@github.com>  2023-12-13 14:04:25 +0200
commit:    799a1cb13b0b1b560ab0ceff485caed68faa8f1f
tree:      18073e9ffe1f7b69ca584a9cd2cc039e79e6dc63 /convert-hf-to-gguf.py
parent:    fecac45658a99eddc4d6e36ba0310ca8f87a77f0
llama : add Mixtral support (#4406)
* convert : support Mixtral as LLAMA arch
* convert : fix n_ff typo
* llama : model loading
* ggml : sync latest ggml_mul_mat_id
* llama : update graph to support MoE (see the routing sketch after this list)
* llama : fix cur -> cur_expert
* llama : first working version
* llama : fix expert weighting in the FFN
* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)
* ggml : add n_as argument to ggml_mul_mat_id
* ggml : fix ggml_get_rows to take into account ne02 / ne11
* metal : add more general support for ggml_get_rows + tests
* llama : add basic support for offloading moe with CUDA
* metal : add/mul/div use general kernel when src1 not cont
* metal : reduce the kernel launches for ggml_mul_mat_id
* ggml : get_rows : support non-contiguous tensors with gaps, generalize up to 3D
* ggml : update get_rows f16 and q
* cuda : support non-contiguous src1 in get_rows
* llama : offload missing ffn_moe_silu
* metal : fix ggml_get_rows to work with non-cont src1
* metal : add indirect mat-vec kernels for all quantization types
* llama : do not quantize expert gating tensors
* llama : add n_expert and n_expert_used to hparams + change quants
* test-backend-ops : add moe test
* cuda : fix get_rows when ncols is odd
* convert : determine n_ctx correctly
* metal : fix ggml_mul_mat_id for F32
* test-backend-ops : make experts more evenly probable (test_moe)
* test-backend-ops : cleanup, add moe test for batches
* test-backend-ops : add cpy from f32 -> all types test
* test-backend-ops : fix dequantize block offset
* llama : fix hard-coded number of experts
* test-backend-ops : simplify and disable slow tests to avoid CI timeout
* test-backend-ops : disable MOE test with thread sanitizer
* cuda : fix mul_mat_id with multi gpu
* convert : use 1e6 rope_freq_base for mixtral
* convert : fix style
* convert : support safetensors format
* gguf-py : bump version
* metal : add cpy f16 -> f32 kernel
* metal : fix binary ops for ne10 % 4 != 0
* test-backend-ops : add one more sum_rows test
* ggml : do not use BLAS with ggml_mul_mat_id
* convert-hf : support for mixtral-instruct (#4428)
* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct
* convert : use sentencepiece tokenizer for Mixtral-instruct
* convert : make flake8 happy
* metal : fix soft_max kernels
ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92
* metal : limit kernels to not use more than the allowed threads
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
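
The bullets above compress a lot of mechanism, so as a reading aid, here is a minimal NumPy sketch of the Mixtral-style top-k expert routing that "update graph to support MoE" and "fix expert weighting in the FFN" refer to. It is an illustration under assumptions, not the llama.cpp implementation: the real graph performs the per-token gather and matmul steps batched on the backend via `ggml_mul_mat_id` and the 2D-indexed `ggml_get_rows` ([n_tokens, n_experts]) mentioned above, and the names `w_gate_inp` and `experts` are hypothetical.

```python
import numpy as np

def silu(x):
    # SiLU activation used by the LLaMA/Mixtral FFN
    return x / (1.0 + np.exp(-x))

def moe_ffn(cur, w_gate_inp, experts, n_expert_used):
    """cur: [n_tokens, n_embd]; experts: list of (w_gate, w_up, w_down)."""
    logits = cur @ w_gate_inp                    # router logits [n_tokens, n_expert]
    probs = np.exp(logits - logits.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)   # softmax over all experts
    top = np.argsort(-probs, axis=-1)[:, :n_expert_used]  # top-k expert ids per token
    out = np.zeros_like(cur)
    for t in range(cur.shape[0]):                # batched via ggml_mul_mat_id in practice
        weights = probs[t, top[t]]
        weights /= weights.sum()                 # renormalize the selected expert weights
        for w, e in zip(weights, top[t]):
            w_gate, w_up, w_down = experts[e]
            hidden = silu(cur[t] @ w_gate) * (cur[t] @ w_up)  # SiLU-gated expert FFN
            out[t] += w * (hidden @ w_down)
    return out

# toy usage with random weights (shapes only; values are meaningless)
rng = np.random.default_rng(0)
n_tokens, n_embd, n_ff, n_expert = 3, 8, 16, 8
experts = [(rng.normal(size=(n_embd, n_ff)), rng.normal(size=(n_embd, n_ff)),
            rng.normal(size=(n_ff, n_embd))) for _ in range(n_expert)]
y = moe_ffn(rng.normal(size=(n_tokens, n_embd)),
            rng.normal(size=(n_embd, n_expert)), experts, n_expert_used=2)
```

Note that softmaxing over all experts and then renormalizing the selected top-k is mathematically equivalent to softmaxing over only the selected logits, so either ordering matches the semantics described here.
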
Diffstat (limited to 'convert-hf-to-gguf.py')
-rwxr-xr-x  convert-hf-to-gguf.py | 21 ++++++++++++++++++++-
1 file changed, 20 insertions(+), 1 deletion(-)
```diff
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index bced1f56..e46a7813 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -77,8 +77,18 @@ class Model:
         self.gguf_writer.add_embedding_length(n_embd)
         if (n_ff := self.hparams.get("intermediate_size")) is not None:
             self.gguf_writer.add_feed_forward_length(n_ff)
-        if (n_head := self.hparams.get("num_attention_head")) is not None:
+        if (n_head := self.hparams.get("num_attention_heads")) is not None:
             self.gguf_writer.add_head_count(n_head)
+        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
+            self.gguf_writer.add_head_count_kv(n_head_kv)
+
+        if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
+            self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps)
+        if (n_experts := self.hparams.get("num_local_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
+            self.gguf_writer.add_expert_used_count(n_experts_used)
+
         self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
 
     def write_tensors(self):
@@ -170,6 +180,8 @@ class Model:
             return StableLMModel
         if model_architecture == "QWenLMHeadModel":
             return QwenModel
+        if model_architecture == "MixtralForCausalLM":
+            return MixtralModel
         return Model
 
     def _is_model_safetensors(self) -> bool:
@@ -207,6 +219,8 @@ class Model:
             return gguf.MODEL_ARCH.STABLELM
         if arch == "QWenLMHeadModel":
             return gguf.MODEL_ARCH.QWEN
+        if arch == "MixtralForCausalLM":
+            return gguf.MODEL_ARCH.LLAMA
 
         raise NotImplementedError(f'Architecture "{arch}" not supported!')
 
@@ -837,6 +851,11 @@ class StableLMModel(Model):
         self.gguf_writer.add_layer_norm_eps(1e-5)
 
 
+class MixtralModel(Model):
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+
 class QwenModel(Model):
     @staticmethod
     def token_bytes_to_string(b):
```
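
To make the effect of the new `set_gguf_parameters` lines concrete: each Mixtral hyperparameter from the HF `config.json`, when present, is forwarded to the corresponding GGUF field. The snippet below is a self-contained sketch of that mapping which prints the writer calls instead of emitting a GGUF file; the config values are illustrative (Mixtral-8x7B-like), not read from a real checkpoint.

```python
import json

# illustrative Mixtral-8x7B-style hyperparameters (stand-in for self.hparams)
config = json.loads("""
{
  "num_attention_heads": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "num_local_experts": 8,
  "num_experts_per_tok": 2
}
""")

# (HF config key, GGUFWriter method) pairs matching the added lines above
mapping = [
    ("num_attention_heads", "add_head_count"),
    ("num_key_value_heads", "add_head_count_kv"),
    ("rms_norm_eps",        "add_layer_norm_rms_eps"),
    ("num_local_experts",   "add_expert_count"),
    ("num_experts_per_tok", "add_expert_used_count"),
]

for key, method in mapping:
    # mirror the `if (x := hparams.get(key)) is not None` guards in the diff
    if (value := config.get(key)) is not None:
        print(f"gguf_writer.{method}({value})")
```

Because every lookup is guarded by `.get(...) is not None`, non-MoE LLaMA checkpoints that lack the expert keys pass through this code path unchanged, which is why Mixtral can reuse `gguf.MODEL_ARCH.LLAMA` in the architecture dispatch above.
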