summaryrefslogtreecommitdiff
path: root/convert-hf-to-gguf.py
diff options
context:
space:
mode:
authorslaren <slarengh@gmail.com>2023-12-13 13:04:25 +0100
committerGitHub <noreply@github.com>2023-12-13 14:04:25 +0200
commit799a1cb13b0b1b560ab0ceff485caed68faa8f1f (patch)
tree18073e9ffe1f7b69ca584a9cd2cc039e79e6dc63 /convert-hf-to-gguf.py
parentfecac45658a99eddc4d6e36ba0310ca8f87a77f0 (diff)
llama : add Mixtral support (#4406)
* convert : support Mixtral as LLAMA arch * convert : fix n_ff typo * llama : model loading * ggml : sync latest ggml_mul_mat_id * llama : update graph to support MoE * llama : fix cur -> cur_expert * llama : first working version * llama : fix expert weighting in the FFN * ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only) * ggml : add n_as argument to ggml_mul_mat_id * ggml : fix ggml_get_rows to take into account ne02 / ne11 * metal : add more general support for ggml_get_rows + tests * llama : add basic support for offloading moe with CUDA * metal : add/mul/div use general kernel when src1 not cont * metal : reduce the kernel launches for ggml_mul_mat_id * ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D * ggml : update get_rows f16 and q * cuda : support non-contiguous src1 in get_rows * llama : offload missing ffn_moe_silu * metal : fix ggml_get_rows to work with non-cont src1 * metal : add indirect mat-vec kernels for all quantization types * llama : do not quantize expert gating tensors * llama : add n_expert and n_expert_used to hparams + change quants * test-backend-ops : add moe test * cuda : fix get_rows when ncols is odd * convert : determine n_ctx correctly * metal : fix ggml_mul_mat_id for F32 * test-backend-ops : make experts more evenly probable (test_moe) * test-backend-ops : cleanup, add moe test for batches * test-backend-ops : add cpy from f32 -> all types test * test-backend-ops : fix dequantize block offset * llama : fix hard-coded number of experts * test-backend-ops : simplify and disable slow tests to avoid CI timeout * test-backend-ops : disable MOE test with thread sanitizer * cuda : fix mul_mat_id with multi gpu * convert : use 1e6 rope_freq_base for mixtral * convert : fix style * convert : support safetensors format * gguf-py : bump version * metal : add cpy f16 -> f32 kernel * metal : fix binary ops for ne10 % 4 != 0 * test-backend-ops : add one more sum_rows test * ggml : do not use BLAS with ggml_mul_mat_id * convert-hf : support for mixtral-instruct (#4428) * convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct * convert : use sentencepiece tokenizer for Mixtral-instruct * convert : make flake8 happy * metal : fix soft_max kernels ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92 * metal : limit kernels to not use more than the allowed threads --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: Radek Pilar <github@mrkva.eu>
Diffstat (limited to 'convert-hf-to-gguf.py')
-rwxr-xr-xconvert-hf-to-gguf.py21
1 files changed, 20 insertions, 1 deletions
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index bced1f56..e46a7813 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -77,8 +77,18 @@ class Model:
self.gguf_writer.add_embedding_length(n_embd)
if (n_ff := self.hparams.get("intermediate_size")) is not None:
self.gguf_writer.add_feed_forward_length(n_ff)
- if (n_head := self.hparams.get("num_attention_head")) is not None:
+ if (n_head := self.hparams.get("num_attention_heads")) is not None:
self.gguf_writer.add_head_count(n_head)
+ if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
+ self.gguf_writer.add_head_count_kv(n_head_kv)
+
+ if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
+ self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps)
+ if (n_experts := self.hparams.get("num_local_experts")) is not None:
+ self.gguf_writer.add_expert_count(n_experts)
+ if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
+ self.gguf_writer.add_expert_used_count(n_experts_used)
+
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
def write_tensors(self):
@@ -170,6 +180,8 @@ class Model:
return StableLMModel
if model_architecture == "QWenLMHeadModel":
return QwenModel
+ if model_architecture == "MixtralForCausalLM":
+ return MixtralModel
return Model
def _is_model_safetensors(self) -> bool:
@@ -207,6 +219,8 @@ class Model:
return gguf.MODEL_ARCH.STABLELM
if arch == "QWenLMHeadModel":
return gguf.MODEL_ARCH.QWEN
+ if arch == "MixtralForCausalLM":
+ return gguf.MODEL_ARCH.LLAMA
raise NotImplementedError(f'Architecture "{arch}" not supported!')
@@ -837,6 +851,11 @@ class StableLMModel(Model):
self.gguf_writer.add_layer_norm_eps(1e-5)
+class MixtralModel(Model):
+ def set_vocab(self):
+ self._set_vocab_sentencepiece()
+
+
class QwenModel(Model):
@staticmethod
def token_bytes_to_string(b):