diff options
author | Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> | 2024-02-25 10:54:04 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-25 11:54:04 +0200 |
commit | 69917dfa55674c608360638bb4d6a12a315e2810 (patch) | |
tree | 76bf37a8e692bd3109787ee045350abba93b48a8 /convert-hf-to-gguf.py | |
parent | 9e359a4f47c1b2dceb99e29706c9f7403d32ab5e (diff) |
py : fix StableLM conversion after config.json changes (#5703)
* Fix issues during StableLM models conversion
* Fix hard coded layer_norm_eps
* Support layer_norm_eps for LlavaStableLM
Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
* Add missing parenthesis
Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
* Support rotary_factor for LlavaStableLM
Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
* fix typo
* Add StableLMEpochForCausalLM for safety
Co-authored-by: compilade <113953597+compilade@users.noreply.github.com>
* Add StableLMEpochForCausalLM for safety 2
Co-authored-by: compilade <113953597+compilade@users.noreply.github.com>
---------
Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: compilade <113953597+compilade@users.noreply.github.com>
Diffstat (limited to 'convert-hf-to-gguf.py')
-rwxr-xr-x | convert-hf-to-gguf.py | 9 |
1 file changed, 5 insertions, 4 deletions
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 32d54b45..ae30b2a7 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -192,7 +192,7 @@ class Model:
             return RefactModel
         if model_architecture == "PersimmonForCausalLM":
             return PersimmonModel
-        if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
+        if model_architecture in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
             return StableLMModel
         if model_architecture == "QWenLMHeadModel":
             return QwenModel
@@ -253,7 +253,7 @@ class Model:
             return gguf.MODEL_ARCH.REFACT
         if arch == "PersimmonForCausalLM":
             return gguf.MODEL_ARCH.PERSIMMON
-        if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
+        if arch in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
             return gguf.MODEL_ARCH.STABLELM
         if arch == "QWenLMHeadModel":
             return gguf.MODEL_ARCH.QWEN
@@ -1074,10 +1074,11 @@ class StableLMModel(Model):
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
+        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
+        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
         self.gguf_writer.add_head_count(hparams["num_attention_heads"])
         self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
-        self.gguf_writer.add_layer_norm_eps(1e-5)
+        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
 
 
 class MixtralModel(Model):