diff options
author | Junyang Lin <justinlin930319@hotmail.com> | 2024-04-24 15:16:21 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-24 10:16:21 +0300 |
commit | 3fec68be4e9577fc53158366d3b3af039c17bb1f (patch) | |
tree | 72db2fe074c20955d1c5e28fc8a87dcce0ef30af | |
parent | c8297c6af5693555652c40b95974b95d49d2674d (diff) |
convert : add support of codeqwen due to tokenizer (#6707)
* add support of codeqwen due to tokenizer
* override load_hparams
* fix typo
* fix load_params
* convert : fix whitespace
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
-rwxr-xr-x | convert-hf-to-gguf.py | 16 |
1 file changed, 16 insertions, 0 deletions
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 4ace13eb..5763b666 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -363,6 +363,16 @@ class Model(ABC): scores.append(-1000.0) toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + print( + f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]" + ) + for i in range(1, pad_count + 1): + tokens.append(f"[PAD{i}]") + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.UNUSED) + assert len(tokens) == vocab_size self.gguf_writer.add_tokenizer_model("llama") @@ -1789,6 +1799,12 @@ class QwenModel(Model): class Qwen2Model(Model): model_arch = gguf.MODEL_ARCH.QWEN2 + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + @Model.register("Qwen2MoeForCausalLM") class Qwen2MoeModel(Model): |