Diffstat (limited to 'convert-hf-to-gguf.py')
 convert-hf-to-gguf.py | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 130 insertions(+), 1 deletion(-)
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 53ce76c7..bced1f56 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -10,7 +10,7 @@ import re
 import sys
 from enum import IntEnum
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast
+from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional
 
 import numpy as np
 import torch
@@ -168,6 +168,8 @@ class Model:
return PersimmonModel
if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
return StableLMModel
+ if model_architecture == "QWenLMHeadModel":
+ return QwenModel
return Model
def _is_model_safetensors(self) -> bool:
@@ -203,6 +205,8 @@ class Model:
             return gguf.MODEL_ARCH.PERSIMMON
         if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
             return gguf.MODEL_ARCH.STABLELM
+        if arch == "QWenLMHeadModel":
+            return gguf.MODEL_ARCH.QWEN
 
         raise NotImplementedError(f'Architecture "{arch}" not supported!')
@@ -832,6 +836,131 @@ class StableLMModel(Model):
         self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
         self.gguf_writer.add_layer_norm_eps(1e-5)
+
+class QwenModel(Model):
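+    # Qwen ships a tiktoken-style tokenizer (a `mergeable_ranks` dict mapping
+    # bytes -> merge rank) instead of a GPT-2 vocab/merges pair, so the helpers
+    # below rebuild both representations for GGUF export.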
+    @staticmethod
+    def token_bytes_to_string(b):
+        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+        byte_encoder = bytes_to_unicode()
+        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
+
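+    # Re-runs BPE merging over `token`, ignoring any merge ranked at or above
+    # `max_rank`. Called with max_rank = rank(token), it stops one merge short
+    # and therefore returns the two parts the token was originally merged from.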
+    @staticmethod
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:
+        parts = [bytes([b]) for b in token]
+        while True:
+            min_idx = None
+            min_rank = None
+            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
+                rank = mergeable_ranks.get(pair[0] + pair[1])
+                if rank is not None and (min_rank is None or rank < min_rank):
+                    min_idx = i
+                    min_rank = rank
+            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
+                break
+            assert min_idx is not None
+            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
+        return parts
+
+    def set_vocab(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytearray] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer  # type: ignore[attr-defined]
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams["vocab_size"]
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
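+        # Reconstruct the GPT-2-style merge list: every multi-byte token must
+        # split into exactly two lower-ranked parts under its own rank.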
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[self.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+            assert len(merged) == 2
+            merges.append(' '.join(map(self.token_bytes_to_string, merged)))
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
+        added_vocab = tokenizer.special_tokens
+
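+        # Fill unused ids with [PAD{i}] placeholders so the token list is
+        # exactly vocab_size entries long; special tokens become CONTROL tokens.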
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                pad_token = f"[PAD{i}]".encode("utf-8")
+                tokens.append(bytearray(pad_token))
+                toktypes.append(gguf.TokenType.USER_DEFINED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.CONTROL)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
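+        # The vocab is byte-level BPE, so it is written under the "gpt2"
+        # tokenizer model in GGUF.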
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
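+        # Qwen defines no separate bos/eos/unk tokens; all three are mapped
+        # to <|endoftext|>.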
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
+        special_vocab.merges = merges
+        special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
+        special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
+        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
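+    # Hyperparameters come straight from Qwen's config.json (note the Qwen
+    # names rotary_emb_base and layer_norm_epsilon); the RoPE dimension count
+    # equals the per-head size.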
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name("Qwen")
+        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
+        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+
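+    # Standard export loop: drop rotary inv_freq buffers, rename tensors via
+    # the GGUF tensor name map, and apply the f16/f32 conversion rules below.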
+    def write_tensors(self):
+        block_count = self.hparams["num_hidden_layers"]
+        model_kv = dict(self.get_tensors())
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        for name, data_torch in model_kv.items():
+            # we don't need these
+            if name.endswith(".rotary_emb.inv_freq"):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Cannot map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            self.gguf_writer.add_tensor(new_name, data)
+
 
 ###### CONVERSION LOGIC ######