Diffstat (limited to 'convert_hf_to_gguf.py')
-rwxr-xr-x  convert_hf_to_gguf.py  372
1 file changed, 318 insertions, 54 deletions
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 7a74cc20..b470a088 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -251,12 +251,7 @@ class Model:
return [(self.map_tensor_name(name), data_torch)]
- def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
- del name, new_name, bid, n_dims # unused
-
- return False
-
- def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+ def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
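+ # False = no override, True = allow quantization with the default type for the requested
+ # --outtype, or return an explicit gguf.GGMLQuantizationType to request that type for this tensor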
del name, new_name, bid, n_dims # unused
return False
@@ -285,54 +280,46 @@ class Model:
for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
data: np.ndarray # type hint
n_dims = len(data.shape)
- data_dtype = data.dtype
- data_qtype: gguf.GGMLQuantizationType | None = None
-
- # when both are True, f32 should win
- extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims)
- extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims)
+ data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)
# Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
- # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
- extra_f32 = any(cond for cond in (
- extra_f32,
- n_dims == 1,
- new_name.endswith("_norm.weight"),
- ))
+ if n_dims <= 1 or new_name.endswith("_norm.weight"):
+ data_qtype = gguf.GGMLQuantizationType.F32
+ # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
# Some tensor types are always in float32
- extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
- gguf.MODEL_TENSOR.FFN_GATE_INP,
- gguf.MODEL_TENSOR.POS_EMBD,
- gguf.MODEL_TENSOR.TOKEN_TYPES,
- ))
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- extra_f16 = any(cond for cond in (
- extra_f16,
- (name.endswith(".weight") and n_dims >= 2),
- ))
-
- if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
- if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
- data = gguf.quantize_bf16(data)
- assert data.dtype == np.int16
- data_qtype = gguf.GGMLQuantizationType.BF16
-
- elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
- data = gguf.quantize_q8_0(data)
- assert data.dtype == np.uint8
- data_qtype = gguf.GGMLQuantizationType.Q8_0
+ if data_qtype is False and (
+ any(
+ self.match_model_tensor_name(new_name, key, bid)
+ for key in (
+ gguf.MODEL_TENSOR.FFN_GATE_INP,
+ gguf.MODEL_TENSOR.POS_EMBD,
+ gguf.MODEL_TENSOR.TOKEN_TYPES,
+ )
+ )
+ or not name.endswith(".weight")
+ ):
+ data_qtype = gguf.GGMLQuantizationType.F32
- else: # default to float16 for quantized tensors
- if data_dtype != np.float16:
- data = data.astype(np.float16)
+ # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
+ if isinstance(data_qtype, bool):
+ if self.ftype == gguf.LlamaFileType.ALL_F32:
+ data_qtype = gguf.GGMLQuantizationType.F32
+ elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
data_qtype = gguf.GGMLQuantizationType.F16
+ elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+ data_qtype = gguf.GGMLQuantizationType.BF16
+ elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
+ data_qtype = gguf.GGMLQuantizationType.Q8_0
+ else:
+ raise ValueError(f"Unknown file type: {self.ftype.name}")
- if data_qtype is None: # by default, convert to float32
- if data_dtype != np.float32:
- data = data.astype(np.float32)
- data_qtype = gguf.GGMLQuantizationType.F32
+ try:
+ data = gguf.quants.quantize(data, data_qtype)
+ except gguf.QuantError as e:
+ logger.warning("%s, %s", e, "falling back to F16")
+ data_qtype = gguf.GGMLQuantizationType.F16
+ data = gguf.quants.quantize(data, data_qtype)
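+ # quantized data comes back as raw uint8 bytes, so recover the logical tensor shape from the byte shape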
shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
@@ -1570,6 +1557,34 @@ class LlamaModel(Model):
return [(self.map_tensor_name(name), data_torch)]
def prepare_tensors(self):
+ if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+ if rope_scaling.get("rope_type", '').lower() == "llama3":
+ base = self.hparams.get("rope_theta", 10000.0)
+ dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+ freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+ factor = rope_scaling.get("factor", 8.0)
+ low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+ high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+ old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+ low_freq_wavelen = old_context_len / low_freq_factor
+ high_freq_wavelen = old_context_len / high_freq_factor
+ assert low_freq_wavelen != high_freq_wavelen
+
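+ # build per-frequency scaling factors (Llama 3.1-style rope scaling): 1.0 for high-frequency
+ # components, `factor` for low-frequency components, smoothly interpolated in between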
+ rope_factors = []
+ for freq in freqs:
+ wavelen = 2 * math.pi / freq
+ if wavelen < high_freq_wavelen:
+ rope_factors.append(1)
+ elif wavelen > low_freq_wavelen:
+ rope_factors.append(factor)
+ else:
+ smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+ rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+ self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+
super().prepare_tensors()
if self._experts is not None:
@@ -1754,7 +1769,7 @@ class DbrxModel(Model):
return [(new_name, data_torch)]
- def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+ def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
del name, new_name, bid # unused
return n_dims > 1
@@ -2495,6 +2510,112 @@ class NomicBertModel(BertModel):
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
+@Model.register("XLMRobertaModel")
+class XLMRobertaModel(BertModel):
+ model_arch = gguf.MODEL_ARCH.BERT
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ # we need the pad_token_id to know how to chop down position_embd matrix
+ if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+ self._position_offset = 1 + pad_token_id
+ if "max_position_embeddings" in self.hparams:
+ self.hparams["max_position_embeddings"] -= self._position_offset
+ else:
+ self._position_offset = None
+
+ def set_vocab(self):
+ # to avoid TypeError: Descriptors cannot be created directly
+ # exception when importing sentencepiece_model_pb2
+ os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+ from sentencepiece import SentencePieceProcessor
+ from sentencepiece import sentencepiece_model_pb2 as model
+
+ tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
+ if not tokenizer_path.is_file():
+ raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+ sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
+ sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+ assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
+
+ add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+ remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+ precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+ tokenizer = SentencePieceProcessor()
+ tokenizer.LoadFromFile(str(tokenizer_path))
+
+ vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+ tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+ scores: list[float] = [-10000.0] * vocab_size
+ toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+ for token_id in range(tokenizer.vocab_size()):
+ piece = tokenizer.IdToPiece(token_id)
+ text = piece.encode("utf-8")
+ score = tokenizer.GetScore(token_id)
+
+ toktype = SentencePieceTokenTypes.NORMAL
+ if tokenizer.IsUnknown(token_id):
+ toktype = SentencePieceTokenTypes.UNKNOWN
+ elif tokenizer.IsControl(token_id):
+ toktype = SentencePieceTokenTypes.CONTROL
+ elif tokenizer.IsUnused(token_id):
+ toktype = SentencePieceTokenTypes.UNUSED
+ elif tokenizer.IsByte(token_id):
+ toktype = SentencePieceTokenTypes.BYTE
+
+ tokens[token_id] = text
+ scores[token_id] = score
+ toktypes[token_id] = toktype
+
+ if vocab_size > len(tokens):
+ pad_count = vocab_size - len(tokens)
+ logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+ for i in range(1, pad_count + 1):
+ tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+ scores.append(-1000.0)
+ toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+ # realign tokens (see HF tokenizer code)
+ tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
+ scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
+ toktypes = [
+ SentencePieceTokenTypes.CONTROL,
+ SentencePieceTokenTypes.CONTROL,
+ SentencePieceTokenTypes.CONTROL,
+ SentencePieceTokenTypes.UNKNOWN,
+ ] + toktypes[3:-1]
+
+ self.gguf_writer.add_tokenizer_model("t5")
+ self.gguf_writer.add_tokenizer_pre("default")
+ self.gguf_writer.add_token_list(tokens)
+ self.gguf_writer.add_token_scores(scores)
+ self.gguf_writer.add_token_types(toktypes)
+ self.gguf_writer.add_add_space_prefix(add_prefix)
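+ # XLM-RoBERTa uses a single token type (type_vocab_size == 1 in the HF config)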
+ self.gguf_writer.add_token_type_count(1)
+ self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+ if precompiled_charsmap:
+ self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+ special_vocab.add_to_gguf(self.gguf_writer)
+
+ self.gguf_writer.add_add_bos_token(True)
+ self.gguf_writer.add_add_eos_token(True)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+ if name == "embeddings.position_embeddings.weight":
+ if self._position_offset is not None:
+ data_torch = data_torch[self._position_offset:,:]
+
+ return super().modify_tensors(data_torch, name, bid)
+
+
@Model.register("GemmaForCausalLM")
class GemmaModel(Model):
model_arch = gguf.MODEL_ARCH.GEMMA
@@ -2669,18 +2790,22 @@ class MambaModel(Model):
return [(new_name, data_torch)]
- def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
- del n_dims # unused
-
- return bid is not None and new_name in (
- self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
+ def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
+ if bid is not None and new_name in (
+ self.format_tensor_name(
+ n, bid, ".weight" if name.endswith(".weight") else ""
+ )
+ for n in [
gguf.MODEL_TENSOR.SSM_CONV1D,
gguf.MODEL_TENSOR.SSM_X,
gguf.MODEL_TENSOR.SSM_DT,
gguf.MODEL_TENSOR.SSM_A,
gguf.MODEL_TENSOR.SSM_D,
]
- )
+ ):
+ return gguf.GGMLQuantizationType.F32
+
+ return super().tensor_force_quant(name, new_name, bid, n_dims)
@Model.register("CohereForCausalLM")
@@ -3216,6 +3341,145 @@ class T5Model(Model):
return [(self.map_tensor_name(name), data_torch)]
+@Model.register("T5EncoderModel")
+class T5EncoderModel(Model):
+ model_arch = gguf.MODEL_ARCH.T5ENCODER
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.shared_token_embeddings_found = False
+
+ def set_vocab(self):
+ # to avoid TypeError: Descriptors cannot be created directly
+ # exception when importing sentencepiece_model_pb2
+ os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+ from sentencepiece import SentencePieceProcessor
+ from sentencepiece import sentencepiece_model_pb2 as model
+
+ tokenizer_path = self.dir_model / 'tokenizer.model'
+
+ # many older models use spiece.model tokenizer model filename
+ if not tokenizer_path.is_file():
+ tokenizer_path = self.dir_model / 'spiece.model'
+
+ if not tokenizer_path.is_file():
+ raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+ sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
+ sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+
+ # some models like Pile-T5 family use BPE tokenizer instead of Unigram
+ if sentencepiece_model.trainer_spec.model_type == 2: # BPE
+ # assure the tokenizer model file name is correct
+ assert tokenizer_path.name == 'tokenizer.model'
+ return self._set_vocab_sentencepiece()
+ else:
+ assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
+
+ add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+ remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+ precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+ tokenizer = SentencePieceProcessor()
+ tokenizer.LoadFromFile(str(tokenizer_path))
+
+ vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+ tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+ scores: list[float] = [-10000.0] * vocab_size
+ toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+ for token_id in range(tokenizer.vocab_size()):
+ piece = tokenizer.IdToPiece(token_id)
+ text = piece.encode("utf-8")
+ score = tokenizer.GetScore(token_id)
+
+ toktype = SentencePieceTokenTypes.NORMAL
+ if tokenizer.IsUnknown(token_id):
+ toktype = SentencePieceTokenTypes.UNKNOWN
+ elif tokenizer.IsControl(token_id):
+ toktype = SentencePieceTokenTypes.CONTROL
+ elif tokenizer.IsUnused(token_id):
+ toktype = SentencePieceTokenTypes.UNUSED
+ elif tokenizer.IsByte(token_id):
+ toktype = SentencePieceTokenTypes.BYTE
+
+ tokens[token_id] = text
+ scores[token_id] = score
+ toktypes[token_id] = toktype
+
+ added_tokens_file = self.dir_model / 'added_tokens.json'
+ if added_tokens_file.is_file():
+ with open(added_tokens_file, "r", encoding="utf-8") as f:
+ added_tokens_json = json.load(f)
+ for key in added_tokens_json:
+ token_id = added_tokens_json[key]
+ if token_id >= vocab_size:
+ logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+ continue
+
+ tokens[token_id] = key.encode("utf-8")
+ scores[token_id] = -1000.0
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+ if vocab_size > len(tokens):
+ pad_count = vocab_size - len(tokens)
+ logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+ for i in range(1, pad_count + 1):
+ tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+ scores.append(-1000.0)
+ toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+ self.gguf_writer.add_tokenizer_model("t5")
+ self.gguf_writer.add_tokenizer_pre("default")
+ self.gguf_writer.add_token_list(tokens)
+ self.gguf_writer.add_token_scores(scores)
+ self.gguf_writer.add_token_types(toktypes)
+ self.gguf_writer.add_add_space_prefix(add_prefix)
+ self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+ if precompiled_charsmap:
+ self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+ special_vocab.add_to_gguf(self.gguf_writer)
+
+ self.gguf_writer.add_add_bos_token(False)
+ self.gguf_writer.add_add_eos_token(True)
+
+ def set_gguf_parameters(self):
+ if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
+ logger.warning("Couldn't find context length in config.json, assuming default value of 512")
+ n_ctx = 512
+ self.gguf_writer.add_context_length(n_ctx)
+ self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+ self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+ self.gguf_writer.add_block_count(self.hparams["num_layers"])
+ self.gguf_writer.add_head_count(self.hparams["num_heads"])
+ self.gguf_writer.add_key_length(self.hparams["d_kv"])
+ self.gguf_writer.add_value_length(self.hparams["d_kv"])
+ self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+ self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+ self.gguf_writer.add_file_type(self.ftype)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ del bid # unused
+
+ # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
+ # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
+ # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
+ # and decoder and ignore the remaining ones.
+ if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
+ if not self.shared_token_embeddings_found:
+ name = "shared.weight"
+ self.shared_token_embeddings_found = True
+ else:
+ logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
+ return []
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+
@Model.register("JAISLMHeadModel")
class JaisModel(Model):
model_arch = gguf.MODEL_ARCH.JAIS