author    Douglas Hanley <thesecretaryofwar@gmail.com>  2024-02-11 10:21:38 -0600
committer GitHub <noreply@github.com>  2024-02-11 11:21:38 -0500
commit    2891c8aa9af17f4ff636ff3868bc34ff72b56e25 (patch)
tree      1a037e8ad635aa54ddf8ab8cb39c04bb4f8cf141 /gguf-py
parent    97a336507ed9b971d72262bec7e2b8b7016a054a (diff)
Add support for BERT embedding models (#5423)
* BERT model graph construction (build_bert)
* WordPiece tokenizer (llm_tokenize_wpm)
* Add flag for non-causal attention models
* Allow for models that only output embeddings
* Support conversion of BERT models to GGUF
* Based on prior work by @xyzhang626 and @skeskinen

---------

Co-authored-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'gguf-py')
-rw-r--r--  gguf-py/gguf/constants.py       43
-rw-r--r--  gguf-py/gguf/gguf_writer.py      6
-rw-r--r--  gguf-py/gguf/tensor_mapping.py  13
3 files changed, 41 insertions, 21 deletions
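
Before the diff itself, a quick illustration of the writer API this commit introduces. This is a minimal sketch, not code from the commit: it assumes gguf-py's GGUFWriter(path, arch) constructor of this era, and the output file name is hypothetical.

# Sketch: marking a model as non-causal and recording the BERT token-type
# vocabulary size with the two methods added to GGUFWriter in this commit.
from gguf import GGUFWriter

writer = GGUFWriter("bert-model.gguf", "bert")  # hypothetical output path
writer.add_causal_attention(False)  # BERT attention is bidirectional, not causal
writer.add_token_type_count(2)      # BERT segment (token type) embeddings use 2 types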
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 1cfd41c0..a9c13dd3 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -50,6 +50,7 @@ class Keys:
VALUE_LENGTH = "{arch}.attention.value_length"
LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
+ CAUSAL = "{arch}.attention.causal"
class Rope:
DIMENSION_COUNT = "{arch}.rope.dimension_count"
@@ -60,22 +61,23 @@ class Keys:
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
class Tokenizer:
- MODEL = "tokenizer.ggml.model"
- LIST = "tokenizer.ggml.tokens"
- TOKEN_TYPE = "tokenizer.ggml.token_type"
- SCORES = "tokenizer.ggml.scores"
- MERGES = "tokenizer.ggml.merges"
- BOS_ID = "tokenizer.ggml.bos_token_id"
- EOS_ID = "tokenizer.ggml.eos_token_id"
- UNK_ID = "tokenizer.ggml.unknown_token_id"
- SEP_ID = "tokenizer.ggml.seperator_token_id"
- PAD_ID = "tokenizer.ggml.padding_token_id"
- ADD_BOS = "tokenizer.ggml.add_bos_token"
- ADD_EOS = "tokenizer.ggml.add_eos_token"
- ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
- HF_JSON = "tokenizer.huggingface.json"
- RWKV = "tokenizer.rwkv.world"
- CHAT_TEMPLATE = "tokenizer.chat_template"
+ MODEL = "tokenizer.ggml.model"
+ LIST = "tokenizer.ggml.tokens"
+ TOKEN_TYPE = "tokenizer.ggml.token_type"
+ TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # for BERT-style token types
+ SCORES = "tokenizer.ggml.scores"
+ MERGES = "tokenizer.ggml.merges"
+ BOS_ID = "tokenizer.ggml.bos_token_id"
+ EOS_ID = "tokenizer.ggml.eos_token_id"
+ UNK_ID = "tokenizer.ggml.unknown_token_id"
+ SEP_ID = "tokenizer.ggml.seperator_token_id"
+ PAD_ID = "tokenizer.ggml.padding_token_id"
+ ADD_BOS = "tokenizer.ggml.add_bos_token"
+ ADD_EOS = "tokenizer.ggml.add_eos_token"
+ ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
+ HF_JSON = "tokenizer.huggingface.json"
+ RWKV = "tokenizer.rwkv.world"
+ CHAT_TEMPLATE = "tokenizer.chat_template"
#
@@ -122,6 +124,7 @@ class MODEL_TENSOR(IntEnum):
ATTN_OUT = auto()
ATTN_NORM = auto()
ATTN_NORM_2 = auto()
+ ATTN_OUT_NORM = auto()
ATTN_ROT_EMBD = auto()
FFN_GATE_INP = auto()
FFN_NORM = auto()
@@ -134,6 +137,7 @@ class MODEL_TENSOR(IntEnum):
FFN_UP_EXP = auto()
ATTN_Q_NORM = auto()
ATTN_K_NORM = auto()
+ LAYER_OUT_NORM = auto()
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -178,6 +182,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
+ MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
@@ -187,6 +192,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}",
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}",
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}",
+ MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
}
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -262,17 +268,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
],
MODEL_ARCH.BERT: [
MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.TOKEN_EMBD_NORM,
MODEL_TENSOR.TOKEN_TYPES,
MODEL_TENSOR.POS_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
- MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_OUT_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
- MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
+ MODEL_TENSOR.LAYER_OUT_NORM,
],
MODEL_ARCH.MPT: [
MODEL_TENSOR.TOKEN_EMBD,
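
The new TENSOR_NAMES entries above resolve to per-block GGUF tensor names through str.format. A small sketch grounded in the constants just added; the block index 0 is chosen only for illustration:

from gguf.constants import MODEL_TENSOR, TENSOR_NAMES

TENSOR_NAMES[MODEL_TENSOR.ATTN_OUT_NORM].format(bid=0)   # -> "blk.0.attn_output_norm"
TENSOR_NAMES[MODEL_TENSOR.LAYER_OUT_NORM].format(bid=0)  # -> "blk.0.layer_output_norm"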
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 16808196..7af58a46 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -357,6 +357,9 @@ class GGUFWriter:
def add_layer_norm_rms_eps(self, value: float) -> None:
self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value)
+ def add_causal_attention(self, value: bool) -> None:
+ self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
+
def add_rope_dimension_count(self, count: int) -> None:
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
@@ -387,6 +390,9 @@ class GGUFWriter:
def add_token_types(self, types: Sequence[TokenType] | Sequence[int]) -> None:
self.add_array(Keys.Tokenizer.TOKEN_TYPE, types)
+ def add_token_type_count(self, value: int) -> None:
+ self.add_uint32(Keys.Tokenizer.TOKEN_TYPE_COUNT, value)
+
def add_token_scores(self, scores: Sequence[float]) -> None:
self.add_array(Keys.Tokenizer.SCORES, scores)
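
Note how the two new writer methods scope their keys differently: the causal flag is formatted per architecture, while the token-type count lives in the shared tokenizer.ggml namespace. A short sketch of the resulting key strings, using "bert" as an example arch:

from gguf.constants import Keys

Keys.Attention.CAUSAL.format(arch="bert")  # -> "bert.attention.causal"
Keys.Tokenizer.TOKEN_TYPE_COUNT            # -> "tokenizer.ggml.token_type_count"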
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 4f16d850..c7ba1420 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -30,6 +30,7 @@ class TensorNameMap:
# Normalization of token embeddings
MODEL_TENSOR.TOKEN_EMBD_NORM: (
"word_embeddings_layernorm", # bloom
+ "embeddings.LayerNorm", # bert
),
# Position embeddings
@@ -54,7 +55,6 @@ class TensorNameMap:
"transformer.ln_f", # gpt2 gpt-j falcon
"model.norm", # llama-hf baichuan internlm2
"norm", # llama-pth
- "embeddings.LayerNorm", # bert
"transformer.norm_f", # mpt
"ln_f", # refact bloom qwen gpt2
"language_model.encoder.final_layernorm", # persimmon
@@ -79,7 +79,6 @@ class TensorNameMap:
"transformer.h.{bid}.ln_mlp", # falcon40b
"model.layers.{bid}.input_layernorm", # llama-hf
"layers.{bid}.attention_norm", # llama-pth
- "encoder.layer.{bid}.attention.output.LayerNorm", # bert
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
"model.layers.{bid}.ln1", # yi
"h.{bid}.ln_1", # gpt2
@@ -155,6 +154,11 @@ class TensorNameMap:
"model.layers.{bid}.attention.wo", # internlm2
),
+ # Attention output norm
+ MODEL_TENSOR.ATTN_OUT_NORM: (
+ "encoder.layer.{bid}.attention.output.LayerNorm", # bert
+ ),
+
# Rotary embeddings
MODEL_TENSOR.ATTN_ROT_EMBD: (
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
@@ -171,7 +175,6 @@ class TensorNameMap:
"transformer.blocks.{bid}.norm_2", # mpt
"model.layers.{bid}.post_attention_layernorm", # llama-hf
"layers.{bid}.ffn_norm", # llama-pth
- "encoder.layer.{bid}.output.LayerNorm", # bert
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
"model.layers.{bid}.ln2", # yi
"h.{bid}.ln_2", # gpt2
@@ -266,6 +269,10 @@ class TensorNameMap:
MODEL_TENSOR.ROPE_FREQS: (
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
),
+
+ MODEL_TENSOR.LAYER_OUT_NORM: (
+ "encoder.layer.{bid}.output.LayerNorm", # bert
+ )
}
mapping: dict[str, tuple[MODEL_TENSOR, str]]
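
Taken together, the mapping changes let a converter translate Hugging Face BERT tensor names into their GGUF counterparts. A hedged sketch, assuming the get_tensor_name_map helper and the TensorNameMap.get_name signature from tensor_mapping.py of this era; the block count of 12 is illustrative (bert-base):

from gguf.constants import MODEL_ARCH
from gguf.tensor_mapping import get_tensor_name_map

tmap = get_tensor_name_map(MODEL_ARCH.BERT, 12)  # assumed helper; 12 encoder blocks
name = tmap.get_name("encoder.layer.0.output.LayerNorm.weight",
                     try_suffixes=(".weight", ".bias"))
# name == "blk.0.layer_output_norm.weight", per the LAYER_OUT_NORM mapping above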