diff options
Diffstat (limited to 'gguf-py')
-rw-r--r-- | gguf-py/gguf/constants.py | 25 | ||||
-rw-r--r-- | gguf-py/gguf/tensor_mapping.py | 19 |
2 files changed, 43 insertions, 1 deletions
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 67e23dcc..c9ae259e 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -139,6 +139,7 @@ class MODEL_ARCH(IntEnum): COMMAND_R = auto() DBRX = auto() OLMO = auto() + ARCTIC = auto() class MODEL_TENSOR(IntEnum): @@ -167,6 +168,7 @@ class MODEL_TENSOR(IntEnum): FFN_DOWN = auto() FFN_UP = auto() FFN_ACT = auto() + FFN_NORM_EXP = auto() FFN_GATE_EXP = auto() FFN_DOWN_EXP = auto() FFN_UP_EXP = auto() @@ -218,6 +220,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.DBRX: "dbrx", MODEL_ARCH.OLMO: "olmo", + MODEL_ARCH.ARCTIC: "arctic", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -251,6 +254,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp", MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp", MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", + MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps", MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", @@ -732,6 +736,27 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.ARCTIC: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_NORM_EXP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 8e1cac91..8b1b21d7 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -244,6 +244,7 @@ class TensorNameMap: "encoder.layers.{bid}.mlp.fc11", # nomic-bert "model.layers.{bid}.mlp.c_fc", # starcoder2 "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 + "model.layers.{bid}.residual_mlp.w3", # arctic ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -272,6 +273,7 @@ class TensorNameMap: "encoder.layers.{bid}.mlp.fc12", # nomic-bert "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 "transformer.h.{bid}.mlp.linear_1", # refact + "model.layers.{bid}.residual_mlp.w1", # arctic ), MODEL_TENSOR.FFN_GATE_EXP: ( @@ -306,6 +308,7 @@ class TensorNameMap: "encoder.layers.{bid}.mlp.fc2", # nomic-bert "model.layers.{bid}.mlp.c_proj", # starcoder2 "encoder.layer.{bid}.mlp.wo", # jina-bert-v2 + "model.layers.{bid}.residual_mlp.w2", # arctic ), MODEL_TENSOR.FFN_DOWN_EXP: ( @@ -382,6 +385,18 @@ class TensorNameMap: ), } + # architecture-specific block mappings + arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = { + MODEL_ARCH.ARCTIC: { + MODEL_TENSOR.FFN_NORM: ( + "model.layers.{bid}.residual_layernorm", + ), + MODEL_TENSOR.FFN_NORM_EXP: ( + "model.layers.{bid}.post_attention_layernorm", + ), + }, + } + mapping: dict[str, tuple[MODEL_TENSOR, str]] def __init__(self, arch: MODEL_ARCH, n_blocks: int): @@ -393,12 +408,14 @@ class TensorNameMap: self.mapping[tensor_name] = (tensor, tensor_name) for key in keys: self.mapping[key] = (tensor, tensor_name) + if arch in self.arch_block_mappings_cfg: + self.block_mappings_cfg.update(self.arch_block_mappings_cfg[arch]) for bid in range(n_blocks): for tensor, keys in self.block_mappings_cfg.items(): if tensor not in MODEL_TENSORS[arch]: continue # TODO: make this configurable - n_experts = 60 + n_experts = 128 for xid in range(n_experts): tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid) self.mapping[tensor_name] = (tensor, tensor_name) |