author     Shijie <821898965@qq.com>    2024-04-16 23:40:48 +0800
committer  GitHub <noreply@github.com>  2024-04-16 18:40:48 +0300
commit     f4dea7da1841a92d2788b0535063abf2f0e28461 (patch)
tree       c7a729d974e4315c71c78eea84fa08dda920b649 /gguf-py/gguf/tensor_mapping.py
parent     8a56075b07a8b571bf95a912ffdce4c928c2b414 (diff)
llama : add qwen2moe (#6074)
* support qwen2moe
* fix-review
* metal : support unary ops for nelements % 4 != 0
* metal : require contiguousness for float4 unary kernels
* metal : require contiguousness for float4 unary kernels (cont)
* fix-review
* names : for brevity "SHARED_EXP" -> "SHEXP"
* llama : reuse build_moe_ffn()
* llama : add model type name

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'gguf-py/gguf/tensor_mapping.py')
-rw-r--r--  gguf-py/gguf/tensor_mapping.py  34
1 file changed, 27 insertions, 7 deletions
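
The hunks below register the Hugging Face tensor names that qwen2moe checkpoints use for the shared expert (the new *_SHEXP entries) and for the merged per-expert projections. As a rough illustration of how such {bid}-parameterized patterns resolve to GGUF tensor types, here is a minimal, self-contained sketch; the lookup helper and the layer count are hypothetical, and only the pattern strings are taken from the diff.

    # Sketch only: how the shared-expert name patterns added in this commit
    # could be matched against names found in a qwen2moe checkpoint.
    SHEXP_PATTERNS = {
        "FFN_GATE_INP_SHEXP": "model.layers.{bid}.mlp.shared_expert_gate",
        "FFN_UP_SHEXP":       "model.layers.{bid}.mlp.shared_expert.up_proj",
        "FFN_GATE_SHEXP":     "model.layers.{bid}.mlp.shared_expert.gate_proj",
        "FFN_DOWN_SHEXP":     "model.layers.{bid}.mlp.shared_expert.down_proj",
    }

    def lookup(hf_name: str, n_layers: int = 24) -> str | None:
        # Expand {bid} for every layer and compare against the checkpoint name.
        for tensor, pattern in SHEXP_PATTERNS.items():
            for bid in range(n_layers):
                if hf_name == pattern.format(bid=bid):
                    return tensor
        return None

    print(lookup("model.layers.3.mlp.shared_expert.up_proj"))  # -> FFN_UP_SHEXP

The real TensorNameMap does not match lazily like this; as the last hunk shows, it pre-builds a flat dictionary of every formatted name in __init__ (self.mapping[tensor_name] = (tensor, tensor_name)), so checkpoint names can then be resolved by dictionary lookup.
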
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index ec6fcbb8..10de36fa 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -208,10 +208,15 @@ class TensorNameMap:
MODEL_TENSOR.FFN_GATE_INP: (
"layers.{bid}.feed_forward.gate", # mixtral
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
+ "model.layers.{bid}.mlp.gate", # qwen2moe
"transformer.decoder_layer.{bid}.router", # Grok
"transformer.blocks.{bid}.ffn.router.layer", # dbrx
),
+ MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
+ "model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe
+ ),
+
# Feed-forward up
MODEL_TENSOR.FFN_UP: (
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
@@ -236,9 +241,14 @@ class TensorNameMap:
),
MODEL_TENSOR.FFN_UP_EXP: (
- "layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
- "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
- "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
+ "layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
+ "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
+ "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
+ "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe (merged)
+ ),
+
+ MODEL_TENSOR.FFN_UP_SHEXP: (
+ "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
),
# AWQ-activation gate
@@ -260,6 +270,11 @@ class TensorNameMap:
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
+ "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe (merged)
+ ),
+
+ MODEL_TENSOR.FFN_GATE_SHEXP: (
+ "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
),
# Feed-forward down
@@ -285,9 +300,14 @@ class TensorNameMap:
),
MODEL_TENSOR.FFN_DOWN_EXP: (
- "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
- "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
- "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
+ "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
+ "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
+ "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
+ "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe (merged)
+ ),
+
+ MODEL_TENSOR.FFN_DOWN_SHEXP: (
+ "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
),
MODEL_TENSOR.ATTN_Q_NORM: (
@@ -366,7 +386,7 @@ class TensorNameMap:
if tensor not in MODEL_TENSORS[arch]:
continue
# TODO: make this configurable
- n_experts = 8
+ n_experts = 60
for xid in range(n_experts):
tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
self.mapping[tensor_name] = (tensor, tensor_name)
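
The last hunk raises the hard-coded expert count so that name patterns containing an {xid} placeholder are expanded for up to 60 experts per layer (the merged qwen2moe projections above carry all experts in one tensor and need no {xid}). A small sketch of that per-expert expansion, using an illustrative pattern rather than one from the table, and storing plain strings where the real table stores (tensor, formatted name) pairs:

    # Sketch only: fan one {xid}-parameterized pattern out into per-expert names,
    # mirroring the loop shown in the last hunk of the diff.
    n_experts = 60   # raised from 8 in this commit; still a hard-coded TODO
    bid = 0          # block (layer) index
    pattern = "model.layers.{bid}.mlp.experts.{xid}.up_proj"  # illustrative only

    mapping: dict[str, str] = {}
    for xid in range(n_experts):
        name = pattern.format(bid=bid, xid=xid)
        mapping[name] = name

    print(len(mapping))       # 60 entries generated from this single pattern
    print(list(mapping)[:2])  # ['model.layers.0.mlp.experts.0.up_proj',
                              #  'model.layers.0.mlp.experts.1.up_proj']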