path: root/gguf-py/gguf/tensor_mapping.py
author    fairydreaming <166155368+fairydreaming@users.noreply.github.com>    2024-05-28 17:07:05 +0200
committer GitHub <noreply@github.com>    2024-05-28 17:07:05 +0200
commit    ee3dff6b8e39bb8c1cdea1782a7b95ef0118f970 (patch)
tree      28eaea501c6c929f98442cc451eb14b380c02de6 /gguf-py/gguf/tensor_mapping.py
parent    edc29433fa08b4e5aeb67649a29fc7713af13d04 (diff)
Add support for DeepseekV2ForCausalLM (#7519)
* common : increase max number of experts to 160
* common : add tensors ATTN_Q_A, ATTN_Q_A_NORM, ATTN_Q_B, ATTN_KV_A_MQA, ATTN_KV_A_NORM, ATTN_KV_B needed by the DeepSeek-V2 MLA (multi-head latent attention) architecture
* common : add model header parameters: leading_dense_block_count, expert_feed_forward_length, expert_shared_count, expert_weights_scale, attention.q_lora_rank, attention.kv_lora_rank, rope.scaling.yarn_log_multiplier
* convert-hf : add model conversion support for DeepseekV2ForCausalLM
* llama : add model types for DeepSeek-V2 and DeepSeek-V2-Lite models
* llama : add two new llm_build_moe_ffn() arguments: scale_w (whether to scale weights of selected MoE experts) and w_scale (numerical value of the scaling factor); see the routing sketch below
* llama : add inference support for LLM_ARCH_DEEPSEEK2
---------
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
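The effect of the two new llm_build_moe_ffn() arguments is easiest to see in a routing sketch. The Python below is only an illustration of the described behaviour, not the llama.cpp implementation (which is C/C++); the shapes, the top-k routing, and the example values (6 experts used, w_scale=16.0) are assumptions:

import torch

def route_experts(logits: torch.Tensor, n_expert_used: int,
                  scale_w: bool, w_scale: float):
    # Per-token routing probabilities over all experts.
    probs = torch.softmax(logits, dim=-1)
    # Keep only the selected (top-k) experts and their weights.
    weights, expert_idx = torch.topk(probs, n_expert_used, dim=-1)
    if scale_w:
        # DeepSeek-V2 rescales the selected experts' weights by a
        # model-specific factor (the expert_weights_scale header parameter).
        weights = weights * w_scale
    return weights, expert_idx

# Example: 8 tokens routed over 160 experts, 6 experts used per token.
logits = torch.randn(8, 160)
weights, idx = route_experts(logits, n_expert_used=6, scale_w=True, w_scale=16.0)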
Diffstat (limited to 'gguf-py/gguf/tensor_mapping.py')
-rw-r--r--  gguf-py/gguf/tensor_mapping.py  29
1 file changed, 28 insertions(+), 1 deletion(-)
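The ATTN_Q_A / ATTN_Q_A_NORM / ATTN_Q_B tensors mapped below form MLA's low-rank query path. A minimal Python sketch of that path; the dimensions are illustrative assumptions rather than the model's config, and nn.RMSNorm needs PyTorch >= 2.4:

import torch
import torch.nn as nn

hidden_size = 2048   # model width (illustrative)
q_lora_rank = 256    # the attention.q_lora_rank header parameter (illustrative)
n_head, head_dim = 16, 128

q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False)        # ATTN_Q_A
q_a_layernorm = nn.RMSNorm(q_lora_rank)                           # ATTN_Q_A_NORM
q_b_proj = nn.Linear(q_lora_rank, n_head * head_dim, bias=False)  # ATTN_Q_B

x = torch.randn(1, 8, hidden_size)        # (batch, seq, hidden)
q = q_b_proj(q_a_layernorm(q_a_proj(x)))  # queries recovered through the low-rank bottleneck
# The K/V path is analogous: ATTN_KV_A_MQA compresses into a shared latent,
# ATTN_KV_A_NORM normalizes it, and ATTN_KV_B expands it to per-head keys/values.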
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 8b1b21d7..83e3c4c3 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -256,6 +256,7 @@ class TensorNameMap:
MODEL_TENSOR.FFN_UP_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
+ "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek2
),
# AWQ-activation gate
@@ -285,6 +286,7 @@ class TensorNameMap:
MODEL_TENSOR.FFN_GATE_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
+ "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek2
),
# Feed-forward down
@@ -320,6 +322,7 @@ class TensorNameMap:
MODEL_TENSOR.FFN_DOWN_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
+ "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek2
),
MODEL_TENSOR.ATTN_Q_NORM: (
@@ -383,6 +386,30 @@ class TensorNameMap:
"model.layers.{bid}.out_proj",
"backbone.layers.{bid}.mixer.out_proj",
),
+
+ MODEL_TENSOR.ATTN_Q_A: (
+ "model.layers.{bid}.self_attn.q_a_proj", # deepseek2
+ ),
+
+ MODEL_TENSOR.ATTN_Q_B: (
+ "model.layers.{bid}.self_attn.q_b_proj", # deepseek2
+ ),
+
+ MODEL_TENSOR.ATTN_KV_A_MQA: (
+ "model.layers.{bid}.self_attn.kv_a_proj_with_mqa", # deepseek2
+ ),
+
+ MODEL_TENSOR.ATTN_KV_B: (
+ "model.layers.{bid}.self_attn.kv_b_proj", # deepseek2
+ ),
+
+ MODEL_TENSOR.ATTN_Q_A_NORM: (
+ "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2
+ ),
+
+ MODEL_TENSOR.ATTN_KV_A_NORM: (
+ "model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
+ ),
}
# architecture-specific block mappings
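All of the entries above follow one pattern: a MODEL_TENSOR key lists every per-architecture Hugging Face tensor name that should resolve to the same GGUF tensor. A self-contained sketch of that lookup, simplified from the real class; the GGUF-side name used here is an assumption for illustration:

# Two source architectures, one logical tensor: whichever HF name matches,
# the result is the same GGUF tensor for that block.
FFN_UP_SHEXP_SOURCES = (
    "model.layers.{bid}.mlp.shared_expert.up_proj",   # qwen2moe
    "model.layers.{bid}.mlp.shared_experts.up_proj",  # deepseek2
)

def resolve_ffn_up_shexp(hf_name: str, n_blocks: int) -> str | None:
    for bid in range(n_blocks):
        for template in FFN_UP_SHEXP_SOURCES:
            if hf_name == template.format(bid=bid):
                return f"blk.{bid}.ffn_up_shexp"  # assumed GGUF naming
    return None

print(resolve_ffn_up_shexp("model.layers.3.mlp.shared_experts.up_proj", 27))
# -> blk.3.ffn_up_shexp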
@@ -415,7 +442,7 @@ class TensorNameMap:
if tensor not in MODEL_TENSORS[arch]:
continue
# TODO: make this configurable
- n_experts = 128
+ n_experts = 160
for xid in range(n_experts):
tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
self.mapping[tensor_name] = (tensor, tensor_name)
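The n_experts bump matters because per-expert tensor name templates contain an {xid} placeholder that the constructor expands eagerly, one entry per (block, expert) pair; with the old limit of 128, DeepSeek-V2 experts 128-159 would never receive a mapping. A minimal sketch of that expansion, with an assumed template:

EXPERT_DOWN_TEMPLATE = "model.layers.{bid}.mlp.experts.{xid}.down_proj"  # assumed template

n_experts = 160
mapping: dict[str, str] = {}
bid = 0  # a single block, for brevity
for xid in range(n_experts):
    tensor_name = EXPERT_DOWN_TEMPLATE.format(bid=bid, xid=xid)
    mapping[tensor_name] = tensor_name  # the real code stores (tensor_type, name)

assert "model.layers.0.mlp.experts.159.down_proj" in mapping
print(len(mapping))  # 160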