gpt2 : Add gpt2 architecture integration (#4555)

author: manikbhandari <mbbhandarimanik2@gmail.com> 2023-12-28 09:03:57 -0500
committer: GitHub <noreply@github.com> 2023-12-28 15:03:57 +0100
commit: ea5497df5d138c83b2b0ca70aefdc4b1175c1001 (patch)
tree: c66c6a7c29c8a60c5758ef534aec46bd792175a9 /gguf-py/gguf/tensor_mapping.py
parent: f6793491b5af6da75edad34d6f503ef86d31b09f (diff)
1 file changed, 9 insertions, 1 deletion
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 0b8f7041..80c1d544 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -17,6 +17,7 @@ class TensorNameMap:
             "tok_embeddings",                            # llama-pth
             "embeddings.word_embeddings",                # bert
             "language_model.embedding.word_embeddings",  # persimmon
+            "wte",                                       # gpt2
             "transformer.embd.wte",                      # phi2
         ),
 
@@ -34,6 +35,7 @@ class TensorNameMap:
         MODEL_TENSOR.POS_EMBD: (
             "transformer.wpe",                 # gpt2
             "embeddings.position_embeddings",  # bert
+            "wpe",                             # gpt2
         ),
 
         # Output
@@ -53,7 +55,7 @@ class TensorNameMap:
             "norm",                                    # llama-pth
             "embeddings.LayerNorm",                    # bert
             "transformer.norm_f",                      # mpt
-            "ln_f",                                    # refact bloom qwen
+            "ln_f",                                    # refact bloom qwen gpt2
             "language_model.encoder.final_layernorm",  # persimmon
             "lm_head.ln",                              # phi2
         ),
@@ -78,6 +80,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.attention.output.LayerNorm",       # bert
             "language_model.encoder.layers.{bid}.input_layernorm",  # persimmon
             "model.layers.{bid}.ln1",                               # yi
+            "h.{bid}.ln_1",                                         # gpt2
             "transformer.h.{bid}.ln",                               # phi2
             "model.layers.layers.{bid}.norm",                       # plamo
         ),
@@ -95,6 +98,7 @@ class TensorNameMap:
             "transformer.h.{bid}.self_attention.query_key_value",                  # falcon
             "h.{bid}.self_attention.query_key_value",                              # bloom
             "language_model.encoder.layers.{bid}.self_attention.query_key_value",  # persimmon
+            "h.{bid}.attn.c_attn",                                                 # gpt2
             "transformer.h.{bid}.mixer.Wqkv",                                      # phi2
         ),
 
@@ -137,6 +141,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.attention.output.dense",                # bert
             "transformer.h.{bid}.attn.out_proj",                         # gpt-j
             "language_model.encoder.layers.{bid}.self_attention.dense",  # persimmon
+            "h.{bid}.attn.c_proj",                                       # gpt2
             "transformer.h.{bid}.mixer.out_proj",                        # phi2
             "model.layers.layers.{bid}.self_attn.o_proj",                # plamo
         ),
@@ -159,6 +164,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.output.LayerNorm",                          # bert
             "language_model.encoder.layers.{bid}.post_attention_layernorm",  # persimmon
             "model.layers.{bid}.ln2",                                        # yi
+            "h.{bid}.ln_2",                                                  # gpt2
         ),
 
         MODEL_TENSOR.FFN_GATE_INP: (
@@ -179,6 +185,7 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.fc_in",                          # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h",  # persimmon
             "transformer.h.{bid}.mlp.w1",                             # qwen
+            "h.{bid}.mlp.c_fc",                                       # gpt2
             "transformer.h.{bid}.mlp.fc1",                            # phi2
             "model.layers.layers.{bid}.mlp.up_proj",                  # plamo
         ),
@@ -218,6 +225,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.output.dense",                       # bert
             "transformer.h.{bid}.mlp.fc_out",                         # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
+            "h.{bid}.mlp.c_proj",                                     # gpt2
             "transformer.h.{bid}.mlp.fc2",                            # phi2
             "model.layers.layers.{bid}.mlp.down_proj",                # plamo
         ),
author	manikbhandari <mbbhandarimanik2@gmail.com>	2023-12-28 09:03:57 -0500
committer	GitHub <noreply@github.com>	2023-12-28 15:03:57 +0100
commit	ea5497df5d138c83b2b0ca70aefdc4b1175c1001 (patch)
tree	c66c6a7c29c8a60c5758ef534aec46bd792175a9 /gguf-py/gguf/tensor_mapping.py
parent	f6793491b5af6da75edad34d6f503ef86d31b09f (diff)