path: root/gguf-py
author     slaren <slarengh@gmail.com>       2023-12-13 13:04:25 +0100
committer  GitHub <noreply@github.com>       2023-12-13 14:04:25 +0200
commit     799a1cb13b0b1b560ab0ceff485caed68faa8f1f (patch)
tree       18073e9ffe1f7b69ca584a9cd2cc039e79e6dc63 /gguf-py
parent     fecac45658a99eddc4d6e36ba0310ca8f87a77f0 (diff)
llama : add Mixtral support (#4406)
* convert : support Mixtral as LLAMA arch
* convert : fix n_ff typo
* llama : model loading
* ggml : sync latest ggml_mul_mat_id
* llama : update graph to support MoE
* llama : fix cur -> cur_expert
* llama : first working version
* llama : fix expert weighting in the FFN
* ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only)
* ggml : add n_as argument to ggml_mul_mat_id
* ggml : fix ggml_get_rows to take into account ne02 / ne11
* metal : add more general support for ggml_get_rows + tests
* llama : add basic support for offloading moe with CUDA
* metal : add/mul/div use general kernel when src1 not cont
* metal : reduce the kernel launches for ggml_mul_mat_id
* ggml : get_rows : support non-contiguous tensors with gaps, generalize up to 3D
* ggml : update get_rows f16 and q
* cuda : support non-contiguous src1 in get_rows
* llama : offload missing ffn_moe_silu
* metal : fix ggml_get_rows to work with non-cont src1
* metal : add indirect mat-vec kernels for all quantization types
* llama : do not quantize expert gating tensors
* llama : add n_expert and n_expert_used to hparams + change quants
* test-backend-ops : add moe test
* cuda : fix get_rows when ncols is odd
* convert : determine n_ctx correctly
* metal : fix ggml_mul_mat_id for F32
* test-backend-ops : make experts more evenly probable (test_moe)
* test-backend-ops : cleanup, add moe test for batches
* test-backend-ops : add cpy from f32 -> all types test
* test-backend-ops : fix dequantize block offset
* llama : fix hard-coded number of experts
* test-backend-ops : simplify and disable slow tests to avoid CI timeout
* test-backend-ops : disable MOE test with thread sanitizer
* cuda : fix mul_mat_id with multi gpu
* convert : use 1e6 rope_freq_base for mixtral
* convert : fix style
* convert : support safetensors format
* gguf-py : bump version
* metal : add cpy f16 -> f32 kernel
* metal : fix binary ops for ne10 % 4 != 0
* test-backend-ops : add one more sum_rows test
* ggml : do not use BLAS with ggml_mul_mat_id
* convert-hf : support for mixtral-instruct (#4428)
* convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct
* convert : use sentencepiece tokenizer for Mixtral-instruct
* convert : make flake8 happy
* metal : fix soft_max kernels
  ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92
* metal : limit kernels to not use more than the allowed threads

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Radek Pilar <github@mrkva.eu>
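What the MoE entries above amount to: a per-layer gating tensor (ffn_gate_inp) scores the n_expert FFNs, the top n_expert_used are evaluated per token, and their outputs are blended by the normalized gate weights. A rough numpy sketch of that routing (not the actual llama.cpp graph; Mixtral's n_expert = 8, n_expert_used = 2 are assumed for illustration):

import numpy as np

# Rough sketch of top-k expert routing and weighting as described above.
# `experts` is a list of callables standing in for the per-expert FFNs.
def moe_ffn(x, gate_inp_w, experts, n_expert_used=2):
    logits = x @ gate_inp_w                                 # [n_tokens, n_expert]
    top = np.argsort(-logits, axis=-1)[:, :n_expert_used]   # chosen experts per token
    w = np.take_along_axis(logits, top, axis=-1)
    w = np.exp(w - w.max(axis=-1, keepdims=True))
    w /= w.sum(axis=-1, keepdims=True)                      # normalize gate weights
    out = np.zeros_like(x)
    for t in range(x.shape[0]):
        for j, e in enumerate(top[t]):
            out[t] += w[t, j] * experts[e](x[t])             # weighted expert outputs
    return out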
Diffstat (limited to 'gguf-py')
-rw-r--r--  gguf-py/gguf/constants.py        16
-rw-r--r--  gguf-py/gguf/gguf_writer.py       6
-rw-r--r--  gguf-py/gguf/tensor_mapping.py   39
-rw-r--r--  gguf-py/pyproject.toml            2
4 files changed, 53 insertions, 10 deletions
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 685c88f1..12133882 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -38,6 +38,8 @@ class Keys:
FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
+ EXPERT_COUNT = "{arch}.expert_count"
+ EXPERT_USED_COUNT = "{arch}.expert_used_count"
class Attention:
HEAD_COUNT = "{arch}.attention.head_count"
@@ -111,10 +113,14 @@ class MODEL_TENSOR(IntEnum):
ATTN_NORM = auto()
ATTN_NORM_2 = auto()
ATTN_ROT_EMBD = auto()
+ FFN_GATE_INP = auto()
+ FFN_NORM = auto()
FFN_GATE = auto()
FFN_DOWN = auto()
FFN_UP = auto()
- FFN_NORM = auto()
+ FFN_GATE_EXP = auto()
+ FFN_DOWN_EXP = auto()
+ FFN_UP_EXP = auto()
ATTN_Q_NORM = auto()
ATTN_K_NORM = auto()
@@ -154,10 +160,14 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
+ MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
+ MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}",
+ MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}",
+ MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}",
}
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -172,10 +182,14 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD,
+ MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
+ MODEL_TENSOR.FFN_GATE_EXP,
+ MODEL_TENSOR.FFN_DOWN_EXP,
+ MODEL_TENSOR.FFN_UP_EXP,
],
MODEL_ARCH.GPTNEOX: [
MODEL_TENSOR.TOKEN_EMBD,
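For orientation (not part of the diff): each per-expert template above yields one tensor name per (block, expert) pair. A minimal sketch, assuming the 8-expert default that tensor_mapping.py hard-codes further down:

# Sketch: expanding the new per-expert tensor name templates from TENSOR_NAMES.
# n_experts = 8 mirrors the hard-coded value in TensorNameMap below.
ffn_gate_exp = "blk.{bid}.ffn_gate.{xid}"
names = [ffn_gate_exp.format(bid=0, xid=xid) for xid in range(8)]
# -> ["blk.0.ffn_gate.0", "blk.0.ffn_gate.1", ..., "blk.0.ffn_gate.7"]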
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index b8ec977c..73e02160 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -339,6 +339,12 @@ class GGUFWriter:
def add_clamp_kqv(self, value: float) -> None:
self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)
+ def add_expert_count(self, count: int) -> None:
+ self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)
+
+ def add_expert_used_count(self, count: int) -> None:
+ self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count)
+
def add_layer_norm_eps(self, value: float) -> None:
self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
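A hedged usage sketch of the two new writer methods; the file name and the Mixtral values n_expert = 8, n_expert_used = 2 are illustrative only, a real converter reads them from the model config:

import gguf

# Sketch: recording the MoE hyperparameters with the new GGUFWriter methods.
# "mixtral.gguf" and the counts 8 / 2 are example values.
writer = gguf.GGUFWriter("mixtral.gguf", "llama")
writer.add_expert_count(8)        # written as "llama.expert_count"
writer.add_expert_used_count(2)   # written as "llama.expert_used_count"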
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index cc623601..0115ea1c 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -149,6 +149,11 @@ class TensorNameMap:
"model.layers.{bid}.ln2", # yi
),
+ MODEL_TENSOR.FFN_GATE_INP: (
+ "layers.{bid}.feed_forward.gate", # mixtral
+ "model.layers.{bid}.block_sparse_moe.gate", # mixtral
+ ),
+
# Feed-forward up
MODEL_TENSOR.FFN_UP: (
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
@@ -164,11 +169,21 @@ class TensorNameMap:
"transformer.h.{bid}.mlp.w1", # qwen
),
+ MODEL_TENSOR.FFN_UP_EXP: (
+ "layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral
+ "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
+ ),
+
# Feed-forward gate
MODEL_TENSOR.FFN_GATE: (
- "model.layers.{bid}.mlp.gate_proj", # llama-hf refact
- "layers.{bid}.feed_forward.w1", # llama-pth
- "transformer.h.{bid}.mlp.w2", # qwen
+ "model.layers.{bid}.mlp.gate_proj", # llama-hf refact
+ "layers.{bid}.feed_forward.w1", # llama-pth
+ "transformer.h.{bid}.mlp.w2", # qwen
+ ),
+
+ MODEL_TENSOR.FFN_GATE_EXP: (
+ "layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral
+ "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
),
# Feed-forward down
@@ -185,6 +200,11 @@ class TensorNameMap:
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
),
+ MODEL_TENSOR.FFN_DOWN_EXP: (
+ "layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral
+ "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
+ ),
+
MODEL_TENSOR.ATTN_Q_NORM: (
"language_model.encoder.layers.{bid}.self_attention.q_layernorm",
),
@@ -213,11 +233,14 @@ class TensorNameMap:
for tensor, keys in self.block_mappings_cfg.items():
if tensor not in MODEL_TENSORS[arch]:
continue
- tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
- self.mapping[tensor_name] = (tensor, tensor_name)
- for key in keys:
- key = key.format(bid = bid)
- self.mapping[key] = (tensor, tensor_name)
+ # TODO: make this configurable
+ n_experts = 8
+ for xid in range(n_experts):
+ tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
+ self.mapping[tensor_name] = (tensor, tensor_name)
+ for key in keys:
+ key = key.format(bid = bid, xid = xid)
+ self.mapping[key] = (tensor, tensor_name)
def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
result = self.mapping.get(key)
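To see the expert-indexed mapping in action, a small lookup sketch; it assumes the module-level get_tensor_name_map() helper and the hard-coded n_experts = 8 from the loop above:

import gguf

# Sketch: resolving a Mixtral HF expert tensor name after this change.
tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, 32)   # 32 blocks, as in Mixtral 8x7B
hit = tmap.get_type_and_name("model.layers.0.block_sparse_moe.experts.3.w1.weight",
                             try_suffixes=(".weight", ".bias"))
# expected: (MODEL_TENSOR.FFN_GATE_EXP, "blk.0.ffn_gate.3")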
diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml
index e6374bfe..9789c2c8 100644
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "gguf"
-version = "0.6.0"
+version = "0.7.0"
description = "Read and write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [