summary | refs | log | tree | commit | diff
path: root/convert.py
diff options
context:
space:
mode:
author slaren <slarengh@gmail.com> 2024-04-03 15:07:05 +0200
committer GitHub <noreply@github.com> 2024-04-03 16:07:05 +0300
commit 08a0c0206075556e82aca0feafad530dcc5f1426 (patch)
tree 3937cd263076c548ba25348253dcec6d355b8fef /convert.py
parent 52604860f93063ef98863921da697576af1c7665 (diff)
ggml : mul_mat_id use the same tensor for all the experts (#6387)
* ggml : update mul_mat_id to use the same tensor for all the experts
* update cuda
* minor
* update metal
* update test-backend-ops
* fix cuda
* Update ggml-metal.m
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* update convert.py
* update convert-hf-to-gguf.py
* update convert.py for mixtral hf models
* Update convert-hf-to-gguf.py
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* cuda : support non-pow-2 number of experts
* allow quantize to work for split and merged experts models in the same way
* cleanup + disable mmap automatically with split tensors models
* update imatrix
* test-backend-ops : test qwen argsort
* update grok model loading
* llama : add merged experts tensors to the grok tensor map
* minor
* gguf : bump version
* fix quantizing of merged experts
* convert-hf-to-gguf.py : update grok (untested)
* make linter happy
* cuda/argsort : use shared memory instead of pool memory
* convert : fix grok tensor names
* metal : add support for non-pow-2 argsort
* llama : more loader cleanup, better error checking
* cuda : fix warning
* llama : still use mmap for loading old models, but copy the data to a host buffer
* add review note
* llama : remove ffn tensor counting + add sanity check
  ggml-ci
* convert : fix handling of n_experts == None
  ggml-ci
* imatrix : fix ncall counters
* llama : produce error if imatrix size does not match
* quantize : terminate on errors + trace logs
  ggml-ci
* metal : pad shared memory to 16 bytes
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'convert.py')
-rwxr-xr-x convert.py 25
1 file changed, 25 insertions, 0 deletions
diff --git a/convert.py b/convert.py
index d3a9ccaf..244eb758 100755
--- a/convert.py
+++ b/convert.py
@@ -828,6 +828,15 @@ def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
+def pack_experts_lazy(lazy_tensors: list[LazyTensor]) -> LazyTensor:
+ def load() -> Tensor:
+ tensors = [lazy_tensor.load() for lazy_tensor in lazy_tensors]
+ return UnquantizedTensor(np.array([tensor.ndarray for tensor in tensors]))
+ s = lazy_tensors[0].shape.copy()
+ s.insert(0, len(lazy_tensors))
+ return LazyTensor(load, s, lazy_tensors[0].data_type, 'pack_experts ' + ' | '.join(lt.description for lt in lazy_tensors))
+
+
# Functionality that simulates `torch.load` but where individual tensors are
# only loaded into memory on demand, not all at once.
# PyTorch can't do this natively as of time of writing:
@@ -1246,6 +1255,22 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
tmp = model
+ # merge experts into one tensor
+ if params.n_experts and params.n_experts > 0:
+ for i_l in range(params.n_layer):
+ for w in range(1, 4):
+ experts = []
+ for e in range(params.n_experts):
+ if f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight" in model:
+ experts.append(model[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"])
+ del tmp[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"]
+ elif f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight" in model:
+ experts.append(model[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"])
+ del tmp[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"]
+ else:
+ raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight")
+ tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts)
+
# HF models permut or pack some of the tensors, so we need to undo that
for i in itertools.count():
if f"model.layers.{i}.self_attn.q_proj.weight" in model: