summary refs log tree commit diff
diff options
context:
space:
mode:
author Jared Van Bortel <jared@nomic.ai> 2024-02-22 17:05:23 -0500
committer GitHub <noreply@github.com> 2024-02-22 17:05:23 -0500
commit 15499eb94227401bdc8875da6eb85c15d37068f7 (patch)
tree 304ceb65978864af454d9b5ad7ad08fc4f673326
parent 96633eeca1265ed03e57230de54032041c58f9cd (diff)
mpt : do not duplicate token_embd.weight on disk (#5670)
-rwxr-xr-x convert-hf-to-gguf.py 5
-rw-r--r-- llama.cpp 6
2 files changed, 4 insertions, 7 deletions
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 481198da..9bdfce07 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -622,11 +622,6 @@ class MPTModel(Model):
self.gguf_writer.add_tensor(new_name, data)
- # note: MPT output is tied to (same as) wte in original model;
- # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
- if new_name == "token_embd.weight":
- self.gguf_writer.add_tensor("output.weight", data)
-
class OrionModel(Model):
def set_vocab(self):
diff --git a/llama.cpp b/llama.cpp
index 2ebd40df..37477e6e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -509,7 +509,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
- { LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
@@ -4056,7 +4055,10 @@ static bool llm_load_tensors(
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ // same as tok_embd, duplicated to allow offloading
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ ml.n_created--; // artificial tensor
+ ml.size_data += ggml_nbytes(model.output);
}
for (int i = 0; i < n_layer; ++i) {