Diffstat (limited to 'main.cpp')
-rw-r--r--  main.cpp  273
1 file changed, 199 insertions, 74 deletions
diff --git a/main.cpp b/main.cpp
index eca71408..d28fc916 100644
--- a/main.cpp
+++ b/main.cpp
@@ -11,6 +11,14 @@
#include <string>
#include <vector>
+// determine number of model parts based on the dimension
+static const std::map<int, int> LLAMA_N_PARTS = {
+ { 4096, 1 },
+ { 5120, 2 },
+ { 6656, 4 },
+ { 8192, 8 },
+};
+
// default hparams (LLaMA 7B)
struct llama_hparams {
int32_t n_vocab = 32000;
@@ -82,6 +90,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
}
int n_ff = 0;
+ int n_parts = 0;
// load hparams
{
@@ -99,6 +108,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
hparams.n_ctx = n_ctx;
n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
+ n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
@@ -109,6 +119,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
printf("%s: f16 = %d\n", __func__, hparams.f16);
printf("%s: n_ff = %d\n", __func__, n_ff);
+ printf("%s: n_parts = %d\n", __func__, n_parts);
}
// load vocab
@@ -220,7 +231,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
model.layers.resize(n_layer);
- model.tok_embeddings = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
+ model.tok_embeddings = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
model.output = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
@@ -234,14 +245,14 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
for (int i = 0; i < n_layer; ++i) {
auto & layer = model.layers[i];
- layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+ layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
- layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
- layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
- layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
- layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+ layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+ layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+ layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+ layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
- layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+ layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd);
@@ -282,94 +293,208 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
}
- // load weights
- {
- int n_tensors = 0;
- size_t total_size = 0;
+ const size_t file_offset = fin.tellg();
- printf("%s: ", __func__);
+ fin.close();
- while (true) {
- int32_t n_dims;
- int32_t length;
- int32_t ftype;
+ std::vector<uint8_t> tmp;
- fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
- fin.read(reinterpret_cast<char *>(&length), sizeof(length));
- fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
+ for (int i = 0; i < n_parts; ++i) {
+ const int part_id = i;
+ //const int part_id = n_parts - i - 1;
- if (fin.eof()) {
- break;
- }
+ std::string fname_part = fname;
+ if (i > 0) {
+ fname_part += "." + std::to_string(i);
+ }
- int32_t nelements = 1;
- int32_t ne[2] = { 1, 1 };
- for (int i = 0; i < n_dims; ++i) {
- fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
- nelements *= ne[i];
- }
+ printf("%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
- std::string name(length, 0);
- fin.read(&name[0], length);
+ fin = std::ifstream(fname_part, std::ios::binary);
+ fin.seekg(file_offset);
- if (model.tensors.find(name.data()) == model.tensors.end()) {
- fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
- return false;
- }
+ // load weights
+ {
+ int n_tensors = 0;
+ size_t total_size = 0;
- auto tensor = model.tensors[name.data()];
- if (ggml_nelements(tensor) != nelements) {
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
- return false;
- }
+ printf("%s: ", __func__);
- if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
- __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
- return false;
- }
+ while (true) {
+ int32_t n_dims;
+ int32_t length;
+ int32_t ftype;
- if (0) {
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
- printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
- }
+ fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+ fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+ fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
+
+ if (fin.eof()) {
+ break;
+ }
+
+ int32_t nelements = 1;
+ int32_t ne[2] = { 1, 1 };
+ for (int i = 0; i < n_dims; ++i) {
+ fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+ nelements *= ne[i];
+ }
- size_t bpe = 0;
+ std::string name(length, 0);
+ fin.read(&name[0], length);
- switch (ftype) {
- case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
- case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
- case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
- case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
- default:
- {
- fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
+ if (model.tensors.find(name.data()) == model.tensors.end()) {
+ fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
+ return false;
+ }
+
+ // split_type = 0: split by columns
+ // split_type = 1: split by rows
+ int split_type = 0;
+
+ // split_type = 0:
+ // regex:
+ // - tok_embeddings.*
+ // - layers.*.attention.wo.weight
+ // - layers.*.feed_forward.w2.weight
+
+ // split_type = 1:
+ // regex:
+ // - output.*
+ // - layers.*.attention.wq.weight
+ // - layers.*.attention.wk.weight
+ // - layers.*.attention.wv.weight
+ // - layers.*.feed_forward.w1.weight
+ // - layers.*.feed_forward.w3.weight
+ if (name.find("tok_embeddings") != std::string::npos) {
+ split_type = 0;
+ } else if (name.find("layers") != std::string::npos) {
+ if (name.find("attention.wo.weight") != std::string::npos) {
+ split_type = 0;
+ } else if (name.find("feed_forward.w2.weight") != std::string::npos) {
+ split_type = 0;
+ } else {
+ split_type = 1;
+ }
+ } else if (name.find("output") != std::string::npos) {
+ split_type = 1;
+ }
+
+ auto tensor = model.tensors[name.data()];
+
+ if (n_dims == 1) {
+ if (ggml_nelements(tensor) != nelements) {
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
+ return false;
+ }
+ } else {
+ if (ggml_nelements(tensor)/n_parts != nelements) {
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
+ return false;
+ }
+ }
+
+ if (n_dims == 1) {
+ if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
+ fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+ __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
+ return false;
+ }
+ } else {
+ if (split_type == 0) {
+ if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
+ fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+ __func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
+ return false;
+ }
+ } else {
+ if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
+ fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+ __func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
return false;
}
- };
+ }
+ }
- if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
- __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
- return false;
- }
+ if (0) {
+ static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
+ printf("%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
+ }
+
+ size_t bpe = 0;
+
+ switch (ftype) {
+ case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
+ case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
+ case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
+ case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
+ default:
+ {
+ fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
+ return false;
+ }
+ };
+
+ if (n_dims == 1 || n_parts == 1) {
+ if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+ __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
+ return false;
+ }
+
+ if (part_id == 0) {
+ fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+ } else {
+ fin.seekg(ggml_nbytes(tensor), std::ios::cur);
+ }
+
+ total_size += ggml_nbytes(tensor);
+ } else {
+ if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+ __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
+ return false;
+ }
+
+ if (split_type == 0) {
+ const int np0 = ne[0];
+
+ const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
+ assert(row_size == tensor->nb[1]);
+
+ for (int i1 = 0; i1 < ne[1]; ++i1) {
+ const size_t offset_row = i1*row_size;
+ const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
+ fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
+ }
+ } else {
+ const int np1 = ne[1];
- fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+ const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
- //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
- total_size += ggml_nbytes(tensor);
- if (++n_tensors % 8 == 0) {
- printf(".");
- fflush(stdout);
+ for (int i1 = 0; i1 < ne[1]; ++i1) {
+ const size_t offset_row = (i1 + part_id*np1)*row_size;
+ fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
+ }
+ }
+
+ total_size += ggml_nbytes(tensor)/n_parts;
+ }
+
+ //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+ if (++n_tensors % 8 == 0) {
+ printf(".");
+ fflush(stdout);
+ }
}
- }
- printf(" done\n");
+ printf(" done\n");
- printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
- }
+ printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
+ }
- fin.close();
+ fin.close();
+ }
return true;
}
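
A minimal standalone sketch (not part of this commit; sizes, values, and names such as by_cols/by_rows are made up for illustration) of how the two split types in the hunk above place each part's slice inside the full tensor: with split_type = 0 every part contributes a contiguous block of columns to every row, while with split_type = 1 every part contributes a block of whole rows.

    #include <cstdio>
    #include <vector>

    int main() {
        // Full tensor is 4x4, stored row-major, reassembled from 2 part files.
        const int ne0 = 4, ne1 = 4, n_parts = 2;
        std::vector<int> by_cols(ne0 * ne1, 0), by_rows(ne0 * ne1, 0);

        for (int part_id = 0; part_id < n_parts; ++part_id) {
            // split_type == 0: this part holds ne0/n_parts columns of every row,
            // so each destination row is written at a per-part column offset
            // (mirrors the offset_row + part_id*np0 computation in the loader).
            for (int i1 = 0; i1 < ne1; ++i1)
                for (int j = 0; j < ne0 / n_parts; ++j)
                    by_cols[i1 * ne0 + part_id * (ne0 / n_parts) + j] = part_id + 1;

            // split_type == 1: this part holds ne1/n_parts whole rows, written at
            // a per-part row offset (mirrors (i1 + part_id*np1)*row_size).
            for (int i1 = 0; i1 < ne1 / n_parts; ++i1)
                for (int i0 = 0; i0 < ne0; ++i0)
                    by_rows[(i1 + part_id * (ne1 / n_parts)) * ne0 + i0] = part_id + 1;
        }

        // Print which part (1 or 2) ends up in each cell under either scheme.
        printf("column split      row split\n");
        for (int i1 = 0; i1 < ne1; ++i1) {
            for (int i0 = 0; i0 < ne0; ++i0) printf("%d ", by_cols[i1 * ne0 + i0]);
            printf("          ");
            for (int i0 = 0; i0 < ne0; ++i0) printf("%d ", by_rows[i1 * ne0 + i0]);
            printf("\n");
        }
        return 0;
    }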