Diffstat (limited to 'main.cpp')
-rw-r--r-- | main.cpp | 273 |
1 file changed, 199 insertions, 74 deletions
@@ -11,6 +11,14 @@
 #include <string>
 #include <vector>

+// determine number of model parts based on the dimension
+static const std::map<int, int> LLAMA_N_PARTS = {
+    { 4096, 1 },
+    { 5120, 2 },
+    { 6656, 4 },
+    { 8192, 8 },
+};
+
 // default hparams (LLaMA 7B)
 struct llama_hparams {
     int32_t n_vocab = 32000;
@@ -82,6 +90,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
     }

     int n_ff = 0;
+    int n_parts = 0;

     // load hparams
     {
@@ -99,6 +108,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         hparams.n_ctx = n_ctx;

         n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
+        n_parts = LLAMA_N_PARTS.at(hparams.n_embd);

         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
@@ -109,6 +119,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
         printf("%s: f16     = %d\n", __func__, hparams.f16);
         printf("%s: n_ff    = %d\n", __func__, n_ff);
+        printf("%s: n_parts = %d\n", __func__, n_parts);
     }

     // load vocab
@@ -220,7 +231,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab

         model.layers.resize(n_layer);

-        model.tok_embeddings = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
+        model.tok_embeddings = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);

         model.norm   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
         model.output = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
@@ -234,14 +245,14 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         for (int i = 0; i < n_layer; ++i) {
             auto & layer = model.layers[i];

-            layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+            layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

-            layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-            layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-            layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-            layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+            layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+            layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+            layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+            layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);

-            layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+            layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

             layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
             layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd);
@@ -282,94 +293,208 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
     }

-    // load weights
-    {
-        int n_tensors = 0;
-        size_t total_size = 0;
+    const size_t file_offset = fin.tellg();

-        printf("%s: ", __func__);
+    fin.close();

-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ftype;
+    std::vector<uint8_t> tmp;

-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
+    for (int i = 0; i < n_parts; ++i) {
+        const int part_id = i;
+        //const int part_id = n_parts - i - 1;

-            if (fin.eof()) {
-                break;
-            }
+        std::string fname_part = fname;
+        if (i > 0) {
+            fname_part += "." + std::to_string(i);
+        }

-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
+        printf("%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());

-            std::string name(length, 0);
-            fin.read(&name[0], length);
+        fin = std::ifstream(fname_part, std::ios::binary);
+        fin.seekg(file_offset);

-            if (model.tensors.find(name.data()) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
-                return false;
-            }
+        // load weights
+        {
+            int n_tensors = 0;
+            size_t total_size = 0;

-            auto tensor = model.tensors[name.data()];
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-                return false;
-            }
+            printf("%s: ", __func__);

-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
+            while (true) {
+                int32_t n_dims;
+                int32_t length;
+                int32_t ftype;

-            if (0) {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
-            }
+                fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+                fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+                fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
+
+                if (fin.eof()) {
+                    break;
+                }
+
+                int32_t nelements = 1;
+                int32_t ne[2] = { 1, 1 };
+                for (int i = 0; i < n_dims; ++i) {
+                    fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+                    nelements *= ne[i];
+                }

-            size_t bpe = 0;
+                std::string name(length, 0);
+                fin.read(&name[0], length);

-            switch (ftype) {
-                case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
-                case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
-                case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
-                case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
-                default:
-                        {
-                            fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
+                if (model.tensors.find(name.data()) == model.tensors.end()) {
+                    fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
+                    return false;
+                }
+
+                // split_type = 0: split by columns
+                // split_type = 1: split by rows
+                int split_type = 0;
+
+                // split_type = 0:
+                // regex:
+                //   - tok_embeddings.*
+                //   - layers.*.attention.wo.weight
+                //   - layers.*.feed_forward.w2.weight
+
+                // split_type = 1:
+                // regex:
+                //   - output.*
+                //   - layers.*.attention.wq.weight
+                //   - layers.*.attention.wk.weight
+                //   - layers.*.attention.wv.weight
+                //   - layers.*.feed_forward.w1.weight
+                //   - layers.*.feed_forward.w3.weight
+                if (name.find("tok_embeddings") != std::string::npos) {
+                    split_type = 0;
+                } else if (name.find("layers") != std::string::npos) {
+                    if (name.find("attention.wo.weight") != std::string::npos) {
+                        split_type = 0;
+                    } else if (name.find("feed_forward.w2.weight") != std::string::npos) {
+                        split_type = 0;
+                    } else {
+                        split_type = 1;
+                    }
+                } else if (name.find("output") != std::string::npos) {
+                    split_type = 1;
+                }
+
+                auto tensor = model.tensors[name.data()];
+
+                if (n_dims == 1) {
+                    if (ggml_nelements(tensor) != nelements) {
+                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
+                        return false;
+                    }
+                } else {
+                    if (ggml_nelements(tensor)/n_parts != nelements) {
+                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
+                        return false;
+                    }
+                }
+
+                if (n_dims == 1) {
+                    if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
+                        fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+                                __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
+                        return false;
+                    }
+                } else {
+                    if (split_type == 0) {
+                        if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
+                            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+                                    __func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
+                            return false;
+                        }
+                    } else {
+                        if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
+                            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+                                    __func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
                             return false;
                         }
-            };
+                    }
+                }

-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
+                if (0) {
+                    static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
+                    printf("%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
+                }
+
+                size_t bpe = 0;
+
+                switch (ftype) {
+                    case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
+                    case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
+                    case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
+                    case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
+                    default:
+                            {
+                                fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
+                                return false;
+                            }
+                };
+
+                if (n_dims == 1 || n_parts == 1) {
+                    if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
+                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+                                __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
+                        return false;
+                    }
+
+                    if (part_id == 0) {
+                        fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+                    } else {
+                        fin.seekg(ggml_nbytes(tensor), std::ios::cur);
+                    }
+
+                    total_size += ggml_nbytes(tensor);
+                } else {
+                    if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
+                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+                                __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
+                        return false;
+                    }
+
+                    if (split_type == 0) {
+                        const int np0 = ne[0];
+
+                        const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
+                        assert(row_size == tensor->nb[1]);
+
+                        for (int i1 = 0; i1 < ne[1]; ++i1) {
+                            const size_t offset_row = i1*row_size;
+                            const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
+                            fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
+                        }
+                    } else {
+                        const int np1 = ne[1];

-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+                        const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);

-            //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
-            total_size += ggml_nbytes(tensor);
-            if (++n_tensors % 8 == 0) {
-                printf(".");
-                fflush(stdout);
+                        for (int i1 = 0; i1 < ne[1]; ++i1) {
+                            const size_t offset_row = (i1 + part_id*np1)*row_size;
+                            fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
+                        }
+                    }
+
+                    total_size += ggml_nbytes(tensor)/n_parts;
+                }
+
+                //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+                if (++n_tensors % 8 == 0) {
+                    printf(".");
+                    fflush(stdout);
+                }
             }

-        }
-
-        printf(" done\n");
+            printf(" done\n");

-        printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
-    }
+            printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
+        }

-    fin.close();
+        fin.close();
+    }

     return true;
 }