diff options
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 1497 |
1 files changed, 772 insertions, 725 deletions
@@ -1,49 +1,26 @@ +#include "llama_util.h" #include "llama.h" +#include "llama_internal.h" #include "ggml.h" +#include <array> #include <cinttypes> #include <fstream> #include <random> #include <map> #include <unordered_map> #include <queue> -#include <regex> #include <cassert> #include <cstring> - -#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES) -#define WIN32_LEAN_AND_MEAN -#include <Windows.h> -#else -#include <sys/types.h> -#include <sys/mman.h> -#include <unistd.h> -#include <fcntl.h> -#endif - -#define Min(X, Y) ((Y) > (X) ? (X) : (Y)) -#define Max(X, Y) ((Y) < (X) ? (X) : (Y)) +#include <climits> +#include <memory> +#include <algorithm> +#include <initializer_list> #define LLAMA_USE_SCRATCH #define LLAMA_MAX_SCRATCH_BUFFERS 16 -#define LLAMA_ASSERT(x) \ - do { \ - if (!(x)) { \ - fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ - abort(); \ - } \ - } while (0) - - -// determine number of model parts based on the dimension -static const std::unordered_map<int, int> LLAMA_N_PARTS = { - { 4096, 1 }, - { 5120, 2 }, - { 6656, 4 }, - { 8192, 8 }, -}; // available llama models enum e_model { @@ -93,14 +70,18 @@ static const std::map<e_model, size_t> MEM_REQ_EVAL = { // default hparams (LLaMA 7B) struct llama_hparams { - int32_t n_vocab = 32000; - int32_t n_ctx = 512; // this is provided as user input? - int32_t n_embd = 4096; - int32_t n_mult = 256; - int32_t n_head = 32; - int32_t n_layer = 32; - int32_t n_rot = 64; - int32_t f16 = 1; + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; // this is provided as user input? + uint32_t n_embd = 4096; + uint32_t n_mult = 256; + uint32_t n_head = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + uint32_t f16 = 1; + + bool operator!=(const llama_hparams & other) const { + return memcmp(this, &other, sizeof(llama_hparams)); + } }; struct llama_layer { @@ -126,11 +107,17 @@ struct llama_kv_cache { struct ggml_tensor * k; struct ggml_tensor * v; - struct ggml_context * ctx; + struct ggml_context * ctx = NULL; - std::vector<uint8_t> buf; + llama_buffer buf; int n; // number of tokens currently in the cache + + ~llama_kv_cache() { + if (ctx) { + ggml_free(ctx); + } + } }; struct llama_model { @@ -146,22 +133,30 @@ struct llama_model { std::vector<llama_layer> layers; // context - struct ggml_context * ctx; + struct ggml_context * ctx = NULL; // key + value cache for the self attention // TODO: move to llama_state struct llama_kv_cache kv_self; // the model memory buffer - std::vector<uint8_t> buf; + llama_buffer buf; // model memory mapped file - void * mm_addr = NULL; - uint64_t mm_length = 0; + std::unique_ptr<llama_mmap> mapping; - // tensors - int n_loaded; - std::unordered_map<std::string, struct ggml_tensor *> tensors; + // objects representing data potentially being locked in memory + llama_mlock mlock_buf; + llama_mlock mlock_mmap; + + // for quantize-stats only + std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name; + + ~llama_model() { + if (ctx) { + ggml_free(ctx); + } + } }; struct llama_vocab { @@ -206,8 +201,8 @@ struct llama_context { // memory buffers used to evaluate the model // TODO: move in llama_state - std::vector<uint8_t> buf_compute; - std::vector<uint8_t> buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; + llama_buffer buf_compute; + llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; int buf_last = 0; size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; @@ -220,11 +215,11 @@ struct llama_context { last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, }); } else { auto & buf = buf_scratch[i]; - last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), }); + last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, }); } if (buf_last >= 0) { - buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size); + buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size); } buf_last = i; @@ -244,6 +239,508 @@ struct llama_context { } }; +template <typename T> +static T checked_mul(T a, T b) { + T ret = a * b; + if (a != 0 && ret / a != b) { + throw format("overflow multiplying %llu * %llu", + (unsigned long long) a, (unsigned long long) b); + } + return ret; +} + +static size_t checked_div(size_t a, size_t b) { + if (b == 0 || a % b != 0) { + throw format("error dividing %zu / %zu", a, b); + } + return a / b; +} + +static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) { + std::string ret = "[" + std::to_string(ne.at(0)); + for (size_t i = 1; i < ne.size(); i++) { + ret += " x " + std::to_string(ne.at(i)); + } + ret += "]"; + return ret; +} + +static const char * llama_format_type(enum ggml_type type) { + switch (type) { + case GGML_TYPE_F32: return "f32"; + case GGML_TYPE_F16: return "f16"; + case GGML_TYPE_Q4_0: return "q4_0"; + case GGML_TYPE_Q4_1: return "q4_1"; + default: LLAMA_ASSERT(false); + } +} + +static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) { + size_t size = ggml_type_size(type); + for (uint32_t dim : ne) { + size = checked_mul<size_t>(size, dim); + } + return size / ggml_blck_size(type); +} + +struct llama_load_tensor_shard { + std::vector<uint32_t> ne; + size_t size; + enum ggml_type type; + size_t file_idx; + size_t file_off; + + void calc_size() { + size = llama_calc_tensor_size(ne, type); + } +}; + +enum llama_split_type { + SPLIT_NONE, + SPLIT_BY_COLUMNS, + SPLIT_BY_ROWS +}; + +struct llama_load_tensor { + std::vector<llama_load_tensor_shard> shards; + + std::string name; + enum ggml_type type = GGML_TYPE_F32; + llama_split_type split_type = SPLIT_NONE; + std::vector<uint32_t> ne; + size_t size; + struct ggml_tensor * ggml_tensor = NULL; + uint8_t * data; + + llama_load_tensor(const std::string & name) : name(name) {} + + void calc_all() { + calc_type(); + calc_split_type(); + calc_ne(); + calc_size(); + } + + void calc_type() { + const auto & first_shard = shards.at(0); + for (const auto & shard : shards) { + if (shard.type != first_shard.type) { + throw format("inconsistent tensor shard type in '%s'", name.c_str()); + } + } + type = first_shard.type; + } + + void calc_split_type() { + if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file + shards.size() == 1) { // only one file? + split_type = SPLIT_NONE; + } else if (name.find("tok_embeddings.") == 0 || + name.find(".attention.wo.weight") != std::string::npos || + name.find(".feed_forward.w2.weight") != std::string::npos) { + split_type = SPLIT_BY_COLUMNS; + } else { + split_type = SPLIT_BY_ROWS; + } + } + + void calc_ne() { + const auto & first_shard = shards.at(0); + for (const auto & shard : shards) { + if (shard.ne != first_shard.ne) { + throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s", + name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()); + } + } + ne = first_shard.ne; + LLAMA_ASSERT(shards.size() <= UINT32_MAX); + uint32_t n_shards = (uint32_t) shards.size(); + switch (split_type) { + case SPLIT_NONE: + ne = first_shard.ne; + break; + case SPLIT_BY_COLUMNS: + ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards), + first_shard.ne[1]}; + break; + case SPLIT_BY_ROWS: + ne = {first_shard.ne[0], + checked_mul<uint32_t>(first_shard.ne[1], n_shards)}; + break; + } + } + + void calc_size() { + size = llama_calc_tensor_size(ne, type); + } +}; + +struct llama_load_tensors_map { + // tensors is kept in a separate vector to preserve file order + std::vector<llama_load_tensor> tensors; + std::unordered_map<std::string, size_t> name_to_idx; +}; + +enum llama_file_version { + LLAMA_FILE_VERSION_GGML, + LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab + LLAMA_FILE_VERSION_GGJT_V1, // added padding +}; + +struct llama_file_loader { + llama_file file; + llama_file_version file_version; + llama_hparams hparams; + llama_vocab vocab; + + llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map) + : file(fname, "rb") { + fprintf(stderr, "llama.cpp: loading model from %s\n", fname); + read_magic(); + read_hparams(); + read_vocab(); + read_tensor_metadata(file_idx, tensors_map); + } + void read_magic() { + uint32_t magic = file.read_u32(); + uint32_t version = 0; + + if (magic != 'ggml') { + version = file.read_u32(); + } + + if (magic == 'ggml' && version == 0) { + file_version = LLAMA_FILE_VERSION_GGML; + } else if (magic == 'ggmf' && version == 1) { + file_version = LLAMA_FILE_VERSION_GGMF_V1; + } else if (magic == 'ggjt' && version == 1) { + file_version = LLAMA_FILE_VERSION_GGJT_V1; + } else { + throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", + magic, version); + } + } + void read_hparams() { + hparams.n_vocab = file.read_u32(); + hparams.n_embd = file.read_u32(); + hparams.n_mult = file.read_u32(); + hparams.n_head = file.read_u32(); + hparams.n_layer = file.read_u32(); + hparams.n_rot = file.read_u32(); + hparams.f16 = file.read_u32(); + } + void read_vocab() { + vocab.id_to_token.resize(hparams.n_vocab); + + for (uint32_t i = 0; i < hparams.n_vocab; i++) { + uint32_t len = file.read_u32(); + std::string word = file.read_string(len); + + float score = 0.0f; + if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) { + file.read_raw(&score, sizeof(score)); + } + + vocab.token_to_id[word] = i; + + auto & tok_score = vocab.id_to_token[i]; + tok_score.tok = std::move(word); + tok_score.score = score; + } + } + void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) { + while (file.tell() < file.size) { + llama_load_tensor_shard shard; + uint32_t n_dims = file.read_u32(); + uint32_t name_len = file.read_u32(); + uint32_t ftype = file.read_u32(); + shard.ne.resize(n_dims); + file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims); + std::string name = file.read_string(name_len); + if (n_dims < 1 || n_dims > 2) { + throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims); + } + switch (ftype) { + case 0: shard.type = GGML_TYPE_F32; break; + case 1: shard.type = GGML_TYPE_F16; break; + case 2: shard.type = GGML_TYPE_Q4_0; break; + case 3: shard.type = GGML_TYPE_Q4_1; break; + default: { + throw format("unrecognized ftype %u\n", ftype); + } + } + + if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) { + // skip to the next multiple of 32 bytes + file.seek(-file.tell() & 31, SEEK_CUR); + } + shard.file_idx = file_idx; + shard.file_off = file.tell(); + + shard.calc_size(); + file.seek(shard.size, SEEK_CUR); + + auto it = tensors_map.name_to_idx.find(name); + size_t idx; + if (it != tensors_map.name_to_idx.end()) { + idx = it->second; + } else { + tensors_map.tensors.emplace_back(name); + idx = tensors_map.tensors.size() - 1; + tensors_map.name_to_idx.emplace(name, idx); + } + tensors_map.tensors.at(idx).shards.push_back(shard); + } + } +}; + +struct llama_file_saver { + llama_file file; + llama_file_loader * any_file_loader; + llama_file_saver(const char * fname, llama_file_loader * any_file_loader, uint32_t new_f16) + : file(fname, "wb"), any_file_loader(any_file_loader) { + fprintf(stderr, "llama.cpp: saving model to %s\n", fname); + write_magic(); + write_hparams(new_f16); + write_vocab(); + } + void write_magic() { + file.write_u32('ggjt'); // magic + file.write_u32(1); // version + } + void write_hparams(uint32_t new_f16) { + const llama_hparams & hparams = any_file_loader->hparams; + file.write_u32(hparams.n_vocab); + file.write_u32(hparams.n_embd); + file.write_u32(hparams.n_mult); + file.write_u32(hparams.n_head); + file.write_u32(hparams.n_layer); + file.write_u32(hparams.n_rot); + file.write_u32(new_f16); + } + void write_vocab() { + if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) { + fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n"); + } + uint32_t n_vocab = any_file_loader->hparams.n_vocab; + for (uint32_t i = 0; i < n_vocab; i++) { + const auto & token_score = any_file_loader->vocab.id_to_token.at(i); + file.write_u32((uint32_t) token_score.tok.size()); + file.write_raw(token_score.tok.data(), token_score.tok.size()); + file.write_raw(&token_score.score, sizeof(token_score.score)); + } + } + void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) { + uint32_t ftype; + switch (new_type) { + case GGML_TYPE_F32: ftype = 0; break; + case GGML_TYPE_F16: ftype = 1; break; + case GGML_TYPE_Q4_0: ftype = 2; break; + case GGML_TYPE_Q4_1: ftype = 3; break; + default: LLAMA_ASSERT(false); + } + file.write_u32((uint32_t) tensor.ne.size()); + file.write_u32((uint32_t) tensor.name.size()); + file.write_u32(ftype); + file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size()); + file.write_raw(tensor.name.data(), tensor.name.size()); + file.seek(-file.tell() & 31, SEEK_CUR); + LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type)); + file.write_raw(new_data, new_size); + } +}; + +struct llama_model_loader { + std::vector<std::unique_ptr<llama_file_loader>> file_loaders; + llama_load_tensors_map tensors_map; + bool use_mmap; + size_t num_ggml_tensors_created = 0; + struct ggml_context * ggml_ctx = NULL; + std::unique_ptr<llama_mmap> mapping; + + llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) { + auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map); + file_loaders.emplace_back(first_file); + uint32_t n_parts = vocab_only ? 1 : guess_n_parts(); + for (uint32_t i = 1; i < n_parts; i++) { + std::string fname = fname_base + "." + std::to_string(i); + auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map); + file_loaders.emplace_back(ith_file); + if (ith_file->hparams != first_file->hparams) { + throw format("llama.cpp: hparams inconsistent between files"); + } + } + if (!llama_mmap::SUPPORTED) { + use_mmap = false; + } + if (use_mmap && alignment_prevents_mmap()) { + fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n"); + use_mmap = false; + } + this->use_mmap = use_mmap; + for (llama_load_tensor & lt : tensors_map.tensors) { + lt.calc_all(); + } + } + + bool alignment_prevents_mmap() { + for (const llama_load_tensor & lt : tensors_map.tensors) { + for (const llama_load_tensor_shard & shard : lt.shards) { + if (shard.file_off & 3) { + return true; + } + } + } + return false; + } + + uint32_t guess_n_parts() const { + auto it = tensors_map.name_to_idx.find("tok_embeddings.weight"); + if (it == tensors_map.name_to_idx.end()) { + throw std::string("missing tok_embeddings.weight"); + } + const llama_load_tensor & lt = tensors_map.tensors.at(it->second); + return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0); + } + + void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const { + *ctx_size_p = *mmapped_size_p = 0; + for (const llama_load_tensor & lt : tensors_map.tensors) { + *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; + *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size; + } + } + + struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) { + auto it = tensors_map.name_to_idx.find(name); + if (it == tensors_map.name_to_idx.end()) { + throw format("llama.cpp: tensor '%s' is missing from model", name.c_str()); + } + llama_load_tensor & lt = tensors_map.tensors.at(it->second); + if (lt.ne != ne) { + throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s", + name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()); + } + return get_tensor_for(lt); + } + + struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) { + struct ggml_tensor * tensor; + if (lt.ne.size() == 2) { + tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1)); + } else { + LLAMA_ASSERT(lt.ne.size() == 1); + tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0)); + } + LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor + lt.ggml_tensor = tensor; + num_ggml_tensors_created++; + return tensor; + } + + void done_getting_tensors() { + if (num_ggml_tensors_created != tensors_map.tensors.size()) { + throw std::string("llama.cpp: file contained more tensors than expected"); + } + } + + void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { + size_t data_size = 0; + for (const llama_load_tensor & lt : tensors_map.tensors) { + data_size += lt.size; + } + + if (use_mmap) { + mapping.reset(new llama_mmap(&file_loaders.at(0)->file)); + if (!lmlock) { + // Don't call the callback since the actual loading will be lazy + // and we can't measure it. + progress_callback = NULL; + } + if (lmlock) { + lmlock->init(mapping->addr); + } + } + + size_t done_size = 0; + for (llama_load_tensor & lt : tensors_map.tensors) { + if (progress_callback) { + progress_callback((float) done_size / data_size, progress_callback_user_data); + } + LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already + lt.data = (uint8_t *) lt.ggml_tensor->data; + load_data_for(lt); + lt.ggml_tensor->data = lt.data; + done_size += lt.size; + if (use_mmap && lmlock) { + lmlock->grow_to(done_size); + } + } + if (progress_callback) { + progress_callback(1.0f, progress_callback_user_data); + } + } + + void load_data_for(llama_load_tensor & lt) { + if (use_mmap) { + LLAMA_ASSERT(lt.shards.size() == 1); + lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off; + } else if (lt.split_type == SPLIT_NONE) { + llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file; + file.seek(lt.shards.at(0).file_off, SEEK_SET); + file.read_raw(lt.data, lt.size); + } else if (lt.split_type == SPLIT_BY_ROWS) { + size_t offset = 0; + for (llama_load_tensor_shard & shard : lt.shards) { + llama_file & file = file_loaders.at(shard.file_idx)->file; + file.seek(shard.file_off, SEEK_SET); + file.read_raw(lt.data + offset, shard.size); + offset += shard.size; + } + LLAMA_ASSERT(offset == lt.size); + } else if (lt.split_type == SPLIT_BY_COLUMNS) { + // Let's load the data into temporary buffers to ensure the OS performs large loads. + std::vector<llama_buffer> tmp_bufs; + tmp_bufs.resize(lt.shards.size()); + for (size_t i = 0; i < lt.shards.size(); i++) { + llama_load_tensor_shard & shard = lt.shards.at(i); + llama_file & file = file_loaders.at(shard.file_idx)->file; + file.seek(shard.file_off, SEEK_SET); + tmp_bufs.at(i).resize(shard.size); + file.read_raw(tmp_bufs.at(i).addr, shard.size); + } + // Then reshape. + size_t num_rows = lt.ne.at(1); + size_t per_shard_row_size = lt.shards.at(0).size / num_rows; + size_t out_offset = 0; + for (size_t row = 0; row < num_rows; row++) { + for (llama_buffer & tmp_buf : tmp_bufs) { + memcpy(lt.data + out_offset, + tmp_buf.addr + row * per_shard_row_size, + per_shard_row_size); + out_offset += per_shard_row_size; + } + } + LLAMA_ASSERT(out_offset == lt.size); + } + if (0) { + print_checksum(lt); + } + } + + static void print_checksum(llama_load_tensor & lt) { + uint32_t sum = 0; + for (size_t i = 0; i < lt.size; i++) { + uint8_t byte = lt.data[i]; + sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash + } + fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum, + llama_format_tensor_shape(lt.ne).c_str(), lt.size); + } + +}; + + // // kv cache // @@ -262,8 +759,8 @@ static bool kv_cache_init( cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); struct ggml_init_params params; - params.mem_size = cache.buf.size(); - params.mem_buffer = cache.buf.data(); + params.mem_size = cache.buf.size; + params.mem_buffer = cache.buf.addr; params.no_alloc = false; cache.ctx = ggml_init(params); @@ -279,13 +776,6 @@ static bool kv_cache_init( return true; } -static void kv_cache_free(struct llama_kv_cache & cache) { - if (cache.ctx) { - ggml_free(cache.ctx); - cache.ctx = nullptr; - } -} - struct llama_context_params llama_context_default_params() { struct llama_context_params result = { /*.n_ctx =*/ 512, @@ -294,6 +784,7 @@ struct llama_context_params llama_context_default_params() { /*.f16_kv =*/ false, /*.logits_all =*/ false, /*.vocab_only =*/ false, + /*.use_mmap =*/ true, /*.use_mlock =*/ false, /*.embedding =*/ false, /*.progress_callback =*/ nullptr, @@ -303,243 +794,71 @@ struct llama_context_params llama_context_default_params() { return result; } -// -// model loading -// - -static void *mmap_file(const char *fname, uint64_t *mm_length) { -#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES) - HANDLE hFile = CreateFileA(fname, - GENERIC_READ, - FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, - NULL, - OPEN_EXISTING, - FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED, - NULL); - if (hFile == INVALID_HANDLE_VALUE) return 0; - LARGE_INTEGER fileSize; - fileSize.QuadPart = -1; - GetFileSizeEx(hFile, &fileSize); - int64_t length = fileSize.QuadPart; - HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); - CloseHandle(hFile); - if (!hMapping) return 0; - void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); - CloseHandle(hMapping); - if (!addr) return 0; -#else - int fd = open(fname, O_RDONLY); - if (fd == -1) return 0; - int64_t length = lseek(fd, 0, SEEK_END); - void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0); - close(fd); - if (addr == MAP_FAILED) return 0; -#endif - *mm_length = length; - return addr; +bool llama_mmap_supported() { + return llama_mmap::SUPPORTED; } -static void munmap_file(void * addr, size_t length) { -#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES) - UnmapViewOfFile(addr); -#else - munmap(addr, length); -#endif +bool llama_mlock_supported() { + return llama_mlock::SUPPORTED; } -static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) { - fprintf(stderr, - "%s: invalid model file (bad magic [got %#x want %#x])\n" - "\tyou most likely need to regenerate your ggml files\n" - "\tthe benefit is you'll get 10-100x faster load times\n" - "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n" - "\tuse convert-pth-to-ggml.py to regenerate from original pth\n" - "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n", - path, got, want); - return false; -} +// +// model loading +// -static bool llama_model_load( +static void llama_model_load_internal( const std::string & fname, llama_context & lctx, int n_ctx, - int n_parts, ggml_type memory_type, + bool use_mmap, + bool use_mlock, bool vocab_only, llama_progress_callback progress_callback, - void *progress_callback_user_data) { - fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); + void * progress_callback_user_data) { lctx.t_start_us = ggml_time_us(); - auto & model = lctx.model; - auto & vocab = lctx.vocab; + std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only)); - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return false; - } - - std::vector<char> f_buf(1024*1024); - fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size()); - - fin.seekg(0, fin.end); - const size_t file_size = fin.tellg(); - fin.seekg(0); + lctx.vocab = std::move(ml->file_loaders.at(0)->vocab); + auto & model = lctx.model; + auto & hparams = model.hparams; + hparams = ml->file_loaders.at(0)->hparams; + uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; - // verify magic { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) { - fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files or convert them with convert-unversioned-ggml-to-ggml.py!)\n", - __func__, fname.c_str()); - return false; - } - if (magic != LLAMA_FILE_MAGIC) { - return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC); + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_13B; break; + case 60: model.type = e_model::MODEL_30B; break; + case 80: model.type = e_model::MODEL_65B; break; } - uint32_t format_version; - fin.read((char *) &format_version, sizeof(format_version)); - - if (format_version != LLAMA_FILE_VERSION) { - fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n", - __func__, fname.c_str(), format_version, LLAMA_FILE_VERSION); - return false; - } - } - - int n_ff = 0; - - // load hparams - { - auto & hparams = model.hparams; - - fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult)); - fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - fin.read((char *) &hparams.f16, sizeof(hparams.f16)); - hparams.n_ctx = n_ctx; - n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; - - if (n_parts < 1) { - n_parts = LLAMA_N_PARTS.at(hparams.n_embd); - } - - // temp warning to tell the user to use "--n_parts" - if (hparams.f16 == 4 && n_parts != 1) { - fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts); - fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__); - } - - if (hparams.n_layer == 32) { - model.type = e_model::MODEL_7B; - } - - if (hparams.n_layer == 40) { - model.type = e_model::MODEL_13B; - } - - if (hparams.n_layer == 60) { - model.type = e_model::MODEL_30B; - } - - if (hparams.n_layer == 80) { - model.type = e_model::MODEL_65B; - } - - fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); - fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx); - fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd); - fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult); - fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head); - fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer); - fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot); - fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16); - fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff); - fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts); - fprintf(stderr, "%s: type = %d\n", __func__, model.type); - } - - // load vocab - { - std::string word; - vocab.id_to_token.resize(model.hparams.n_vocab); - std::vector<char> tmp(64); - - for (int i = 0; i < model.hparams.n_vocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - word.resize(len); - if (len > 0) { - tmp.resize(len); - fin.read(tmp.data(), len); - word.assign(tmp.data(), len); - } else { - word.clear(); - } - - float score; - fin.read((char *) &score, sizeof(score)); - - vocab.token_to_id[word] = i; - - auto &tok_score = vocab.id_to_token[i]; - tok_score.tok = word; - tok_score.score = score; - } + fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); + fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); + fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); + fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); + fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); + fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); + fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); + fprintf(stderr, "%s: f16 = %u\n", __func__, hparams.f16); + fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); + fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size()); + fprintf(stderr, "%s: type = %u\n", __func__, model.type); } if (vocab_only) { - return true; - } - - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - // wtype is for per-layer weights, while vtype is for other weights - ggml_type wtype, vtype; - switch (model.hparams.f16) { - case 0: wtype = vtype = GGML_TYPE_F32; break; - case 1: wtype = vtype = GGML_TYPE_F16; break; - case 2: wtype = vtype = GGML_TYPE_Q4_0; break; - case 3: wtype = vtype = GGML_TYPE_Q4_1; break; - case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break; - default: - { - fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n", - __func__, fname.c_str(), model.hparams.f16); - return false; - } - } - - // map model into memory - char *mm_addr = NULL; - model.mm_addr = mmap_file(fname.c_str(), &model.mm_length); - if (model.mm_addr == NULL) { - fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str()); - return false; + return; } - mm_addr = (char *)model.mm_addr; - fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0)); auto & ctx = model.ctx; - size_t ctx_size = 0; - { - const auto &hparams = model.hparams; - const int n_layer = hparams.n_layer; - ctx_size += (5 + 10*n_layer)*256; // object overhead - fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0); - } + size_t ctx_size, mmapped_size; + ml->calc_sizes(&ctx_size, &mmapped_size); + fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0); // print memory requirements { @@ -548,7 +867,7 @@ static bool llama_model_load( // this is the total memory required to run the inference const size_t mem_required = ctx_size + - model.mm_length + + mmapped_size + MEM_REQ_SCRATCH0.at(model.type) + MEM_REQ_SCRATCH1.at(model.type) + MEM_REQ_EVAL.at (model.type); @@ -564,17 +883,20 @@ static bool llama_model_load( // create the ggml context { lctx.model.buf.resize(ctx_size); + if (use_mlock) { + lctx.model.mlock_buf.init(lctx.model.buf.addr); + lctx.model.mlock_buf.grow_to(lctx.model.buf.size); + } struct ggml_init_params params = { - /*.mem_size =*/ lctx.model.buf.size(), - /*.mem_buffer =*/ lctx.model.buf.data(), - /*.no_alloc =*/ true, + /*.mem_size =*/ lctx.model.buf.size, + /*.mem_buffer =*/ lctx.model.buf.addr, + /*.no_alloc =*/ ml->use_mmap, }; model.ctx = ggml_init(params); if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return false; + throw format("ggml_init() failed"); } } @@ -582,161 +904,71 @@ static bool llama_model_load( { const auto & hparams = model.hparams; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_vocab = hparams.n_vocab; - - model.layers.resize(n_layer); - - model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab); - - model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - model.output = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab); + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_vocab = hparams.n_vocab; - // map by name - model.tensors["tok_embeddings.weight"] = model.tok_embeddings; + ml->ggml_ctx = ctx; - model.tensors["norm.weight"] = model.norm; - model.tensors["output.weight"] = model.output; + model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}); + model.norm = ml->get_tensor("norm.weight", {n_embd}); + model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}); - for (int i = 0; i < n_layer; ++i) { + model.layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model.layers[i]; - layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + std::string layers_i = "layers." + std::to_string(i); - layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}); - layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}); + layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}); + layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}); + layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}); - layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff); - layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd); - layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff); + layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}); - // map by name - model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm; - - model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq; - model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk; - model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv; - model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo; - - model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3; + layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}); + layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}); + layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}); } } - std::vector<uint8_t> tmp; + ml->done_getting_tensors(); - if (progress_callback) { - progress_callback(0.0, progress_callback_user_data); + // populate `tensors_by_name` + for (llama_load_tensor & lt : ml->tensors_map.tensors) { + model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); } - fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str()); - - // load weights - { - size_t total_size = 0; - model.n_loaded = 0; - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ftype; + ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL); - fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast<char *>(&length), sizeof(length)); - fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype)); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name.data()) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); - return false; - } - - auto tensor = model.tensors[name.data()]; - - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); - return false; - } - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n", - __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); - return false; - } - if (0) { - static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; - fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]); - } - - switch (ftype) { - case 0: // f32 - case 1: // f16 - break; - case 2: // q4_0 - case 3: // q4_1 - assert(ne[0] % 64 == 0); - break; - default: - fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype); - return false; - }; - - // load the tensor data into memory without copying or reading it - size_t offset = fin.tellg(); - size_t tensor_data_size = ggml_nbytes(tensor); - offset = (offset + 31) & -32; - tensor->data = mm_addr + offset; - fin.seekg(offset + tensor_data_size); - total_size += tensor_data_size; - model.n_loaded++; - - // progress - if (progress_callback) { - double current_progress = size_t(fin.tellg()) / double(file_size); - progress_callback(current_progress, progress_callback_user_data); - } - } - - fin.close(); - - fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded); - if (model.n_loaded == 0) { - fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__); - } else if (model.n_loaded != (int) model.tensors.size()) { - fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded); - return false; - } - } + model.mapping = std::move(ml->mapping); // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration lctx.t_load_us = ggml_time_us() - lctx.t_start_us; +} - if (progress_callback) { - progress_callback(1.0, progress_callback_user_data); +static bool llama_model_load( + const std::string & fname, + llama_context & lctx, + int n_ctx, + ggml_type memory_type, + bool use_mmap, + bool use_mlock, + bool vocab_only, + llama_progress_callback progress_callback, + void *progress_callback_user_data) { + try { + llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock, + vocab_only, progress_callback, progress_callback_user_data); + return true; + } catch (const std::string & err) { + fprintf(stderr, "error loading model: %s\n", err.c_str()); + return false; } - - return true; } // evaluate the transformer @@ -774,8 +1006,8 @@ static bool llama_eval_internal( auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size(), - /*.mem_buffer =*/ buf_compute.data(), + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.addr, /*.no_alloc =*/ false, }; @@ -1061,7 +1293,7 @@ struct llama_tokenizer { size_t offs = 0; while (offs < text.size()) { llama_sp_symbol sym; - size_t char_len = Min(text.size() - offs, utf8_len(text[offs])); + size_t char_len = std::min(text.size() - offs, utf8_len(text[offs])); sym.text = text.c_str() + offs; sym.n = char_len; offs += char_len; @@ -1236,7 +1468,7 @@ static llama_vocab::id llama_sample_top_p_top_k( } } - sample_top_k(logits_id, top_k > 0 ? Min(top_k, n_logits) : n_logits); + sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits); // compute probs for the top k tokens std::vector<float> probs; @@ -1284,298 +1516,118 @@ static llama_vocab::id llama_sample_top_p_top_k( // quantization // -// TODO: reuse code from the llama_model_load() somehow -static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) { - ggml_type type = GGML_TYPE_Q4_1; - +static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) { + ggml_type quantized_type; switch (itype) { - case 2: type = GGML_TYPE_Q4_0; break; - case 3: type = GGML_TYPE_Q4_1; break; - default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1; + case 2: quantized_type = GGML_TYPE_Q4_0; break; + case 3: quantized_type = GGML_TYPE_Q4_1; break; + default: throw format("invalid quantization type %d\n", itype); }; - if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) { - fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type); - return false; - } - - llama_vocab vocab; - - printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); - - auto finp = std::ifstream(fname_inp, std::ios::binary); - if (!finp) { - fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str()); - return false; - } - - auto fout = std::ofstream(fname_out, std::ios::binary); - if (!fout) { - fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - finp.read((char *) &magic, sizeof(magic)); - if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) { - fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n", - __func__, fname_inp.c_str()); - return false; - } - if (magic != LLAMA_FILE_MAGIC) { - return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC); - } - - fout.write((char *) &magic, sizeof(magic)); - - uint32_t format_version; - finp.read((char *) &format_version, sizeof(format_version)); - - if (format_version != LLAMA_FILE_VERSION) { - fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n", - __func__, fname_inp.c_str(), format_version, LLAMA_FILE_VERSION); - return false; - } - - fout.write((char *) &format_version, sizeof(format_version)); - } - - llama_hparams hparams; - - // load hparams - { - finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - //finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult)); - finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - finp.read((char *) &hparams.f16, sizeof(hparams.f16)); - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_mult = %d\n", __func__, hparams.n_mult); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: f16 = %d\n", __func__, hparams.f16); - - fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - //fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult)); - fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); - fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - fout.write((char *) &itype, sizeof(hparams.f16)); - } - - // load vocab - { - const int32_t n_vocab = hparams.n_vocab; - - if (n_vocab != hparams.n_vocab) { - fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", - __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab); - return false; - } - - std::vector<char> word(32); - vocab.id_to_token.resize(n_vocab); - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - finp.read ((char *) &len, sizeof(len)); - fout.write((char *) &len, sizeof(len)); - - word.resize(len); - finp.read ((char *) &word[0], len); - fout.write((char *) &word[0], len); - - float score; - finp.read ((char *) &score, sizeof(score)); - fout.write((char *) &score, sizeof(score)); - - vocab.token_to_id[word.data()] = i; - - auto &tok_score = vocab.id_to_token[i]; - tok_score.tok = word.data(); - tok_score.score = score; - } - } - - // load weights - { - size_t total_size_org = 0; - size_t total_size_new = 0; - - std::vector<float> work; - - std::vector<uint8_t> data_u8; - std::vector<ggml_fp16_t> data_f16; - std::vector<float> data_f32; - - std::vector<int64_t> hist_all(1 << 4, 0); - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ftype; - - finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims)); - finp.read(reinterpret_cast<char *>(&length), sizeof(length)); - finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype)); - - if (finp.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - } - - std::string name(length, 0); - finp.read (&name[0], length); - - { - // ensure tensor data is aligned - uint64_t offset = finp.tellg(); - offset = (offset + 31) & -32; - finp.seekg(offset); - } - - { - static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; - printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]); - } - - // regexes of tensor names to be quantized - const std::vector<std::string> k_names = { - ".*weight", - }; - - bool quantize = false; - for (const auto & s : k_names) { - if (std::regex_match(name, std::regex(s))) { - quantize = true; - break; - } - } - - // quantize only 2D tensors - quantize &= (n_dims == 2); - - if (quantize) { - if (ftype != 0 && ftype != 1) { - fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype); - return false; - } - - if (ftype == 1) { - data_f16.resize(nelements); - finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t)); - data_f32.resize(nelements); - for (int i = 0; i < nelements; ++i) { - data_f32[i] = ggml_fp16_to_fp32(data_f16[i]); - } - } else { - data_f32.resize(nelements); - finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float)); + std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false, + /*vocab_only*/ false)); + llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), (uint32_t) itype); + + size_t total_size_org = 0; + size_t total_size_new = 0; + std::vector<int64_t> hist_all(1 << 4, 0); + + size_t idx = 0; + for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) { + llama_buffer read_data; + read_data.resize(tensor.size); + tensor.data = read_data.addr; + model_loader->load_data_for(tensor); + + printf("[%zu/%zu] %36s - %s, type = %6s, ", + ++idx, model_loader->tensors_map.tensors.size(), + tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(), + llama_format_type(tensor.type)); + + // This used to be a regex, but <regex> has an extreme cost to compile times. + bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'? + + // quantize only 2D tensors + quantize &= (tensor.ne.size() == 2); + + enum ggml_type new_type; + void * new_data; + size_t new_size; + llama_buffer work; + + if (!quantize) { + new_type = tensor.type; + new_data = tensor.data; + new_size = tensor.size; + printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0); + } else { + new_type = quantized_type; + float * f32_data; + size_t nelements = tensor.ne.at(0) * tensor.ne.at(1); + llama_buffer f32_conv_buf; + if (tensor.type == GGML_TYPE_F32) { + f32_data = (float *) tensor.data; + } else if (tensor.type == GGML_TYPE_F16) { + f32_conv_buf.resize(nelements * sizeof(float)); + f32_data = (float *) f32_conv_buf.addr; + auto f16_data = (const ggml_fp16_t *) tensor.data; + for (size_t i = 0; i < nelements; i++) { + f32_data[i] = ggml_fp16_to_fp32(f16_data[i]); } - - ftype = itype; } else { - const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t); - - data_u8.resize(nelements*bpe); - finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe); + throw format("type %s unsupported for integer quantization", llama_format_type(tensor.type)); } - fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims)); - fout.write(reinterpret_cast<char *>(&length), sizeof(length)); - fout.write(reinterpret_cast<char *>(&ftype), sizeof(ftype)); - for (int i = 0; i < n_dims; ++i) { - fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i])); + printf("quantizing .. "); + fflush(stdout); + + work.resize(nelements * 4); // upper bound on size + new_data = work.addr; + std::vector<int64_t> hist_cur(1 << 4, 0); + + switch (new_type) { + case GGML_TYPE_Q4_0: + { + new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); + } break; + case GGML_TYPE_Q4_1: + { + new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); + } break; + default: + LLAMA_ASSERT(false); } - fout.write(&name[0], length); - { - // ensure tensor data is aligned - uint64_t offset = fout.tellp(); - offset = (offset + 31) & -32; - fout.seekp(offset); + printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0); + for (size_t i = 0; i < hist_cur.size(); i++) { + hist_all[i] += hist_cur[i]; } - if (quantize) { - printf("quantizing .. "); - work.resize(nelements); // for quantization - - size_t cur_size = 0; - std::vector<int64_t> hist_cur(1 << 4, 0); - - switch (type) { - case GGML_TYPE_Q4_0: - { - cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q4_1: - { - cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - default: - { - fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type); - return false; - } - } - - fout.write(reinterpret_cast<char *>(work.data()), cur_size); - total_size_new += cur_size; - - printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0); - for (int i = 0; i < (int) hist_cur.size(); ++i) { - hist_all[i] += hist_cur[i]; - } - - for (int i = 0; i < (int) hist_cur.size(); ++i) { - printf("%5.3f ", hist_cur[i] / float(nelements)); - } - printf("\n"); - } else { - printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0); - fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size()); - total_size_new += data_u8.size(); + for (size_t i = 0; i < hist_cur.size(); i++) { + printf("%5.3f ", hist_cur[i] / float(nelements)); } - - total_size_org += nelements * sizeof(float); + printf("\n"); } + total_size_org += tensor.size; + total_size_new += new_size; + file_saver.write_tensor(tensor, new_type, new_data, new_size); + } - printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); - printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); + printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); + printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); - { - int64_t sum_all = 0; - for (int i = 0; i < (int) hist_all.size(); ++i) { - sum_all += hist_all[i]; - } + { + int64_t sum_all = 0; + for (size_t i = 0; i < hist_all.size(); i++) { + sum_all += hist_all[i]; + } - printf("%s: hist: ", __func__); - for (int i = 0; i < (int) hist_all.size(); ++i) { - printf("%5.3f ", hist_all[i] / float(sum_all)); - } - printf("\n"); + printf("%s: hist: ", __func__); + for (size_t i = 0; i < hist_all.size(); i++) { + printf("%5.3f ", hist_all[i] / float(sum_all)); } + printf("\n"); } - - finp.close(); - fout.close(); - - return true; } // @@ -1593,32 +1645,36 @@ struct llama_context * llama_init_from_file( params.seed = time(NULL); } + unsigned cur_percentage = 0; + if (params.progress_callback == NULL) { + params.progress_callback_user_data = &cur_percentage; + params.progress_callback = [](float progress, void * ctx) { + unsigned * cur_percentage_p = (unsigned *) ctx; + unsigned percentage = (unsigned) (100 * progress); + while (percentage > *cur_percentage_p) { + ++*cur_percentage_p; + fprintf(stderr, "."); + fflush(stderr); + if (percentage >= 100) { + fprintf(stderr, "\n"); + } + } + }; + } + ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, memory_type, - params.vocab_only, params.progress_callback, - params.progress_callback_user_data)) { + if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type, + params.use_mmap, params.use_mlock, params.vocab_only, + params.progress_callback, params.progress_callback_user_data)) { fprintf(stderr, "%s: failed to load model\n", __func__); llama_free(ctx); return nullptr; } - if (params.use_mlock) { - char *err; - if (!ggml_mlock(ctx->model.ctx, - ctx->model.mm_addr, - ctx->model.mm_length, - &err)) { - fprintf(stderr, "%s\n", err); - free(err); - llama_free(ctx); - return nullptr; - } - } - // reserve memory for context buffers if (!params.vocab_only) { if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) { @@ -1655,16 +1711,6 @@ struct llama_context * llama_init_from_file( } void llama_free(struct llama_context * ctx) { - kv_cache_free(ctx->model.kv_self); - - if (ctx->model.ctx) { - ggml_free(ctx->model.ctx); - } - - if (ctx->model.mm_addr) { - munmap_file(ctx->model.mm_addr, ctx->model.mm_length); - } - delete ctx; } @@ -1672,23 +1718,24 @@ int llama_model_quantize( const char * fname_inp, const char * fname_out, int itype) { - if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) { - fprintf(stderr, "%s: failed to quantize\n", __func__); + try { + llama_model_quantize_internal(fname_inp, fname_out, itype); + return 0; + } catch (const std::string & err) { + fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str()); return 1; } - - return 0; } // Returns the KV cache that will contain the context for the // ongoing prediction with the model. const uint8_t * llama_get_kv_cache(struct llama_context * ctx) { - return ctx->model.kv_self.buf.data(); + return ctx->model.kv_self.buf.addr; } // Returns the size of the KV cache size_t llama_get_kv_cache_size(struct llama_context * ctx) { - return ctx->model.kv_self.buf.size(); + return ctx->model.kv_self.buf.size; } int llama_get_kv_cache_token_count(struct llama_context * ctx) { @@ -1702,8 +1749,8 @@ void llama_set_kv_cache( size_t n_size, int n_token_count) { // Make sure we have the same kv cache setup - LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size); - memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size); + LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size); + memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size); ctx->model.kv_self.n = n_token_count; } @@ -1814,9 +1861,9 @@ llama_token llama_sample_top_p_top_k( void llama_print_timings(struct llama_context * ctx) { const int64_t t_end_us = ggml_time_us(); - const int32_t n_sample = Max(1, ctx->n_sample); - const int32_t n_eval = Max(1, ctx->n_eval); - const int32_t n_p_eval = Max(1, ctx->n_p_eval); + const int32_t n_sample = std::max(1, ctx->n_sample); + const int32_t n_eval = std::max(1, ctx->n_eval); + const int32_t n_p_eval = std::max(1, ctx->n_p_eval); fprintf(stderr, "\n"); fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0); @@ -1854,6 +1901,6 @@ const char * llama_print_system_info(void) { } // For internal test use -std::unordered_map<std::string, struct ggml_tensor *>& llama_internal_get_tensor_map(struct llama_context * ctx) { - return ctx->model.tensors; +std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) { + return ctx->model.tensors_by_name; } |