| author | Cebtenzzre <cebtenzzre@gmail.com> | 2023-09-29 09:48:45 -0400 |
| --- | --- | --- |
| committer | GitHub <noreply@github.com> | 2023-09-29 16:48:45 +0300 |
| commit | 2777a84be429401a2b7d33c2b6a4ada1f0776f1b (patch) | |
| tree | 27d6f6a382d4987584f6c6ad2648a901ea00b144 /llama.cpp | |
| parent | 0a4a4a098261ddd26480371eaccfe90d1bf6488a (diff) | |
llama : quantize up to 31% faster on Linux and Windows with mmap (#3206)
* llama : enable mmap in quantize on Linux -> 31% faster
* also enable mmap on Windows
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'llama.cpp')
| -rw-r--r-- | llama.cpp | 21 |
1 file changed, 17 insertions(+), 4 deletions(-)
```diff
@@ -6027,7 +6027,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    llama_model_loader ml(fname_inp, /*use_mmap*/ false);
+    // mmap consistently increases speed on Linux, and also increases speed on Windows with
+    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
+#if defined(__linux__) || defined(_WIN32)
+    constexpr bool use_mmap = true;
+#else
+    constexpr bool use_mmap = false;
+#endif
+
+    llama_model_loader ml(fname_inp, use_mmap);
+    if (ml.use_mmap) {
+        ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
+    }
 
     llama_model model;
     llm_load_arch(ml, model);
@@ -6105,10 +6116,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         const std::string name = ggml_get_name(tensor);
 
-        if (read_data.size() < ggml_nbytes(tensor)) {
-            read_data.resize(ggml_nbytes(tensor));
+        if (!ml.use_mmap) {
+            if (read_data.size() < ggml_nbytes(tensor)) {
+                read_data.resize(ggml_nbytes(tensor));
+            }
+            tensor->data = read_data.data();
         }
-        tensor->data = read_data.data();
 
         ml.load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
```
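What the patch changes, in short: with mmap enabled, `ml.load_data_for(tensor)` can point `tensor->data` straight at the mapped file, so the `read_data` staging buffer (and one copy per tensor) is only needed on the non-mmap path. Below is a minimal standalone sketch contrasting those two data paths, assuming a POSIX host (llama.cpp's `llama_mmap` has a separate Windows implementation); `mapped_file` and `load_by_read` are hypothetical helpers for illustration, not llama.cpp's loader API.

```cpp
// quantize_paths.cpp - contrast the two tensor-data paths from the patch:
// (a) mmap: the data pointer aliases the page cache, no copy up front;
// (b) read: bytes are copied into a reusable heap buffer (the read_data role).
// Hypothetical helpers for illustration only; not llama.cpp's loader API.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <stdexcept>
#include <vector>

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

// Map a whole file read-only. Pages are faulted in lazily on first access,
// which is why the speedup is largest when the file is already in the OS
// page cache.
struct mapped_file {
    void * addr = nullptr;
    size_t size = 0;

    explicit mapped_file(const char * path) {
        int fd = open(path, O_RDONLY);
        if (fd < 0) { throw std::runtime_error("open failed"); }
        struct stat st{};
        if (fstat(fd, &st) != 0) { close(fd); throw std::runtime_error("fstat failed"); }
        size = (size_t) st.st_size;
        if (size == 0) { close(fd); throw std::runtime_error("empty file"); }
        addr = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
        close(fd); // the mapping keeps the pages reachable
        if (addr == MAP_FAILED) { throw std::runtime_error("mmap failed"); }
    }
    ~mapped_file() { if (addr && addr != MAP_FAILED) { munmap(addr, size); } }

    // mmap path: "loading" a tensor is pointer arithmetic, no copy.
    const uint8_t * tensor_data(size_t offset) const {
        return (const uint8_t *) addr + offset;
    }
};

// read path: grow a shared scratch buffer if needed, then copy the bytes in -
// the role read_data plays in llama_model_quantize_internal.
const uint8_t * load_by_read(FILE * f, size_t offset, size_t nbytes, std::vector<uint8_t> & read_data) {
    if (read_data.size() < nbytes) {
        read_data.resize(nbytes);
    }
    if (fseek(f, (long) offset, SEEK_SET) != 0 || fread(read_data.data(), 1, nbytes, f) != nbytes) {
        throw std::runtime_error("read failed");
    }
    return read_data.data();
}

int main(int argc, char ** argv) {
    if (argc < 2) { fprintf(stderr, "usage: %s <file>\n", argv[0]); return 1; }

    mapped_file mf(argv[1]);
    FILE * f = fopen(argv[1], "rb");
    if (!f) { perror("fopen"); return 1; }

    // Both paths yield the same bytes; only the copying differs.
    std::vector<uint8_t> read_data;
    const size_t n = mf.size < 16 ? mf.size : 16;
    const uint8_t * a = mf.tensor_data(0);
    const uint8_t * b = load_by_read(f, 0, n, read_data);
    printf("mmap[0]=0x%02x read[0]=0x%02x (%s)\n", a[0], b[0], memcmp(a, b, n) == 0 ? "match" : "MISMATCH");

    fclose(f);
    return 0;
}
```

Build and run with e.g. `g++ -std=c++11 -O2 quantize_paths.cpp -o quantize_paths && ./quantize_paths model.gguf`. One detail worth noting in the patch itself: it passes `/* prefetch */ 0` to `llama_mmap`, presumably skipping any explicit read-ahead hint; quantization walks the tensors once in file order, an access pattern the OS's default read-ahead already handles, which also matches the commit message's observation that the win depends on a hot cache.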