Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  67
1 file changed, 45 insertions, 22 deletions
@@ -24,6 +24,9 @@
 #include <memory>
 #include <algorithm>
 #include <initializer_list>
+#include <thread>
+#include <atomic>
+#include <mutex>
 
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -1572,7 +1575,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
 // quantization
 //
 
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
     ggml_type quantized_type;
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
@@ -1582,6 +1585,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         default: throw format("invalid output file type %d\n", ftype);
     };
 
+    if (nthread <= 0) {
+        nthread = std::thread::hardware_concurrency();
+    }
+
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
@@ -1590,6 +1597,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);
 
+    std::vector<std::thread> workers;
+    std::mutex mutex;
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
@@ -1643,25 +1653,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = work.addr;
             std::vector<int64_t> hist_cur(1 << 4, 0);
 
-            switch (new_type) {
-                case GGML_TYPE_Q4_0:
-                    {
-                        new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                case GGML_TYPE_Q4_1:
-                    {
-                        new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                case GGML_TYPE_Q4_2:
-                    {
-                        new_size = ggml_quantize_q4_2(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                case GGML_TYPE_Q4_3:
-                    {
-                        new_size = ggml_quantize_q4_3(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                default:
-                    LLAMA_ASSERT(false);
+            int chunk_size = 32 * 512;
+            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
+            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+            if (nthread_use < 2) {
+                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+            } else {
+                size_t counter = 0;
+                new_size = 0;
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                    std::vector<int64_t> local_hist;
+                    size_t local_size = 0;
+                    while (true) {
+                        std::unique_lock<std::mutex> lock(mutex);
+                        size_t first = counter; counter += chunk_size;
+                        if (first >= nelements) {
+                            if (!local_hist.empty()) {
+                                for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                                new_size += local_size;
+                            }
+                            break;
+                        }
+                        lock.unlock();
+                        size_t last = std::min(nelements, first + chunk_size);
+                        if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                    }
+                };
+                if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+                compute();
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
             }
 
             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -1783,9 +1805,10 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        enum llama_ftype ftype) {
+        enum llama_ftype ftype,
+        int nthread) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, ftype);
+        llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
         return 0;
     } catch (const std::string & err) {
         fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
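
For reference, the parallelization introduced in the hunk above is a simple shared-counter work queue: each worker repeatedly claims the next chunk index under a mutex, quantizes that chunk without holding the lock, keeps its histogram and byte count in thread-local storage, and merges them back only once the counter runs past the end, so the mutex is held only for the counter bump and the final merge. The following is a minimal standalone sketch of that same pattern, not llama.cpp code; process_chunk() and the element count are hypothetical stand-ins for ggml_quantize_chunk() and the tensor size.

// Minimal sketch of the shared-counter worker pattern used for
// multi-threaded quantization. process_chunk() is a hypothetical
// stand-in for ggml_quantize_chunk().
#include <algorithm>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

static size_t process_chunk(const std::vector<float> & data, size_t first, size_t n) {
    // Placeholder "work": pretend each element quantizes to one byte.
    (void) data; (void) first;
    return n;
}

int main() {
    const size_t nelements  = 1000000;
    const size_t chunk_size = 32 * 512;

    int nthread = (int) std::thread::hardware_concurrency();
    if (nthread < 1) nthread = 1;

    std::vector<float> data(nelements, 1.0f);

    std::mutex mutex;
    size_t counter  = 0;   // next element index to hand out, guarded by mutex
    size_t new_size = 0;   // total output size, guarded by mutex

    auto compute = [&]() {
        size_t local_size = 0;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            size_t first = counter;
            counter += chunk_size;
            if (first >= nelements) {
                // No more chunks: merge this worker's result while still locked.
                new_size += local_size;
                break;
            }
            lock.unlock();   // do the actual work without holding the lock
            size_t last = std::min(nelements, first + chunk_size);
            local_size += process_chunk(data, first, last - first);
        }
    };

    std::vector<std::thread> workers;
    for (int i = 0; i < nthread - 1; ++i) workers.emplace_back(compute);
    compute();               // the calling thread participates as well
    for (auto & w : workers) w.join();

    printf("processed %zu elements -> %zu bytes\n", nelements, new_size);
    return 0;
}

Because chunks are handed out dynamically rather than pre-partitioned, threads that finish early simply pull more work, which keeps them busy even when chunks take unequal time; this mirrors why the diff spawns nthread_use - 1 workers and lets the calling thread run compute() as well.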