author    | slaren <slarengh@gmail.com>              | 2024-04-03 15:07:05 +0200
committer | GitHub <noreply@github.com>               | 2024-04-03 16:07:05 +0300
commit    | 08a0c0206075556e82aca0feafad530dcc5f1426 (patch)
tree      | 3937cd263076c548ba25348253dcec6d355b8fef | /examples/quantize/quantize.cpp
parent    | 52604860f93063ef98863921da697576af1c7665 (diff)
ggml : mul_mat_id use the same tensor for all the experts (#6387)
* ggml : update mul_mat_id to use the same tensor for all the experts
* update cuda
* minor
* update metal
* update test-backend-ops
* fix cuda
* Update ggml-metal.m
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* update convert.py
* update convert-hf-to-gguf.py
* update convert.py for mixtral hf models
* Update convert-hf-to-gguf.py
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* cuda : support non-pow-2 number of experts
* allow quantize to work for split and merged experts models in the same way
* cleanup + disable mmap automatically with split tensors models
* update imatrix
* test-backend-ops : test qwen argsort
* update grok model loading
* llama : add merged experts tensors to the grok tensor map
* minor
* gguf : bump version
* fix quantizing of merged experts
* convert-hf-to-gguf.py : update grok (untested)
* make linter happy
* cuda/argsort : use shared memory instead of pool memory
* convert : fix grok tensor names
* metal : add support for non-pow-2 argsort
* llama : more loader cleanup, better error checking
* cuda : fix warning
* llama : still use mmap for loading old models, but copy the data to a host buffer
* add review note
* llama : remove ffn tensor counting + add sanity check
ggml-ci
* convert : fix handling of n_experts == None
ggml-ci
* imatrix : fix ncall counters
* llama : produce error if imatrix size does not match
* quantize : terminate on errors + trace logs
ggml-ci
* metal : pad shared memory to 16 bytes
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
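The change summarized above stores the FFN weights of all experts of a MoE model in a single 3D tensor (one 2D slice per expert) instead of one tensor per expert, which is what lets quantize, the imatrix tooling, and the backends treat split and merged expert models uniformly. The sketch below is only an illustration of that layout: the names mirror llama.cpp conventions (n_embd, n_ff, n_expert), but the struct and its expert() helper are hypothetical and are not the actual ggml_mul_mat_id API.

#include <cstddef>
#include <vector>

// Illustration only: a "merged experts" weight blob, i.e. the weights of all
// experts stored contiguously in one tensor of shape [n_expert][n_ff][n_embd]
// instead of n_expert separate [n_ff][n_embd] matrices.
struct merged_expert_weights {
    size_t n_embd;            // columns of each expert matrix
    size_t n_ff;              // rows of each expert matrix
    size_t n_expert;          // number of experts sharing the tensor
    std::vector<float> data;  // n_expert * n_ff * n_embd values

    // 2D slice belonging to one expert; a mul_mat_id-style op selects this
    // slice per token from the expert ids tensor, so every expert shares a
    // single backing tensor (and a single entry in the model file).
    const float * expert(size_t e) const {
        return data.data() + e * n_ff * n_embd;
    }
};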
Diffstat (limited to 'examples/quantize/quantize.cpp')
-rw-r--r-- | examples/quantize/quantize.cpp | 16
1 file changed, 10 insertions, 6 deletions
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 80c493f1..64cb6db1 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -116,13 +116,13 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
     std::ifstream in(imatrix_file.c_str(), std::ios::binary);
     if (!in) {
         printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
-        return;
+        exit(1);
     }
     int n_entries;
     in.read((char *)&n_entries, sizeof(n_entries));
     if (in.fail() || n_entries < 1) {
         printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
-        return;
+        exit(1);
     }
     for (int i = 0; i < n_entries; ++i) {
         int len; in.read((char *)&len, sizeof(len));
@@ -130,11 +130,11 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
         in.read((char *)name_as_vec.data(), len);
         if (in.fail()) {
             printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
-            return;
+            exit(1);
         }
         name_as_vec[len] = 0;
         std::string name{name_as_vec.data()};
-        auto & e = imatrix_data[std::move(name)];
+        auto & e = imatrix_data[name];
         int ncall;
         in.read((char *)&ncall, sizeof(ncall));
         int nval;
@@ -142,18 +142,22 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
         if (in.fail() || nval < 1) {
             printf("%s: failed reading number of values for entry %d\n", __func__, i);
             imatrix_data = {};
-            return;
+            exit(1);
         }
         e.resize(nval);
         in.read((char *)e.data(), nval*sizeof(float));
         if (in.fail()) {
             printf("%s: failed reading data for entry %d\n", __func__, i);
             imatrix_data = {};
-            return;
+            exit(1);
         }
         if (ncall > 0) {
             for (auto& v : e) v /= ncall;
         }
+
+        if (getenv("LLAMA_TRACE")) {
+            printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
+        }
     }
     printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
 }
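For reference when reading the hunks above: load_imatrix() parses a flat binary layout, an int entry count followed, per entry, by a length-prefixed tensor name, an int call counter (ncall), an int value count (nval), and nval floats; the stored values are accumulated sums that the loader divides by ncall. With this commit, a truncated or malformed file makes quantize exit(1) instead of silently continuing, and setting the LLAMA_TRACE environment variable logs each loaded entry. The writer below is a minimal sketch that only mirrors those read calls; it is not the actual writer in examples/imatrix.

#include <fstream>
#include <string>
#include <utility>
#include <vector>

// Sketch of the on-disk layout that load_imatrix() above expects.
// Field widths follow the reader, which uses plain int for the counters.
static void write_imatrix(const std::string & path,
                          const std::vector<std::pair<std::string, std::vector<float>>> & entries,
                          int ncall) {
    std::ofstream out(path.c_str(), std::ios::binary);
    int n_entries = (int) entries.size();
    out.write((const char *) &n_entries, sizeof(n_entries));
    for (const auto & kv : entries) {
        int len  = (int) kv.first.size();
        int nval = (int) kv.second.size();
        out.write((const char *) &len, sizeof(len));                       // name length
        out.write(kv.first.data(), len);                                   // name bytes (no terminator)
        out.write((const char *) &ncall, sizeof(ncall));                   // accumulation count
        out.write((const char *) &nval, sizeof(nval));                     // number of float values
        out.write((const char *) kv.second.data(), nval * sizeof(float));  // accumulated sums
    }
}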