diff options
author | Pierrick Hymbert <pierrick.hymbert@gmail.com> | 2024-04-26 20:06:33 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-26 20:06:33 +0200 |
commit | 0c4d489e29e53589bf13a801fe7c94b7b546d8f6 (patch) | |
tree | fc83fade919050b3a9471dd892d8aef438c39aaf /examples | |
parent | 017e6999b5184234370b22a2f868e1be911e8d88 (diff) |
quantize: add imatrix and dataset metadata in GGUF (#6658)
* imatrix: save the dataset file used in the output file
* llama: support kv overrides of type string
* common: factorize KV Overrides parsing between common and server
* quantize: add imatrix n entries and dataset KV metadata
quantize: factorize KV Overrides parsing between common and quantize
#6656
* llama: remove kv override str_value initialization as it does not compile on some toolchains
* quantize: add imatrix m_last_call as `quantize.imatrix.chunks_count`
* quantize: add imatrix filename in KV
* llama: add llama_model_kv_override_free
* common: add llama_model_kv_override_free
common: free kv override if used after model loading
* llama: finally move the string KV override value to the stack
* llama : minor
* no need to add a NUL to the std::vector, std::string can be initialized from a pair of iterators.
Co-authored-by: slaren <slarengh@gmail.com>
* kv override: ensure string termination
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
Diffstat (limited to 'examples')
-rw-r--r-- | examples/imatrix/imatrix.cpp | 77 | ||||
-rw-r--r-- | examples/quantize/CMakeLists.txt | 2 | ||||
-rw-r--r-- | examples/quantize/quantize.cpp | 103 | ||||
-rw-r--r-- | examples/server/server.cpp | 36 |
4 files changed, 106 insertions, 112 deletions
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 98c0e93e..71e7a727 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -23,6 +23,7 @@ struct Stats { }; struct StatParams { + std::string dataset; std::string ofile = "imatrix.dat"; int n_output_frequency = 10; int verbosity = 1; @@ -46,7 +47,7 @@ private: std::vector<float> m_src1_data; std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id // - void save_imatrix(const char * file_name) const; + void save_imatrix(const char * file_name, const char * dataset) const; void keep_imatrix(int ncall) const; }; @@ -199,7 +200,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } void IMatrixCollector::save_imatrix() const { - save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str()); + save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str()); } void IMatrixCollector::keep_imatrix(int ncall) const { @@ -207,24 +208,33 @@ void IMatrixCollector::keep_imatrix(int ncall) const { if (file_name.empty()) file_name = "imatrix.dat"; file_name += ".at_"; file_name += std::to_string(ncall); - save_imatrix(file_name.c_str()); + save_imatrix(file_name.c_str(), m_params.dataset.c_str()); } -void IMatrixCollector::save_imatrix(const char * fname) const { +void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const { std::ofstream out(fname, std::ios::binary); int n_entries = m_stats.size(); - out.write((const char*)&n_entries, sizeof(n_entries)); - for (auto& p : m_stats) { + out.write((const char *) &n_entries, sizeof(n_entries)); + for (const auto & p : m_stats) { int len = p.first.size(); - out.write((const char*)&len, sizeof(len)); + out.write((const char *) &len, sizeof(len)); out.write(p.first.c_str(), len); - out.write((const char*)&p.second.ncall, sizeof(p.second.ncall)); + out.write((const char *) &p.second.ncall, sizeof(p.second.ncall)); int 
nval = p.second.values.size(); - out.write((const char*)&nval, sizeof(nval)); - if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float)); + out.write((const char *) &nval, sizeof(nval)); + if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float)); } + + // Write the number of call the matrix was computed with + out.write((const char *) &m_last_call, sizeof(m_last_call)); + + // Write the dataset name at the end of the file to later on specify it in quantize + int n_dataset = strlen(dataset); + out.write((const char *) &n_dataset, sizeof(n_dataset)); + out.write(dataset, n_dataset); + if (m_params.verbosity > 0) { - fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname); + fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname); } } @@ -547,6 +557,29 @@ int main(int argc, char ** argv) { } } + gpt_params params; + params.n_batch = 512; + if (!gpt_params_parse(args.size(), args.data(), params)) { + return 1; + } + + params.logits_all = true; + params.n_batch = std::min(params.n_batch, params.n_ctx); + + print_build_info(); + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + sparams.dataset = params.prompt_file; g_collector.set_parameters(std::move(sparams)); if (!combine_files.empty()) { @@ -585,28 +618,6 @@ int main(int argc, char ** argv) { } } - gpt_params params; - params.n_batch = 512; - if (!gpt_params_parse(args.size(), args.data(), params)) { - return 1; - } - - params.logits_all = true; - params.n_batch = std::min(params.n_batch, params.n_ctx); - - print_build_info(); - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - - fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); - - std::mt19937 
rng(params.seed); - if (params.random_prompt) { - params.prompt = gpt_random_prompt(rng); - } - llama_backend_init(); llama_numa_init(params.numa); diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index 6f374a2b..6b977fde 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -1,6 +1,6 @@ set(TARGET quantize) add_executable(${TARGET} quantize.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index da1850df..432cc2b4 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -8,7 +8,6 @@ #include <unordered_map> #include <fstream> #include <cmath> -#include <algorithm> struct quant_option { std::string name; @@ -53,6 +52,10 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = { { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, }; +static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file"; +static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix.dataset"; +static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count"; +static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count"; static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) { std::string ftype_str; @@ -113,7 +116,7 @@ static void usage(const char * executable) { exit(1); } -static void load_imatrix(const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float>> & imatrix_data) { +static int load_imatrix(const std::string & imatrix_file, std::string & 
imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) { std::ifstream in(imatrix_file.c_str(), std::ios::binary); if (!in) { printf("%s: failed to open %s\n",__func__, imatrix_file.c_str()); @@ -160,18 +163,33 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str()); } } - printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str()); + + // latest imatrix version contains the dataset filename at the end of the file + int m_last_call = 0; + if (in.peek() != EOF) { + in.read((char *)&m_last_call, sizeof(m_last_call)); + int dataset_len; + in.read((char *)&dataset_len, sizeof(dataset_len)); + std::vector<char> dataset_as_vec(dataset_len); + in.read(dataset_as_vec.data(), dataset_len); + imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end()); + printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str()); + } + printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call); + return m_last_call; } -static void prepare_imatrix(const std::string & imatrix_file, +static int prepare_imatrix(const std::string & imatrix_file, + std::string & imatrix_dataset, const std::vector<std::string> & included_weights, const std::vector<std::string> & excluded_weights, std::unordered_map<std::string, std::vector<float>> & imatrix_data) { + int m_last_call = -1; if (!imatrix_file.empty()) { - load_imatrix(imatrix_file, imatrix_data); + m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data); } if (imatrix_data.empty()) { - return; + return m_last_call; } if (!excluded_weights.empty()) { for (auto& name : excluded_weights) { @@ -197,6 +215,7 @@ static void prepare_imatrix(const std::string & imatrix_file, if (!imatrix_data.empty()) { 
printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size())); } + return m_last_call; } static ggml_type parse_ggml_type(const char * arg) { @@ -211,43 +230,6 @@ static ggml_type parse_ggml_type(const char * arg) { return result; } -static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) { - const char* sep = strchr(data, '='); - if (sep == nullptr || sep - data >= 128) { - fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data); - return false; - } - llama_model_kv_override kvo; - std::strncpy(kvo.key, data, sep - data); - kvo.key[sep - data] = 0; - sep++; - if (strncmp(sep, "int:", 4) == 0) { - sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; - kvo.int_value = std::atol(sep); - } else if (strncmp(sep, "float:", 6) == 0) { - sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; - kvo.float_value = std::atof(sep); - } else if (strncmp(sep, "bool:", 5) == 0) { - sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; - if (std::strcmp(sep, "true") == 0) { - kvo.bool_value = true; - } else if (std::strcmp(sep, "false") == 0) { - kvo.bool_value = false; - } else { - fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data); - return false; - } - } else { - fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); - return false; - } - overrides.emplace_back(std::move(kvo)); - return true; -} - int main(int argc, char ** argv) { if (argc < 3) { usage(argv[0]); @@ -316,10 +298,43 @@ int main(int argc, char ** argv) { usage(argv[0]); } + std::string imatrix_dataset; std::unordered_map<std::string, std::vector<float>> imatrix_data; - prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data); + int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data); if (!imatrix_data.empty()) { params.imatrix = &imatrix_data; + { + llama_model_kv_override kvo; + std::strcpy(kvo.key, 
LLM_KV_QUANTIZE_IMATRIX_FILE); + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; + strncpy(kvo.val_str, imatrix_file.c_str(), 127); + kvo.val_str[127] = '\0'; + kv_overrides.emplace_back(std::move(kvo)); + } + if (!imatrix_dataset.empty()) { + llama_model_kv_override kvo; + std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET); + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; + strncpy(kvo.val_str, imatrix_dataset.c_str(), 127); + kvo.val_str[127] = '\0'; + kv_overrides.emplace_back(std::move(kvo)); + } + + { + llama_model_kv_override kvo; + std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES); + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.val_i64 = imatrix_data.size(); + kv_overrides.emplace_back(std::move(kvo)); + } + + if (m_last_call > 0) { + llama_model_kv_override kvo; + std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS); + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.val_i64 = m_last_call; + kv_overrides.emplace_back(std::move(kvo)); + } } if (!kv_overrides.empty()) { kv_overrides.emplace_back(); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 48ef8ff2..6f8ba3fc 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2392,7 +2392,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); - printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); + printf(" types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n"); printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n"); printf(" --chat-template JINJA_TEMPLATE\n"); @@ -2823,43 +2823,11 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, invalid_param = true; break; } - char * sep = strchr(argv[i], '='); - if (sep == nullptr || sep - argv[i] >= 128) { - fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]); - invalid_param = true; - break; - } - - struct llama_model_kv_override kvo; - std::strncpy(kvo.key, argv[i], sep - argv[i]); - kvo.key[sep - argv[i]] = 0; - sep++; - if (strncmp(sep, "int:", 4) == 0) { - sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; - kvo.int_value = std::atol(sep); - } else if (strncmp(sep, "float:", 6) == 0) { - sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; - kvo.float_value = std::atof(sep); - } else if (strncmp(sep, "bool:", 5) == 0) { - sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; - if (std::strcmp(sep, "true") == 0) { - kvo.bool_value = true; - } else if (std::strcmp(sep, "false") == 0) { - kvo.bool_value = false; - } else { - fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]); - invalid_param = true; - break; - } - } else { + if (!parse_kv_override(argv[i], params.kv_overrides)) { fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); invalid_param = true; break; } - params.kv_overrides.push_back(kvo); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); server_print_usage(argv[0], default_params, default_sparams); |