author    Pierrick Hymbert <pierrick.hymbert@gmail.com>    2024-04-26 20:06:33 +0200
committer GitHub <noreply@github.com>                      2024-04-26 20:06:33 +0200
commit    0c4d489e29e53589bf13a801fe7c94b7b546d8f6 (patch)
tree      fc83fade919050b3a9471dd892d8aef438c39aaf /examples
parent    017e6999b5184234370b22a2f868e1be911e8d88 (diff)
quantize: add imatrix and dataset metadata in GGUF (#6658)
* imatrix: save the dataset file used in the output file
* llama: support kv overrides type string string
* common: factorize KV Overrides parsing between common and server
* quantize: add imatrix n entries and dataset KV metadata
  quantize: factorize KV Overrides parsing between common #6656
* llama: remove kv override str_value initialization as it does not compile on some toolchain
* quantize: add imatrix m_last_call as `quantize.imatrix.chunks_count`
* quantize: add imatrix filename in KV
* llama: add llama_model_kv_override_free
* common: add llama_model_kv_override_free
  common: free kv override if used after model loading
* llama: finally move the string KV override value to the stack
* llama : minor
* no need to add a NUL to the std::vector, std::string can be initialized
  from a pair of iterators.

  Co-authored-by: slaren <slarengh@gmail.com>
* kv override: ensure string termination

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
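Editor's note on the file-format change: the commit appends a small trailer to the imatrix binary format. After the existing per-tensor records (name length, name, ncall, value count, values), save_imatrix now writes the chunk count (m_last_call) followed by a length-prefixed dataset name, and quantize reads that trailer back when present. A minimal standalone reader sketch follows; read_imatrix_trailer is a hypothetical helper for illustration, not part of the commit, and it mirrors the reads added to load_imatrix in the quantize.cpp hunk below.

#include <fstream>
#include <string>
#include <vector>

// Sketch: read the trailer this change appends to an imatrix file,
// assuming `in` is already positioned past the per-tensor entries.
static bool read_imatrix_trailer(std::ifstream & in, int & chunks, std::string & dataset) {
    if (in.peek() == EOF) {
        return false; // old-format file: no trailer present
    }
    in.read((char *) &chunks, sizeof(chunks)); // m_last_call: chunks the matrix was computed on
    int n_dataset = 0;
    in.read((char *) &n_dataset, sizeof(n_dataset));
    std::vector<char> buf(n_dataset);
    in.read(buf.data(), n_dataset);
    // no NUL byte is stored; the string is built from iterators, as in the diff
    dataset.assign(buf.begin(), buf.end());
    return bool(in);
}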
Diffstat (limited to 'examples')
-rw-r--r--  examples/imatrix/imatrix.cpp       77
-rw-r--r--  examples/quantize/CMakeLists.txt    2
-rw-r--r--  examples/quantize/quantize.cpp    103
-rw-r--r--  examples/server/server.cpp         36
4 files changed, 106 insertions, 112 deletions
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 98c0e93e..71e7a727 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -23,6 +23,7 @@ struct Stats {
};
struct StatParams {
+ std::string dataset;
std::string ofile = "imatrix.dat";
int n_output_frequency = 10;
int verbosity = 1;
@@ -46,7 +47,7 @@ private:
std::vector<float> m_src1_data;
std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
//
- void save_imatrix(const char * file_name) const;
+ void save_imatrix(const char * file_name, const char * dataset) const;
void keep_imatrix(int ncall) const;
};
@@ -199,7 +200,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
}
void IMatrixCollector::save_imatrix() const {
- save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
+ save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
}
void IMatrixCollector::keep_imatrix(int ncall) const {
@@ -207,24 +208,33 @@ void IMatrixCollector::keep_imatrix(int ncall) const {
if (file_name.empty()) file_name = "imatrix.dat";
file_name += ".at_";
file_name += std::to_string(ncall);
- save_imatrix(file_name.c_str());
+ save_imatrix(file_name.c_str(), m_params.dataset.c_str());
}
-void IMatrixCollector::save_imatrix(const char * fname) const {
+void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
std::ofstream out(fname, std::ios::binary);
int n_entries = m_stats.size();
- out.write((const char*)&n_entries, sizeof(n_entries));
- for (auto& p : m_stats) {
+ out.write((const char *) &n_entries, sizeof(n_entries));
+ for (const auto & p : m_stats) {
int len = p.first.size();
- out.write((const char*)&len, sizeof(len));
+ out.write((const char *) &len, sizeof(len));
out.write(p.first.c_str(), len);
- out.write((const char*)&p.second.ncall, sizeof(p.second.ncall));
+ out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
int nval = p.second.values.size();
- out.write((const char*)&nval, sizeof(nval));
- if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float));
+ out.write((const char *) &nval, sizeof(nval));
+ if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float));
}
+
+ // Write the number of call the matrix was computed with
+ out.write((const char *) &m_last_call, sizeof(m_last_call));
+
+ // Write the dataset name at the end of the file to later on specify it in quantize
+ int n_dataset = strlen(dataset);
+ out.write((const char *) &n_dataset, sizeof(n_dataset));
+ out.write(dataset, n_dataset);
+
if (m_params.verbosity > 0) {
- fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname);
+ fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
}
}
@@ -547,6 +557,29 @@ int main(int argc, char ** argv) {
}
}
+ gpt_params params;
+ params.n_batch = 512;
+ if (!gpt_params_parse(args.size(), args.data(), params)) {
+ return 1;
+ }
+
+ params.logits_all = true;
+ params.n_batch = std::min(params.n_batch, params.n_ctx);
+
+ print_build_info();
+
+ if (params.seed == LLAMA_DEFAULT_SEED) {
+ params.seed = time(NULL);
+ }
+
+ fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
+
+ std::mt19937 rng(params.seed);
+ if (params.random_prompt) {
+ params.prompt = gpt_random_prompt(rng);
+ }
+
+ sparams.dataset = params.prompt_file;
g_collector.set_parameters(std::move(sparams));
if (!combine_files.empty()) {
@@ -585,28 +618,6 @@ int main(int argc, char ** argv) {
}
}
- gpt_params params;
- params.n_batch = 512;
- if (!gpt_params_parse(args.size(), args.data(), params)) {
- return 1;
- }
-
- params.logits_all = true;
- params.n_batch = std::min(params.n_batch, params.n_ctx);
-
- print_build_info();
-
- if (params.seed == LLAMA_DEFAULT_SEED) {
- params.seed = time(NULL);
- }
-
- fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
-
- std::mt19937 rng(params.seed);
- if (params.random_prompt) {
- params.prompt = gpt_random_prompt(rng);
- }
-
llama_backend_init();
llama_numa_init(params.numa);
diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt
index 6f374a2b..6b977fde 100644
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -1,6 +1,6 @@
set(TARGET quantize)
add_executable(${TARGET} quantize.cpp)
install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index da1850df..432cc2b4 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -8,7 +8,6 @@
#include <unordered_map>
#include <fstream>
#include <cmath>
-#include <algorithm>
struct quant_option {
std::string name;
@@ -53,6 +52,10 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
};
+static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix.dataset";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count";
static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
std::string ftype_str;
@@ -113,7 +116,7 @@ static void usage(const char * executable) {
exit(1);
}
-static void load_imatrix(const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
std::ifstream in(imatrix_file.c_str(), std::ios::binary);
if (!in) {
printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
@@ -160,18 +163,33 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
}
}
- printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
+
+ // latest imatrix version contains the dataset filename at the end of the file
+ int m_last_call = 0;
+ if (in.peek() != EOF) {
+ in.read((char *)&m_last_call, sizeof(m_last_call));
+ int dataset_len;
+ in.read((char *)&dataset_len, sizeof(dataset_len));
+ std::vector<char> dataset_as_vec(dataset_len);
+ in.read(dataset_as_vec.data(), dataset_len);
+ imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end());
+ printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
+ }
+ printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
+ return m_last_call;
}
-static void prepare_imatrix(const std::string & imatrix_file,
+static int prepare_imatrix(const std::string & imatrix_file,
+ std::string & imatrix_dataset,
const std::vector<std::string> & included_weights,
const std::vector<std::string> & excluded_weights,
std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+ int m_last_call = -1;
if (!imatrix_file.empty()) {
- load_imatrix(imatrix_file, imatrix_data);
+ m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data);
}
if (imatrix_data.empty()) {
- return;
+ return m_last_call;
}
if (!excluded_weights.empty()) {
for (auto& name : excluded_weights) {
@@ -197,6 +215,7 @@ static void prepare_imatrix(const std::string & imatrix_file,
if (!imatrix_data.empty()) {
printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
}
+ return m_last_call;
}
static ggml_type parse_ggml_type(const char * arg) {
@@ -211,43 +230,6 @@ static ggml_type parse_ggml_type(const char * arg) {
return result;
}
-static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
- const char* sep = strchr(data, '=');
- if (sep == nullptr || sep - data >= 128) {
- fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
- return false;
- }
- llama_model_kv_override kvo;
- std::strncpy(kvo.key, data, sep - data);
- kvo.key[sep - data] = 0;
- sep++;
- if (strncmp(sep, "int:", 4) == 0) {
- sep += 4;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
- kvo.int_value = std::atol(sep);
- } else if (strncmp(sep, "float:", 6) == 0) {
- sep += 6;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
- kvo.float_value = std::atof(sep);
- } else if (strncmp(sep, "bool:", 5) == 0) {
- sep += 5;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
- if (std::strcmp(sep, "true") == 0) {
- kvo.bool_value = true;
- } else if (std::strcmp(sep, "false") == 0) {
- kvo.bool_value = false;
- } else {
- fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
- return false;
- }
- } else {
- fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
- return false;
- }
- overrides.emplace_back(std::move(kvo));
- return true;
-}
-
int main(int argc, char ** argv) {
if (argc < 3) {
usage(argv[0]);
@@ -316,10 +298,43 @@ int main(int argc, char ** argv) {
usage(argv[0]);
}
+ std::string imatrix_dataset;
std::unordered_map<std::string, std::vector<float>> imatrix_data;
- prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data);
+ int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data);
if (!imatrix_data.empty()) {
params.imatrix = &imatrix_data;
+ {
+ llama_model_kv_override kvo;
+ std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+ strncpy(kvo.val_str, imatrix_file.c_str(), 127);
+ kvo.val_str[127] = '\0';
+ kv_overrides.emplace_back(std::move(kvo));
+ }
+ if (!imatrix_dataset.empty()) {
+ llama_model_kv_override kvo;
+ std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+ strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
+ kvo.val_str[127] = '\0';
+ kv_overrides.emplace_back(std::move(kvo));
+ }
+
+ {
+ llama_model_kv_override kvo;
+ std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+ kvo.val_i64 = imatrix_data.size();
+ kv_overrides.emplace_back(std::move(kvo));
+ }
+
+ if (m_last_call > 0) {
+ llama_model_kv_override kvo;
+ std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+ kvo.val_i64 = m_last_call;
+ kv_overrides.emplace_back(std::move(kvo));
+ }
}
if (!kv_overrides.empty()) {
kv_overrides.emplace_back();
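Editor's note: the string overrides built above all follow one pattern, using the new LLAMA_KV_OVERRIDE_TYPE_STR tag with the value copied into a fixed-size val_str buffer on the stack (127 characters plus a terminating NUL, matching the strncpy/'\0' pairs in the hunk). A hedged sketch of that pattern factored into a helper; add_kv_override_str is hypothetical and the 128-byte key/val_str sizes are assumed from the usage in this change.

#include <cstring>
#include <vector>
#include "llama.h"

// Sketch: append a string KV override the way quantize does above.
// Assumes llama_model_kv_override carries char key[128] and char val_str[128].
static void add_kv_override_str(std::vector<llama_model_kv_override> & overrides,
                                const char * key, const char * value) {
    llama_model_kv_override kvo;
    std::strncpy(kvo.key, key, sizeof(kvo.key) - 1);
    kvo.key[sizeof(kvo.key) - 1] = '\0';
    kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
    std::strncpy(kvo.val_str, value, sizeof(kvo.val_str) - 1);
    kvo.val_str[sizeof(kvo.val_str) - 1] = '\0'; // ensure string termination
    overrides.emplace_back(kvo);
}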
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 48ef8ff2..6f8ba3fc 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2392,7 +2392,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
- printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+ printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n");
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n");
printf(" --chat-template JINJA_TEMPLATE\n");
@@ -2823,43 +2823,11 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
invalid_param = true;
break;
}
- char * sep = strchr(argv[i], '=');
- if (sep == nullptr || sep - argv[i] >= 128) {
- fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
- invalid_param = true;
- break;
- }
-
- struct llama_model_kv_override kvo;
- std::strncpy(kvo.key, argv[i], sep - argv[i]);
- kvo.key[sep - argv[i]] = 0;
- sep++;
- if (strncmp(sep, "int:", 4) == 0) {
- sep += 4;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
- kvo.int_value = std::atol(sep);
- } else if (strncmp(sep, "float:", 6) == 0) {
- sep += 6;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
- kvo.float_value = std::atof(sep);
- } else if (strncmp(sep, "bool:", 5) == 0) {
- sep += 5;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
- if (std::strcmp(sep, "true") == 0) {
- kvo.bool_value = true;
- } else if (std::strcmp(sep, "false") == 0) {
- kvo.bool_value = false;
- } else {
- fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
- invalid_param = true;
- break;
- }
- } else {
+ if (!parse_kv_override(argv[i], params.kv_overrides)) {
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
invalid_param = true;
break;
}
- params.kv_overrides.push_back(kvo);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
server_print_usage(argv[0], default_params, default_sparams);
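Editor's note: with the parsing factored into common, both the server (above) and quantize now handle the new str type through the shared parse_kv_override helper instead of duplicating the switch on int/float/bool. A usage sketch, under the assumption that the common helper keeps the signature of the static version removed from quantize.cpp; the key and value are illustrative only.

#include <vector>
#include "common.h"
#include "llama.h"

int main() {
    std::vector<llama_model_kv_override> overrides;
    // roughly what `--override-kv quantize.imatrix.dataset=str:some-dataset.txt`
    // would feed through the shared parser (illustrative values)
    if (!parse_kv_override("quantize.imatrix.dataset=str:some-dataset.txt", overrides)) {
        return 1; // malformed override
    }
    return 0;
}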