diff options
Diffstat (limited to 'examples/quantize/quantize.cpp')
-rw-r--r-- | examples/quantize/quantize.cpp | 84 |
1 files changed, 67 insertions, 17 deletions
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 79e60ea7..eed92c5a 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -87,13 +87,17 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp // [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n"); printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); + printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); + printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); + printf(" --override-kv KEY=TYPE:VALUE\n"); + printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); printf("Note: --include-weights and --exclude-weights cannot be used together\n"); printf("\nAllowed quantization types:\n"); for (auto & it : QUANT_OPTIONS) { @@ -107,14 +111,14 @@ static void usage(const char * executable) { exit(1); } -static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std::string, std::vector<float>>& imatrix_data) { +static void load_imatrix(const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float>> & imatrix_data) { std::ifstream in(imatrix_file.c_str(), std::ios::binary); if (!in) { - printf("%s: failed to open %s\n",__func__,imatrix_file.c_str()); + printf("%s: failed to open %s\n",__func__, imatrix_file.c_str()); return; } int n_entries; - in.read((char*)&n_entries, sizeof(n_entries)); + in.read((char *)&n_entries, sizeof(n_entries)); if (in.fail() || n_entries < 1) { printf("%s: no data in file %s\n", __func__, imatrix_file.c_str()); return; @@ -124,25 +128,25 @@ static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std std::vector<char> name_as_vec(len+1); in.read((char *)name_as_vec.data(), len); if (in.fail()) { - printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file.c_str()); + printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str()); return; } name_as_vec[len] = 0; std::string name{name_as_vec.data()}; - auto& e = imatrix_data[std::move(name)]; + auto & e = imatrix_data[std::move(name)]; int ncall; - in.read((char*)&ncall, sizeof(ncall)); + in.read((char *)&ncall, sizeof(ncall)); int nval; in.read((char *)&nval, sizeof(nval)); if (in.fail() || nval < 1) { - printf("%s: failed reading number of values for entry %d\n",__func__,i); + printf("%s: failed reading number of values for entry %d\n", __func__, i); imatrix_data = {}; return; } e.resize(nval); - in.read((char*)e.data(), nval*sizeof(float)); + in.read((char *)e.data(), nval*sizeof(float)); if (in.fail()) { - printf("%s: failed reading data for entry %d\n",__func__,i); + printf("%s: failed reading data for entry %d\n", __func__, i); imatrix_data = {}; return; } @@ -150,13 +154,13 @@ static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std for (auto& v : e) v /= ncall; } } - printf("%s: loaded %d importance matrix entries from %s\n",__func__,int(imatrix_data.size()),imatrix_file.c_str()); + printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str()); } -static void prepare_imatrix(const std::string& imatrix_file, - const std::vector<std::string>& included_weights, - const std::vector<std::string>& excluded_weights, - std::unordered_map<std::string, std::vector<float>>& imatrix_data) { +static void prepare_imatrix(const std::string & imatrix_file, + const std::vector<std::string> & included_weights, + const std::vector<std::string> & excluded_weights, + std::unordered_map<std::string, std::vector<float>> & imatrix_data) { if (!imatrix_file.empty()) { load_imatrix(imatrix_file, imatrix_data); } @@ -201,6 +205,43 @@ static ggml_type parse_ggml_type(const char * arg) { return result; } +static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) { + const char* sep = strchr(data, '='); + if (sep == nullptr || sep - data >= 128) { + fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data); + return false; + } + llama_model_kv_override kvo; + std::strncpy(kvo.key, data, sep - data); + kvo.key[sep - data] = 0; + sep++; + if (strncmp(sep, "int:", 4) == 0) { + sep += 4; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.int_value = std::atol(sep); + } else if (strncmp(sep, "float:", 6) == 0) { + sep += 6; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; + kvo.float_value = std::atof(sep); + } else if (strncmp(sep, "bool:", 5) == 0) { + sep += 5; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; + if (std::strcmp(sep, "true") == 0) { + kvo.bool_value = true; + } else if (std::strcmp(sep, "false") == 0) { + kvo.bool_value = false; + } else { + fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data); + return false; + } + } else { + fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); + return false; + } + overrides.emplace_back(std::move(kvo)); + return true; +} + int main(int argc, char ** argv) { if (argc < 3) { usage(argv[0]); @@ -211,6 +252,7 @@ int main(int argc, char ** argv) { int arg_idx = 1; std::string imatrix_file; std::vector<std::string> included_weights, excluded_weights; + std::vector<llama_model_kv_override> kv_overrides; for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) { @@ -227,6 +269,10 @@ int main(int argc, char ** argv) { } else { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--override-kv") == 0) { + if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) { params.allow_requantize = true; } else if (strcmp(argv[arg_idx], "--pure") == 0) { @@ -267,6 +313,11 @@ int main(int argc, char ** argv) { if (!imatrix_data.empty()) { params.imatrix = &imatrix_data; } + if (!kv_overrides.empty()) { + kv_overrides.emplace_back(); + kv_overrides.back().key[0] = 0; + params.kv_overrides = &kv_overrides; + } llama_backend_init(); @@ -288,8 +339,7 @@ int main(int argc, char ** argv) { if (ftype_str == "COPY") { params.only_copy = true; } - } - else { + } else { fname_out = argv[arg_idx]; arg_idx++; |