-rw-r--r--   examples/quantize-stats/quantize-stats.cpp | 45
-rw-r--r--   src/llama.cpp                              | 34
2 files changed, 73 insertions, 6 deletions
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 6264deb4..88a7d2b9 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -17,6 +17,7 @@
 #include <vector>
 #include <thread>
 #include <mutex>
+#include <array>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -227,6 +228,31 @@ static void test_roundtrip_on_layer(
     }
 }
 
+static void print_fp_stats(const char * msg, const uint64_t * counts) {
+    printf("===== %s\n", msg);
+    uint64_t tot = 0; for (int i = 0; i < 32; ++i) tot += counts[i];
+    double norm = 1./tot;
+    for (int i = 0; i < 32; ++i) {
+        if (!counts[i]) continue;
+        uint16_t val = i << 10;
+        float f = ggml_fp16_to_fp32(val);
+        printf("%2d %f %g\n", i, norm*counts[i], f);
+    }
+}
+
+static void analyze_tensor_fp(const ggml_tensor * t, uint64_t * H) {
+    if (t->type != GGML_TYPE_F16) return;
+    if (!ggml_is_contiguous(t)) return;
+    int n = ggml_nelements(t);
+    const uint16_t * x = (const uint16_t *)t->data;
+    std::array<uint64_t, 32> counts = {};
+    for (int j = 0; j < n; ++j) {
+        ++counts[(x[j] >> 10) & 31];
+    }
+    for (int i = 0; i < 32; ++i) H[i] += counts[i];
+    print_fp_stats(t->name, counts.data());
+}
+
 int main(int argc, char ** argv) {
     ggml_time_init();
 
@@ -236,6 +262,7 @@ int main(int argc, char ** argv) {
 
     int max_thread = 0;
     bool invalid_param = false;
+    bool analyze_fp = false;
     std::string arg;
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -249,6 +276,8 @@ int main(int argc, char ** argv) {
             params.verbose = true;
         } else if (arg == "-p" || arg == "--per-layer-stats") {
            params.per_layer_stats = true;
+        } else if (arg == "-afp" || arg == "--analyze-fp") {
+            analyze_fp = true;
         } else if (arg == "--histogram") {
             params.print_histogram = true;
         } else if (arg == "-m" || arg == "--model") {
@@ -375,6 +404,22 @@ int main(int argc, char ** argv) {
     std::vector<char> quantized_scratch;
     std::vector<float> output_scratch;
 
+    if (analyze_fp) {
+        for (const auto& kv_tensor : tensors) {
+            if (!layer_included(params, kv_tensor.first)) {
+                continue;
+            }
+            if (kv_tensor.second->ne[0] == 1 || kv_tensor.second->ne[1] == 1) {
+                // we never quantize those
+                continue;
+            }
+            std::array<uint64_t, 32> H = {};
+            analyze_tensor_fp(kv_tensor.second, H.data());
+            print_fp_stats("Total", H.data());
+        }
+        return 0;
+    }
+
     // loop throught quantization types
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         const ggml_type type = (ggml_type) i;
diff --git a/src/llama.cpp b/src/llama.cpp
index 80104303..789e2a7c 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5882,18 +5882,40 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: model type       = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype      = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
     if (ml.n_elements >= 1e12) {
-        LLAMA_LOG_INFO("%s: model params     = %.2f T\n", __func__, ml.n_elements*1e-12);
+        LLAMA_LOG_INFO("%s: model params     = %.3f T\n", __func__, ml.n_elements*1e-12);
     } else if (ml.n_elements >= 1e9) {
-        LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, ml.n_elements*1e-9);
+        LLAMA_LOG_INFO("%s: model params     = %.3f B\n", __func__, ml.n_elements*1e-9);
     } else if (ml.n_elements >= 1e6) {
-        LLAMA_LOG_INFO("%s: model params     = %.2f M\n", __func__, ml.n_elements*1e-6);
+        LLAMA_LOG_INFO("%s: model params     = %.3f M\n", __func__, ml.n_elements*1e-6);
     } else {
-        LLAMA_LOG_INFO("%s: model params     = %.2f K\n", __func__, ml.n_elements*1e-3);
+        LLAMA_LOG_INFO("%s: model params     = %.3f K\n", __func__, ml.n_elements*1e-3);
     }
     if (ml.n_bytes < GiB) {
-        LLAMA_LOG_INFO("%s: model size       = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+        LLAMA_LOG_INFO("%s: model size       = %.3f MiB (%.3f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
-        LLAMA_LOG_INFO("%s: model size       = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+        LLAMA_LOG_INFO("%s: model size       = %.3f GiB (%.3f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    }
+    {
+        auto n_bytes    = ml.n_bytes;
+        auto n_elements = ml.n_elements;
+        auto meta_tke = ml.get_tensor_meta("token_embd.weight");
+        auto meta_out = ml.get_tensor_meta("output.weight");
+        if (meta_tke && meta_out) {
+            n_bytes    -= ggml_nbytes(meta_tke);
+            n_elements -= ggml_nelements(meta_tke);
+            n_bytes    -= ggml_nbytes(meta_out);
+            n_elements -= ggml_nelements(meta_out);
+            if (n_bytes < GiB) {
+                LLAMA_LOG_INFO("%s: repeating layers = %.3f MiB (%.3f BPW", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
+            } else {
+                LLAMA_LOG_INFO("%s: repeating layers = %.3f GiB (%.3f BPW", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
+            }
+            if (ml.n_elements >= 1e9) {
+                printf(", %.3f B parameters)\n", n_elements*1e-9);
+            } else {
+                printf(", %.3f M parameters)\n", n_elements*1e-6);
+            }
+        }
     }
 
     // general kv
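
Note on what analyze_tensor_fp counts: (x[j] >> 10) & 31 extracts the five exponent bits of an IEEE 754 half-precision value (bits 10..14), discarding the sign bit and the 10-bit mantissa. Bucket i therefore collects all normal values with magnitude in [2^(i-15), 2^(i-14)); bucket 0 holds zeros and subnormals, bucket 31 holds infinities and NaNs. Next to each bucket's relative frequency, print_fp_stats prints ggml_fp16_to_fp32(i << 10), the smallest magnitude in a normal bucket. A minimal standalone sketch of the same bucketing, with hypothetical sample data and no ggml dependency:

    // Standalone sketch of the fp16 exponent histogram in the patch above.
    // Assumption: inputs are raw IEEE 754 half-precision bit patterns.
    #include <array>
    #include <cstdint>
    #include <cstdio>

    int main() {
        // Hypothetical sample: 0.5 (0x3800), 1.0 (0x3C00), 1.0, 2.0 (0x4000)
        const uint16_t x[] = { 0x3800, 0x3C00, 0x3C00, 0x4000 };
        std::array<uint64_t, 32> counts = {};
        for (uint16_t v : x) {
            ++counts[(v >> 10) & 31]; // bits 10..14: the 5-bit exponent
        }
        for (int i = 0; i < 32; ++i) {
            if (counts[i]) {
                printf("bucket %2d: %llu value(s)\n", i, (unsigned long long) counts[i]);
            }
        }
        return 0;
    }

This prints buckets 14, 15, and 16 with counts 1, 2, and 1. Assuming the new flag combines with the tool's existing options, an invocation along the lines of "quantize-stats -m model.gguf --analyze-fp" should print one such histogram per contiguous F16 tensor that would be quantized.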
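
On the llama.cpp side, besides bumping the existing log lines from %.2f to %.3f, the new block prints a second size line covering only the repeating layers, i.e. everything except token_embd.weight and output.weight; those two tensors are often kept at higher precision, so including them skews the bits-per-weight (BPW) figure of the quantized blocks. BPW itself is plain arithmetic, total bits over total elements; a small illustration with made-up numbers:

    // BPW (bits per weight) arithmetic as used in llm_load_print_meta.
    // The sizes below are hypothetical, for illustration only.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint64_t n_elements = 7000000000ULL;                   // 7 B parameters
        const double   n_bytes    = 3.5 * 1024.0 * 1024.0 * 1024.0;  // 3.5 GiB
        printf("BPW = %.3f\n", n_bytes * 8.0 / n_elements);          // prints BPW = 4.295
        return 0;
    }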