author     Johannes Gäßler <johannesg@5d6.de>    2023-08-28 17:59:39 +0200
committer  GitHub <noreply@github.com>           2023-08-28 17:59:39 +0200
commit     6b73ef120114beb5664ea94aab48d07ed248ee52
tree       6d9c777a34a43f7b3ad6185df9639bab9be5c5cd /llama.cpp
parent     75fafcbcccc280a5b3883bc76d0a2dabf474d094

YAML result logging + preset script (#2657)

Diffstat (limited to 'llama.cpp'):
-rw-r--r--  llama.cpp  29
1 file changed, 29 insertions(+), 0 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index da8ff64d..11697ee6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6247,6 +6247,35 @@ const char * llama_print_system_info(void) {
return s.c_str();
}
+void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
+
+ fprintf(stream, "\n");
+ fprintf(stream, "###########\n");
+ fprintf(stream, "# Timings #\n");
+ fprintf(stream, "###########\n");
+ fprintf(stream, "\n");
+
+ fprintf(stream, "mst_eval: %.2f # ms / token during generation\n",
+ 1.0e-3 * ctx->t_eval_us / ctx->n_eval);
+ fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n",
+ 1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
+ fprintf(stream, "mst_sample: %.2f # ms / token during sampling\n",
+ 1.0e-3 * ctx->t_sample_us / ctx->n_sample);
+ fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
+ fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
+ fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->n_sample);
+ fprintf(stream, "t_eval_us: %ld # total microseconds spent generating tokens\n", ctx->t_eval_us);
+ fprintf(stream, "t_load_us: %ld # total microseconds spent loading the model\n", ctx->t_load_us);
+ fprintf(stream, "t_p_eval_us: %ld # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
+ fprintf(stream, "t_sample_us: %ld # total microseconds spent sampling\n", ctx->t_sample_us);
+ fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
+ 1.0e6 * ctx->n_eval / ctx->t_eval_us);
+ fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
+ 1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
+ fprintf(stream, "ts_sample: %.2f # tokens / second during sampling\n",
+ 1.0e6 * ctx->n_sample / ctx->t_sample_us);
+}
+
// For internal test use
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
return ctx->model.tensors_by_name;
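
For reference, a minimal usage sketch of the new dump function. It assumes the function is also visible to callers (e.g. declared in llama.h; any header change lies outside this llama.cpp-only diffstat), that a llama_context has already been created and run, and that the output path and helper name below are hypothetical.

// Hypothetical sketch: write the timing counters of a finished run to a
// YAML file, using the llama_dump_timing_info_yaml signature shown in the
// patch above. The helper name and file path are illustrative only.
#include <cstdio>

#include "llama.h"

static void write_timing_log(const llama_context * ctx, const char * path) {
    FILE * stream = fopen(path, "w"); // e.g. "run-timings.yml" (hypothetical)
    if (stream == NULL) {
        fprintf(stderr, "failed to open %s for writing\n", path);
        return;
    }
    // Emits one commented scalar per line (mst_eval, n_eval, t_eval_us,
    // ts_eval, ...) exactly as formatted by the fprintf calls in this patch.
    llama_dump_timing_info_yaml(stream, ctx);
    fclose(stream);
}

With the patch applied, the resulting file holds lines of the form "mst_eval: 12.34 # ms / token during generation", one plain YAML scalar per counter, so the log can be read back by any YAML parser.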