Diffstat (limited to 'examples')
-rw-r--r--  examples/main/main.cpp                78
-rw-r--r--  examples/perplexity/perplexity.cpp   141
-rw-r--r--  examples/server/server.cpp             2
3 files changed, 192 insertions(+), 29 deletions(-)
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 3ce57f43..89cc4f60 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -17,6 +17,7 @@
#include <ctime>
#include <fstream>
#include <iostream>
+#include <sstream>
#include <string>
#include <vector>
@@ -36,9 +37,57 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
-static llama_context ** g_ctx;
+static llama_context ** g_ctx;
+static llama_model ** g_model;
+static gpt_params * g_params;
+static std::vector<llama_token> * g_input_tokens;
+static std::ostringstream * g_output_ss;
+static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false;
+void write_logfile(
+ const llama_context * ctx, const gpt_params & params, const llama_model * model,
+ const std::vector<llama_token> input_tokens, const std::string output, const std::vector<llama_token> output_tokens) {
+
+ if (params.logdir.empty()) {
+ return;
+ }
+
+ const std::string timestamp = get_sortable_timestamp();
+
+ const bool success = create_directory_with_parents(params.logdir);
+ if (!success) {
+ fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
+ __func__, params.logdir.c_str());
+ return;
+ }
+
+ const std::string logfile_path = params.logdir + timestamp + ".yml";
+ FILE * logfile = fopen(logfile_path.c_str(), "w");
+
+ if (logfile == NULL) {
+ fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+ return;
+ }
+
+ fprintf(logfile, "binary: main\n");
+ char model_desc[128];
+ llama_model_desc(model, model_desc, sizeof(model_desc));
+ dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
+
+ fprintf(logfile, "\n");
+ fprintf(logfile, "######################\n");
+ fprintf(logfile, "# Generation Results #\n");
+ fprintf(logfile, "######################\n");
+ fprintf(logfile, "\n");
+
+ dump_string_yaml_multiline(logfile, "output", output.c_str());
+ dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
+
+ llama_dump_timing_info_yaml(logfile, ctx);
+ fclose(logfile);
+}
+
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
void sigint_handler(int signo) {
if (signo == SIGINT) {
@@ -48,6 +97,7 @@ void sigint_handler(int signo) {
console::cleanup();
printf("\n");
llama_print_timings(*g_ctx);
+ write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
_exit(130);
}
}
@@ -56,6 +106,7 @@ void sigint_handler(int signo) {
int main(int argc, char ** argv) {
gpt_params params;
+ g_params = &params;
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
@@ -116,6 +167,7 @@ int main(int argc, char ** argv) {
llama_model * model;
llama_context * ctx;
llama_context * ctx_guidance = NULL;
+ g_model = &model;
g_ctx = &ctx;
// load the model and apply lora adapter, if any
@@ -397,6 +449,10 @@ int main(int argc, char ** argv) {
int n_session_consumed = 0;
int n_past_guidance = 0;
+ std::vector<int> input_tokens; g_input_tokens = &input_tokens;
+ std::vector<int> output_tokens; g_output_tokens = &output_tokens;
+ std::ostringstream output_ss; g_output_ss = &output_ss;
+
// the first thing we will do is to output the prompt, so set color accordingly
console::set_display(console::prompt);
@@ -667,7 +723,15 @@ int main(int argc, char ** argv) {
// display text
if (input_echo) {
for (auto id : embd) {
- printf("%s", llama_token_to_piece(ctx, id).c_str());
+ const std::string token_str = llama_token_to_piece(ctx, id);
+ printf("%s", token_str.c_str());
+
+ if (embd.size() > 1) {
+ input_tokens.push_back(id);
+ } else {
+ output_tokens.push_back(id);
+ output_ss << token_str;
+ }
}
fflush(stdout);
}
@@ -761,6 +825,8 @@ int main(int argc, char ** argv) {
printf("%s", params.input_suffix.c_str());
}
+ const size_t original_size = embd_inp.size();
+
// instruct mode: insert instruction prefix
if (params.instruct && !is_antiprompt) {
n_consumed = embd_inp.size();
@@ -775,6 +841,12 @@ int main(int argc, char ** argv) {
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
}
+ for (size_t i = original_size; i < embd_inp.size(); ++i) {
+ const llama_token token = embd_inp[i];
+ output_tokens.push_back(token);
+ output_ss << llama_token_to_piece(ctx, token);
+ }
+
n_remain -= line_inp.size();
}
@@ -817,6 +889,8 @@ int main(int argc, char ** argv) {
}
llama_print_timings(ctx);
+ write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
+
if (ctx_guidance) { llama_free(ctx_guidance); }
llama_free(ctx);
llama_free_model(model);
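
Note: the write_logfile helper added above follows a simple pattern: build a sortable timestamp, create the log directory (with parents), and stream YAML key/value pairs into a per-run .yml file. The standalone sketch below reproduces that pattern with only the standard library; the directory name, keys, and token ids are placeholders, and it does not use the dump_* helpers from llama.cpp.

// Standalone sketch of the timestamped-YAML logging pattern (illustrative only, C++17).
#include <cstdio>
#include <ctime>
#include <filesystem>
#include <string>
#include <vector>

// Lexicographically sortable timestamp, e.g. "2023.08.28-12.34.56".
static std::string sortable_timestamp() {
    char buf[32];
    const std::time_t t = std::time(nullptr);
    std::strftime(buf, sizeof(buf), "%Y.%m.%d-%H.%M.%S", std::localtime(&t));
    return buf;
}

int main() {
    const std::string logdir = "logs/";           // placeholder directory
    std::filesystem::create_directories(logdir);  // analogous to create_directory_with_parents

    const std::string path = logdir + sortable_timestamp() + ".yml";
    FILE * logfile = std::fopen(path.c_str(), "w");
    if (logfile == NULL) {
        std::fprintf(stderr, "failed to open logfile %s\n", path.c_str());
        return 1;
    }

    // A few representative keys; the real dump_* helpers emit many more fields.
    std::fprintf(logfile, "binary: main\n");
    std::fprintf(logfile, "output: \"hello world\"\n");

    const std::vector<int> output_tokens = {15339, 1917};  // made-up token ids
    std::fprintf(logfile, "output_tokens: [");
    for (size_t i = 0; i < output_tokens.size(); ++i) {
        std::fprintf(logfile, "%s%d", i ? ", " : "", output_tokens[i]);
    }
    std::fprintf(logfile, "]\n");

    std::fclose(logfile);
    return 0;
}
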
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index ebafa0c2..aeb774c5 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -3,16 +3,79 @@
#include "build-info.h"
#include <cmath>
+#include <cstdio>
+#include <cstring>
#include <ctime>
#include <sstream>
-#include <cstring>
#include <thread>
#include <mutex>
+#include <vector>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
+struct results_perplexity {
+ std::vector<llama_token> tokens;
+ double ppl_value;
+ std::vector<float> logits;
+ std::vector<float> probs;
+};
+
+struct results_log_softmax {
+ double log_softmax;
+ float logit;
+ float prob;
+};
+
+void write_logfile(const llama_context * ctx, const gpt_params & params,
+ const llama_model * model, const struct results_perplexity & results) {
+
+ if (params.logdir.empty()) {
+ return;
+ }
+
+ if (params.hellaswag) {
+ fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
+ return;
+ }
+
+ const std::string timestamp = get_sortable_timestamp();
+
+ const bool success = create_directory_with_parents(params.logdir);
+ if (!success) {
+ fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
+ __func__, params.logdir.c_str());
+ return;
+ }
+
+ const std::string logfile_path = params.logdir + timestamp + ".yml";
+ FILE * logfile = fopen(logfile_path.c_str(), "w");
+
+ if (logfile == NULL) {
+ fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+ return;
+ }
+
+ fprintf(logfile, "binary: perplexity\n");
+ char model_desc[128];
+ llama_model_desc(model, model_desc, sizeof(model_desc));
+ dump_non_result_info_yaml(logfile, params, ctx, timestamp, results.tokens, model_desc);
+
+ fprintf(logfile, "\n");
+ fprintf(logfile, "######################\n");
+ fprintf(logfile, "# Perplexity Results #\n");
+ fprintf(logfile, "######################\n");
+ fprintf(logfile, "\n");
+
+ dump_vector_float_yaml(logfile, "logits", results.logits);
+ fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
+ dump_vector_float_yaml(logfile, "probs", results.probs);
+
+ llama_dump_timing_info_yaml(logfile, ctx);
+ fclose(logfile);
+}
+
std::vector<float> softmax(const std::vector<float>& logits) {
std::vector<float> probs(logits.size());
float max_logit = logits[0];
@@ -29,20 +92,20 @@ std::vector<float> softmax(const std::vector<float>& logits) {
return probs;
}
-float log_softmax(int n_vocab, const float * logits, int tok) {
+results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
float max_logit = logits[0];
for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
double sum_exp = 0.0;
for (int i = 0; i < n_vocab; ++i) sum_exp += expf(logits[i] - max_logit);
- return logits[tok] - max_logit - log(sum_exp);
+ return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
}
-void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread>& workers,
- double& nll, double& nll2) {
+void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
+ double & nll, double & nll2, float * logit_history, float * prob_history) {
std::mutex mutex;
int counter = 0;
- auto compute = [&mutex, &counter, &nll, &nll2, n_vocab, logits, tokens, n_token] () {
+ auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
double local_nll = 0, local_nll2 = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
@@ -52,34 +115,43 @@ void process_logits(int n_vocab, const float * logits, const int * tokens, int n
break;
}
lock.unlock();
- double v = -log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
+ const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
+ const double v = -results.log_softmax;
local_nll += v;
local_nll2 += v*v;
+
+ logit_history[i] = results.logit;
+ prob_history[i] = results.prob;
}
};
- for (auto& w : workers) w = std::thread(compute);
+ for (auto & w : workers) w = std::thread(compute);
compute();
- for (auto& w : workers) w.join();
+ for (auto & w : workers) w.join();
}
-void perplexity_v2(llama_context * ctx, const gpt_params & params) {
+results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
- if (params.ppl_stride <= 0) {
- fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
- return;
- }
-
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
const bool add_bos = is_spm;
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
- auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+ std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+ std::vector<float> logit_history;
+ std::vector<float> prob_history;
+
+ logit_history.resize(tokens.size());
+ prob_history.resize(tokens.size());
+
+ if (params.ppl_stride <= 0) {
+ fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
+ return {tokens, -1, logit_history, prob_history};
+ }
const int calc_chunk = params.n_ctx;
@@ -88,7 +160,7 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
if (int(tokens.size()) <= calc_chunk) {
fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
tokens.size(), params.n_ctx, params.ppl_stride);
- return;
+ return {tokens, -1, logit_history, prob_history};
}
const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride;
@@ -120,7 +192,7 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
//fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
//fprintf(stderr, "%s : failed to eval\n", __func__);
- return;
+ return {tokens, -1, logit_history, prob_history};
}
// save original token and restore it after eval
@@ -161,6 +233,8 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
logits.begin() + (j + 1) * n_vocab);
const float prob = softmax(tok_logits)[tokens[start + j + 1]];
+ logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]];
+ prob_history[start + j + 1] = prob;
nll += -std::log(prob);
++count;
@@ -174,12 +248,14 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
fflush(stdout);
}
printf("\n");
+
+ return {tokens, std::exp(nll / count), logit_history, prob_history};
}
-void perplexity(llama_context * ctx, const gpt_params & params) {
+results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
+
if (params.ppl_stride > 0) {
- perplexity_v2(ctx, params);
- return;
+ return perplexity_v2(ctx, params);
}
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
@@ -193,11 +269,17 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
- auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+ std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
+ std::vector<float> logit_history;
+ logit_history.resize(tokens.size());
+
+ std::vector<float> prob_history;
+ prob_history.resize(tokens.size());
+
const int n_chunk_max = tokens.size() / params.n_ctx;
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
@@ -236,7 +318,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
- return;
+ return {tokens, -1, logit_history, prob_history};
}
// restore the original token in case it was set to BOS
@@ -272,7 +354,8 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
// last 256 tokens. Then, we split the input up into context window size chunks to
// process the entire prompt.
const int first = std::min(512, params.n_ctx/2);
- process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first, workers, nll, nll2);
+ process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first,
+ workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += params.n_ctx - first - 1;
// perplexity is e^(average negative log-likelihood)
@@ -287,16 +370,19 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
fflush(stdout);
}
printf("\n");
+
nll2 /= count;
nll /= count;
+ const double ppl = exp(nll);
nll2 -= nll * nll;
if (nll2 > 0) {
nll2 = sqrt(nll2/(count-1));
- double ppl = exp(nll);
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
} else {
printf("Unexpected negative standard deviation of log(prob)\n");
}
+
+ return {tokens, ppl, logit_history, prob_history};
}
std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
@@ -604,13 +690,16 @@ int main(int argc, char ** argv) {
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
}
+ struct results_perplexity results;
if (params.hellaswag) {
hellaswag_score(ctx, params);
} else {
- perplexity(ctx, params);
+ results = perplexity(ctx, params);
}
llama_print_timings(ctx);
+ write_logfile(ctx, params, model, results);
+
llama_free(ctx);
llama_free_model(model);
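
Note: log_softmax in the diff above now returns the log-probability together with the raw logit and the probability of the observed token, using the usual max-shift for numerical stability: log p(tok) = logit[tok] - max - log(sum_i exp(logit[i] - max)). The standalone sketch below mirrors that computation on a toy logit vector; the values and vocabulary size are made up, and only the math matches the diff.

// Standalone check of the stabilized log-softmax used by perplexity.cpp (illustrative only).
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

struct results_log_softmax {
    double log_softmax;
    float  logit;
    float  prob;
};

static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
    float max_logit = logits[0];
    for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
    double sum_exp = 0.0;
    for (int i = 0; i < n_vocab; ++i) sum_exp += std::exp(logits[i] - max_logit);
    return {logits[tok] - max_logit - std::log(sum_exp), logits[tok],
            (float) (std::exp(logits[tok] - max_logit) / sum_exp)};
}

int main() {
    // Toy "vocabulary" of 4 logits; token 2 is the observed one.
    const std::vector<float> logits = {1.0f, 2.0f, 3.0f, 0.5f};
    const results_log_softmax r = log_softmax((int) logits.size(), logits.data(), 2);

    // Perplexity over a single token is exp(-log p(tok)); over a corpus it is exp(nll / count).
    std::printf("log p(tok) = %f  logit = %f  p(tok) = %f  ppl = %f\n",
                r.log_softmax, r.logit, r.prob, std::exp(-r.log_softmax));
    return 0;
}
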
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 89a3311f..b485a5ea 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -719,7 +719,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
- fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
+ fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
fprintf(stdout, " -nommq, --no-mul-mat-q\n");
fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");