summaryrefslogtreecommitdiff
path: root/common
diff options
context:
space:
mode:
authorKawrakow <48489457+ikawrakow@users.noreply.github.com>2024-01-21 14:42:44 +0200
committerGitHub <noreply@github.com>2024-01-21 14:42:44 +0200
commit7dcbe39d36b76389f6c5cd3b151928472b7e22ff (patch)
treed0b13b66cdd5046d5767b4791d183bed3e97c61c /common
parent726c0fa9a2da976e9c5d5c51e185d9dd453fc9e5 (diff)
Add ability to evauate multiple choice tasks (#5047)
* TruthfulQA: 1st attempt, does not look like it is working The same implementation can be used for HellaSwag as well, so I converted a HellaSwag validation dataset to the binary format used here and tested with that. The score is only around 50, so something is not quite right. * TruthfulQA: works but the result is bad I know it works because if I convert the HellaSwag validation data to the binary format used in the truthful_qa_score() function I get the exact same result as from the hellaswag_score() function. But I guess, the questions are tricky and the way I have done the combination of question + answer is very likely not the best. The TruthfulQA validation dataset contains 817 questions, with random chance result around 19%. With this version I get 29.1% for Mistral-7B and 55.2% for Mistral-7B-Instruct-v0.2. The HF leader board results for these two models are 42.2% and 68.3%, respectively. * TruthfulQA: fix random sample * TruthfulQA: prepare tasks in parallel for large test datasets * Rename truthful_qa to multiple_choice * Make MSVC happy I had forgotten that MSVC does not make constexpr's available inside a lambda. --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'common')
-rw-r--r--common/common.cpp31
-rw-r--r--common/common.h3
2 files changed, 34 insertions, 0 deletions
diff --git a/common/common.cpp b/common/common.cpp
index ce20360a..0e4b8bab 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -203,6 +203,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.prompt_cache_all = true;
} else if (arg == "--prompt-cache-ro") {
params.prompt_cache_ro = true;
+ } else if (arg == "-bf" || arg == "--binary-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ std::ifstream file(argv[i], std::ios::binary);
+ if (!file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+ invalid_param = true;
+ break;
+ }
+ // store the external file name in params
+ params.prompt_file = argv[i];
+ file.seekg(0, std::ios::end);
+ size_t size = file.tellg();
+ file.seekg(0, std::ios::beg);
+ params.prompt.resize(size);
+ file.read((char *)params.prompt.data(), size);
+ fprintf(stderr, "Read %zu bytes from binary file %s\n", size, argv[i]);
} else if (arg == "-f" || arg == "--file") {
if (++i >= argc) {
invalid_param = true;
@@ -689,6 +708,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.winogrande_tasks = std::stoi(argv[i]);
+ } else if (arg == "--multiple-choice") {
+ params.multiple_choice = true;
+ } else if (arg == "--multiple-choice-tasks") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.multiple_choice_tasks = std::stoi(argv[i]);
} else if (arg == "--ignore-eos") {
params.ignore_eos = true;
} else if (arg == "--no-penalize-nl") {
@@ -888,6 +915,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
printf(" -f FNAME, --file FNAME\n");
printf(" prompt file to start generation.\n");
+ printf(" -bf FNAME, --binary-file FNAME\n");
+ printf(" binary file containing multiple choice tasks.\n");
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
@@ -936,6 +965,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n");
printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
+ printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n");
+ printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
diff --git a/common/common.h b/common/common.h
index 0ae9c18b..c69ad7e9 100644
--- a/common/common.h
+++ b/common/common.h
@@ -108,6 +108,9 @@ struct gpt_params {
bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+ bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+ size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
+
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs