path: root/examples/cvector-generator/cvector-generator.cpp
author      Xuan Son Nguyen <thichthat@gmail.com>    2024-06-15 18:53:40 +0200
committer   GitHub <noreply@github.com>              2024-06-15 18:53:40 +0200
commit      0c7b3595b9e5ad2355818e259f06b0dc3f0065b3 (patch)
tree        1146ce43d46ad84568728a0a78ee5aa79c0e9e20 /examples/cvector-generator/cvector-generator.cpp
parent      7b2f4a7d193ef2475259bbe7656fcccfab4b1217 (diff)
Add `cvector-generator` example (#7514)
* add control-vector-generator
* calc diff
* add comments
* proof-of-concept stdlib implementation
  Implements PCA and file writing using mostly standard libraries. The output is recognized as a functional control vector, but outputs gibberish.
* param parsing, refactor, comments
  Added basic command-line parameters for outfile and one each positive/negative prompt.
  Refactored some messy code in PCA computation and GGUF exporting.
  Left a bunch of comments regarding further work needed.
* example template completions
  Implements an example template set built from the positive/negative prompts like the control vector Python implementation.
* add multi prompts, multi-thread for PCA
* fix mem error
* add debugs
* fix matrix transpose multiplication
  you have got to be kidding me
* preliminary template/multiprompt support
  model is running out of context and that ought to be fixed (segfaulting), but other than that it looks goodish
* fix zero output & param parsing, functional templating
  fixed a bug where the output file had no tensor data/was all zero
  fixed a bug where single-hyphen flags were not being correctly parsed
  implements creation of templated prompts from input (still need to adapt based on model)
* fix square_diff matmul index range and CRLF->LF line endings
  fixed a logic error where square_diff would not multiply all rows
  fixed a formatting error where the provided completions.txt had CRLF line endings
* add command-line args for num threads, num completions file lines, always reload model
  refactored a few things and did what the commit message says on the tin
* code aestheticization
* fix compiler warnings
* in-series multithreading for prompt embedding?
  added commented-out code to attempt to start implementing multithreading for embedding in main
* remove unnecessary multithreading
* interim fix memory leak
* translated everything but PCA (I think)
* tentatively translate the rest
* fix ggml errors and make new ones
  at least it compiles and runs
* fix cb_eval
* temporary commit while I move dev environments
  it finally outputs a functioning control vector - "functioning" in the sense that it can be loaded and it clearly has the right idea, but makes the model incoherent
* update debug statements
* pre-tokenize so we can allocate correct memory to ctx_diffs_wrapped
* update comments
* (wip) refactor
* clean up PCA ggml implementation
* fix shape of v_diff_original
* add n_batch for pca
* working version
* remember to copy back the last_eigenvector
* fix n_completions
* bring back n_completions
* default n_pca_batch to 20
* fix macos build
* add to makefile all targets
* use ggml_format_name
* add readme
* fix .editorconfig
* use ggml_backend_tensor_copy
* attempt to fix compile problem on mac
* fix compile warning
* reuse allocr
* move param parser to common
* better error handling
* clean up a bit
* add print_usage
* shorten help msg
* beautify help msg
* escape prompt by default
* change compile target to llama-cvector-generator
* typo
* disable GPU for PCA
* code style

---------

Co-authored-by: Christian Zhou-Zheng <christianzhouzheng@gmail.com>
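For context: the control vector produced by this example is applied at inference time by adding a per-layer direction to the hidden state. The sketch below is illustrative only and is not code from this commit; the function name, `scale`, and the shapes are assumptions.

    #include <vector>

    // illustrative sketch: add one control direction to one layer's hidden state;
    // `direction` stands for the n_embd-long vector this tool writes per layer
    static void apply_control_vector(std::vector<float> & hidden,           // [n_embd] hidden state of one layer
                                     const std::vector<float> & direction,  // [n_embd] control direction for that layer
                                     float scale) {                         // user-chosen strength, e.g. 0.8f
        for (size_t i = 0; i < hidden.size() && i < direction.size(); i++) {
            hidden[i] += scale * direction[i];
        }
    }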
Diffstat (limited to 'examples/cvector-generator/cvector-generator.cpp')
-rw-r--r--  examples/cvector-generator/cvector-generator.cpp  499
1 file changed, 499 insertions, 0 deletions
diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp
new file mode 100644
index 00000000..9941683d
--- /dev/null
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -0,0 +1,499 @@
+#include "common.h"
+#include "llama.h"
+#include "ggml.h"
+#include "pca.hpp"
+
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <sstream>
+#include <tuple>
+#include <vector>
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+#include <climits>
+
+
+//////////////////////////////////////////////////
+// utils
+
+template <class Iter>
+static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
+ std::string ret;
+ for (; begin != end; ++begin) {
+ ret += llama_token_to_piece(ctx, *begin);
+ }
+
+ return ret;
+}
+
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);
+
+ printf("\nexample usage:\n");
+ printf("\n CPU only: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
+ printf("\n with GPU: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
+ printf("\n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100\n", argv[0]);
+ printf("\n");
+}
+
+//////////////////////////////////////////////////
+
+
+// cb_eval is reused for each positive/negative prompt pair
+struct callback_data {
+ ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered
+
+ int n_layers = 0;
+ int n_tokens = 0;
+ bool is_eval_pos = true;
+
+    // each element of the vector corresponds to one layer
+ std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens]
+ std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens]
+    std::vector<struct ggml_tensor *> v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows may be different for each layer
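+    // e.g. (illustrative shapes): with n_embd = 4096 and a prompt pair padded to 10 tokens,
+    // v_pos[il] and v_neg[il] are each a [4096, 10] F32 matrix for layer il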
+
+ // save a tensor into either v_pos or v_neg (decided by is_eval_pos)
+ void save_tensor_for_layer(struct ggml_tensor * t) {
+ GGML_ASSERT(t->type == GGML_TYPE_F32);
+
+ if (ctx_ggml == nullptr) {
+ // alloc a new ctx_ggml if needed
+ struct ggml_init_params params_ggml = {
+ /*.mem_size =*/ ggml_tensor_overhead() * n_layers * 3u,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+ ctx_ggml = ggml_init(params_ggml);
+ }
+
+ // copy tensor data
+ auto n_bytes = ggml_nbytes(t);
+ struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
+ t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
+ ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
+ ggml_set_name(t_layer, ggml_get_name(t));
+ //print_debug_tensor(t_layer);
+
+ if (is_eval_pos) {
+ v_pos.push_back(t_layer);
+ } else {
+ v_neg.push_back(t_layer);
+ }
+ }
+
+    // calculate the diff (v_pos - v_neg) and place the result back into v_pos
+    // all-zero rows in the diff tensor will also be removed
+    // NOTE: the final layer is ignored; we only have (n_layers - 1) layers to process
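+    // e.g. if v_pos[il] is [n_embd, 8] and 2 of the 8 diff rows turn out to be all zero,
+    // the corresponding v_diff_filtered[il] will be [n_embd, 6]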
+ std::vector<struct ggml_tensor *> calc_diff() {
+        for (size_t il = 0; il < v_pos.size(); il++) {
+ float * a = (float *) v_pos[il]->data;
+ float * b = (float *) v_neg[il]->data;
+ size_t n_elem = ggml_nelements(v_pos[il]);
+ for (size_t j = 0; j < n_elem; j++) {
+ a[j] -= b[j];
+ }
+            //print_debug_tensor(v_pos[il]);
+ auto diff_filtered = filter_nonzero_rows(v_pos[il]);
+ v_diff_filtered.push_back(diff_filtered);
+ }
+        return v_diff_filtered; // for convenience, we return the result as a std::vector
+ }
+
+ // delete zero rows from a given 2D tensor
+ struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
+ //printf("filter_nonzero_rows\n");
+ auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
+            // check if the given row contains only (near-)zero elements
+ int n_cols = t->ne[0]; // hint: should be equal to n_embd
+ for (int col = 0; col < n_cols; ++col) {
+                const float val = ggml_get_f32_nd(t, col, row, 0, 0);
+                if (val > eps || val < -eps) {
+ return false;
+ }
+ }
+ return true;
+ };
+        std::vector<int> rows_to_copy; // indices of the non-zero rows (to be copied into diff_filtered)
+ for (int i_row = 0; i_row < a->ne[1]; i_row++) {
+ if (!is_row_all_zeros(a, i_row, 1e-6)) {
+ rows_to_copy.push_back(i_row);
+ }
+ }
+
+ // get "n_nonzero_rows" for the output "diff_filtered"
+ int n_nonzero_rows = rows_to_copy.size();
+ //printf("n_nonzero_rows: %d\n", n_nonzero_rows);
+ int n_embd = a->ne[0];
+ GGML_ASSERT(n_nonzero_rows > 0);
+
+ // diff_filtered: [n_embd, n_nonzero_rows]
+ struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
+ ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
+ ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
+ diff_filtered->data = malloc(ggml_nbytes(diff_filtered));
+
+ // copy non-zero rows
+ for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
+ int src_row = rows_to_copy[dest_row];
+ for (int i = 0; i < n_embd; i++) {
+ float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
+ ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
+ }
+ }
+
+ //print_debug_tensor(diff_filtered);
+
+ return diff_filtered;
+ }
+
+    // we don't implement a destructor, because we want to reuse callback_data; reset() just frees the tensor data
+ void reset() {
+ for (auto ptr : v_pos) free(ptr->data);
+ for (auto ptr : v_neg) free(ptr->data);
+ for (auto ptr : v_diff_filtered) free(ptr->data);
+ v_pos.clear();
+ v_neg.clear();
+ v_diff_filtered.clear();
+ if (ctx_ggml) {
+ ggml_free(ctx_ggml);
+ }
+ ctx_ggml = nullptr;
+ }
+};
+
+/**
+ * train_context stores the ggml context used for pre/post-processing the diff vectors
+ * in short, input => v_diff and output => v_final
+ */
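+// e.g. (illustrative numbers) for a model with n_layers = 32: v_diff holds 31 matrices of shape
+// [m, n_embd] (one per non-final layer) and v_final holds 31 vectors of length n_embd,
+// each of which becomes one control direction written to the output gguf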
+struct train_context {
+ ggml_context * ctx_ggml;
+ int n_embd;
+ int n_layers;
+
+    /* pairs of prompts to be used for generating the final vector */
+ std::vector<std::string> positive_entries;
+ std::vector<std::string> negative_entries;
+
+    // each element of the vector corresponds to one layer
+    // NOTE: the last layer is discarded; therefore, we will have (n_layers - 1) elements here
+ // NOTE (2): v_diff is transposed from v_diff_tmp
+ std::vector<struct ggml_tensor *> v_diff; // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
+ std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file
+
+    // to make re-allocation easy when concatenating diffs, we temporarily store v_diff data in a byte vector instead of a tensor
+    // v_diff_tmp will get converted into v_diff later on
+ std::vector<std::vector<uint8_t>> v_diff_tmp;
+
+ train_context(int n_embd_, int n_layers_) {
+ n_embd = n_embd_;
+ n_layers = n_layers_;
+ struct ggml_init_params params_ggml = {
+ /*.mem_size =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+ ctx_ggml = ggml_init(params_ggml);
+ for (int il = 0; il < n_layers - 1; il++) {
+ std::vector<uint8_t> empty;
+ v_diff_tmp.push_back(empty);
+ auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
+ t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible
+ v_final.push_back(t);
+ }
+ }
+
+ // add new rows into existing tensor in v_diff_tmp
+ void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
+ GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
+ for (int il = 0; il < n_layers - 1; il++) {
+ auto t = diff_filtered[il];
+ auto & diff_tmp = v_diff_tmp[il];
+ size_t curr_size = diff_tmp.size();
+ diff_tmp.resize(curr_size + ggml_nbytes(t));
+ memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
+ }
+ }
+
+    // build the v_diff tensors from v_diff_tmp (v_diff needs to be transposed)
+ // TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
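+    // note: each contiguous run of n_embd floats in v_diff_tmp[il] is one row; writing element
+    // (ir, ic) = arr[ir*n_embd + ic] into a [n_rows, n_embd] ggml tensor stores the data transposed,
+    // so that dim 0 indexes rows and dim 1 indexes embedding dimensions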
+ void build_v_diff() {
+ printf("build_v_diff\n");
+ for (int il = 0; il < n_layers - 1; il++) {
+ auto & diff_tmp = v_diff_tmp[il];
+ int n_elem = diff_tmp.size() / sizeof(float);
+ GGML_ASSERT(n_elem % n_embd == 0);
+ int n_rows = n_elem / n_embd;
+ struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
+ ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
+ // copy data & transpose
+ diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
+ float * arr = (float *) diff_tmp.data();
+ for (int ir = 0; ir < n_rows; ++ir) {
+ for (int ic = 0; ic < n_embd; ++ic) {
+ float f = arr[ir*n_embd + ic];
+ ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
+ }
+ }
+ v_diff.push_back(diff);
+ print_debug_tensor(diff);
+ // free memory of diff_tmp
+ diff_tmp.resize(0);
+ }
+ }
+
+ ~train_context() {
+ for (auto ptr : v_final) free(ptr->data);
+ for (auto ptr : v_diff) free(ptr->data);
+ // no need to free v_diff_tmp, since we didn't use malloc
+ ggml_free(ctx_ggml);
+ }
+};
+
+struct tokenized_prompt {
+ std::vector<llama_token> tokens_pos;
+ std::vector<llama_token> tokens_neg;
+ size_t max_seq_len;
+
+ tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
+ const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+ tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
+ tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
+ max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
+ padding_seq(ctx, tokens_pos, max_seq_len);
+ padding_seq(ctx, tokens_neg, max_seq_len);
+ }
+
+ void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
+ // TODO: customize padding token
+ std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false);
+ llama_token pad_tok = pad_tokens.back();
+ while (tokens.size() < len) {
+ tokens.push_back(pad_tok);
+ }
+ }
+};
+
+//////////////////////////////////////////////////
+
+template <typename T>
+static std::string to_string(const T & val) {
+ std::stringstream ss;
+ ss << val;
+ return ss.str();
+}
+
+static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
+ std::vector<std::string> output;
+ std::ifstream file(path);
+ if (!file.is_open()) {
+ fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
+ exit(1);
+ }
+ std::string line;
+ while (std::getline(file, line)) {
+ bool is_skip = skip_empty_lines && line.empty();
+ if (!is_skip) {
+ string_process_escapes(line);
+ output.push_back(line);
+ }
+ }
+ file.close();
+ return output;
+}
+
+//////////////////////////////////////////////////
+
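+// eval callback for the ggml backend scheduler: it is first called with ask == true to let us
+// indicate whether we want to observe a given tensor, and (if we returned true) called again
+// with ask == false once that tensor's data is available to read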
+static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
+ auto * cb_data = (callback_data *) user_data;
+ static const char * l_out_name = "l_out";
+ const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;
+
+ if (ask) {
+ return is_l_out;
+ }
+
+ if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
+ return true;
+ }
+
+ // save the tensor to current context
+ cb_data->save_tensor_for_layer(t);
+ return true;
+}
+
+static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
+ llama_kv_cache_clear(ctx);
+ if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+ fprintf(stderr, "%s : failed to eval\n", __func__);
+ return false;
+ }
+ return true;
+}
+
+static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
+ struct gguf_context * ctx = gguf_init_empty();
+
+ const std::string arch = "controlvector";
+ gguf_set_val_str(ctx, "general.architecture", arch.c_str());
+ gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
+ gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size());
+
+ for (size_t i = 0; i < v_ctrl.size(); ++i) {
+ gguf_add_tensor(ctx, v_ctrl[i]);
+ print_debug_tensor(v_ctrl[i]);
+ printf("Added tensor: %s\n", v_ctrl[i]->name);
+ }
+
+ printf("%s: writing file...\n", __func__);
+ gguf_write_to_file(ctx, fname.c_str(), false);
+ printf("%s: wrote file '%s'\n", __func__, fname.c_str());
+ gguf_free(ctx);
+}
+
+/**
+ * Load prompt files and completion file.
+ * Then format each pair of prompt + completion to make an entry.
+ */
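+// e.g. with the positive prompt "[INST] Act as if you're extremely happy. [/INST]" and the
+// completion line "I feel", one positive entry becomes
+// "[INST] Act as if you're extremely happy. [/INST] I feel" (and likewise for the negative prompt)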
+static int prepare_entries(gpt_params & params, train_context & ctx_train) {
+ // load prompts
+ std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
+ std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
+ if (positive_prompts.size() != negative_prompts.size()) {
+ fprintf(stderr, "number of positive and negative prompts must be equal\n");
+ return 1;
+ }
+ if (positive_prompts.empty()) {
+ fprintf(stderr, "must provide at least one prompt pair\n");
+ return 1;
+ }
+
+ // create templated prompts
+ std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
+ auto format_template = [](std::string persona, std::string suffix) {
+ // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]"
+ return persona + " " + suffix;
+ };
+ for (size_t i = 0; i < positive_prompts.size(); ++i) {
+ for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
+ // TODO replicate the truncations done by the python implementation
+ ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j]));
+ ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j]));
+ }
+ }
+ return 0;
+}
+
+int main(int argc, char ** argv) {
+ gpt_params params;
+
+ if (!gpt_params_parse(argc, argv, params)) {
+ print_usage(argc, argv, params);
+ return 1;
+ }
+
+ if (params.n_pca_iterations % params.n_pca_batch != 0) {
+ fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n");
+ return 1;
+ }
+
+
+ callback_data cb_data;
+
+ // pass the callback to the backend scheduler
+ // it will be executed for each node during the graph computation
+ params.cb_eval = cb_eval;
+ params.cb_eval_user_data = &cb_data;
+ params.warmup = false;
+
+ print_build_info();
+ llama_backend_init();
+ llama_numa_init(params.numa);
+
+ // load the model to get hparams
+ llama_model * model;
+ llama_context * ctx;
+ std::tie(model, ctx) = llama_init_from_gpt_params(params);
+
+ // int n_ctx = llama_n_ctx(ctx);
+ int n_layers = llama_n_layer(model);
+ int n_embd = llama_n_embd(model);
+ // get model hint param (a.k.a model arch name)
+ char model_hint[128];
+ llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
+
+ // init train_context
+ train_context ctx_train(n_embd, n_layers);
+
+ // load and prepare entries for training
+ prepare_entries(params, ctx_train);
+
+    // we have to pre-tokenize everything, because otherwise we don't know how much memory to allocate for ctx_diffs_wrapped
+ std::vector<tokenized_prompt> tokenized_prompts;
+ size_t n_total_tokens = 0;
+ for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
+ tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
+ n_total_tokens += 2 * t.max_seq_len;
+ tokenized_prompts.push_back(std::move(t));
+ }
+
+ std::cout << "n_total_tokens: " << n_total_tokens << std::endl;
+
+    for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
+ bool success = false;
+ tokenized_prompt t = tokenized_prompts[i];
+ cb_data.n_layers = n_layers;
+ cb_data.n_tokens = t.max_seq_len;
+
+ printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
+ (int) i+1, (int) ctx_train.positive_entries.size(),
+ tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
+ tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
+ (int) t.max_seq_len);
+
+ cb_data.is_eval_pos = true;
+ success = get_hidden_layers(ctx, t.tokens_pos);
+ if (!success) break;
+
+ cb_data.is_eval_pos = false;
+ success = get_hidden_layers(ctx, t.tokens_neg);
+ if (!success) break;
+
+ // calculate diff and remove all zero rows
+ auto v_diff_filtered = cb_data.calc_diff();
+
+ // save & concat the filtered v_diff to ctx_train
+ ctx_train.concat_diff_tmp(v_diff_filtered);
+
+ // reset for next iteration
+ cb_data.reset();
+ }
+
+    // done with the model, we can now free it to reclaim some memory
+    printf("Done evaluating prompts, unloading model...\n");
+ llama_free(ctx);
+ llama_free_model(model);
+
+ // prepare ctx_train for PCA
+ ctx_train.build_v_diff();
+
+ // run PCA
+ PCA::pca_params pca_params;
+ pca_params.n_threads = params.n_threads;
+ pca_params.n_batch = params.n_pca_batch;
+ pca_params.n_iterations = params.n_pca_iterations;
+ PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
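+    // at this point each v_final[il] holds the principal direction (length n_embd) extracted
+    // from the diffs of layer il; these are the per-layer control vectors written out below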
+
+ // write output vectors to gguf
+ export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
+
+ llama_backend_free();
+
+ return 0;
+}