diff options
author | Xuan Son Nguyen <thichthat@gmail.com> | 2024-06-15 18:53:40 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-06-15 18:53:40 +0200 |
commit | 0c7b3595b9e5ad2355818e259f06b0dc3f0065b3 (patch) | |
tree | 1146ce43d46ad84568728a0a78ee5aa79c0e9e20 /examples/cvector-generator/cvector-generator.cpp | |
parent | 7b2f4a7d193ef2475259bbe7656fcccfab4b1217 (diff) |
Add `cvector-generator` example (#7514)
* add control-vector-generator
* calc diff
* add comments
* proof-of-concept stdlib implementation
Implements PCA and file writing using mostly standard libraries. The output is recognized as a functional control vector, but outputs gibberish.
* param parsing, refactor, comments
Added basic command-line parameters for outfile and one each positive/negative prompt.
Refactored some messy code in PCA computation and GGUF exporting.
Left a bunch of comments regarding further work needed.
* example template completions
Implements an example template set built from the positive/negative prompts like the control vector Python implementation.
* add multi prompts, multi-thread for PCA
* fix mem error
* add debugs
* fix matrix transpose multiplication
you have got to be kidding me
* preliminary template/multiprompt support
model is running out of context and that ought to be fixed (segfaulting) but other than that it looks goodish
* fix zero output & param parsing, functional templating
fixed a bug where the output file had no tensor data/was all zero
fixed a bug where single hyphen flags were not being correctly parsed
implements creation of templated prompts from input (still need to adapt based on model)
* fix square_diff matmul index range and CRLF->LF line endings
fixed a logic error where square_diff would not multiply all rows
fixed a formatting error where the provided completions.txt had CRLF line endings
* add command-line args for num threads, num completions file lines, always reload model
refactored a few things and did what the commit message says on the tin
* code aestheticization
* fix compiler warnings
* in-series multithreading for prompt embedding?
added commented-out code to attempt to start implementing multithreading for embedding in main
* remove unnecessary multithreading
* interim fix memory leak
* translated everything but PCA (I think)
* tentatively translate the rest
* fix ggml errors and make new ones
at least it compiles and runs
* fix cb_eval
* temporary commit while I move dev environments
it finally outputs a functioning control vector - "functioning" in the sense that it can be loaded and it clearly has the right idea, but makes the model incoherent
* update debug statements
* pre-tokenize so we can allocate correct memory to ctx_diffs_wrapped
* update comments
* (wip) refactor
* clean up PCA ggml implementation
* fix shape of v_diff_original
* add n_batch for pca
* working version
* remember to copy back the last_eigenvector
* fix n_completions
* bring back n_completions
* default n_pca_batch to 20
* fix macos build
* add to makefile all targets
* use ggml_format_name
* add readme
* fix .editorconfig
* use ggml_backend_tensor_copy
* attempt to fix compile problem on mac
* fix compile warn
* reuse allocr
* move param parser to common
* better error handling
* clean up a bit
* add print_usage
* shorten help msg
* beautify help msg
* escape prompt by default
* change compile target to llama-cvector-generator
* typo
* disable GPU for PCA
* code style
---------
Co-authored-by: Christian Zhou-Zheng <christianzhouzheng@gmail.com>
Diffstat (limited to 'examples/cvector-generator/cvector-generator.cpp')
-rw-r--r-- | examples/cvector-generator/cvector-generator.cpp | 499 |
1 files changed, 499 insertions, 0 deletions
diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp new file mode 100644 index 00000000..9941683d --- /dev/null +++ b/examples/cvector-generator/cvector-generator.cpp @@ -0,0 +1,499 @@ +#include "common.h" +#include "llama.h" +#include "ggml.h" +#include "pca.hpp" + +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + +#include <cstdio> +#include <string> +#include <tuple> +#include <vector> +#include <algorithm> +#include <iostream> +#include <fstream> +#include <climits> + + +////////////////////////////////////////////////// +// utils + +template <class Iter> +static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { + std::string ret; + for (; begin != end; ++begin) { + ret += llama_token_to_piece(ctx, *begin); + } + + return ret; +} + +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); + + printf("\nexample usage:\n"); + printf("\n CPU only: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]); + printf("\n with GPU: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]); + printf("\n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100\n", argv[0]); + printf("\n"); +} + +////////////////////////////////////////////////// + + +// cb_eval is reused for each pair of positive - negative prompt +struct callback_data { + ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered + + int n_layers = 0; + int n_tokens = 0; + bool is_eval_pos = true; + + // each element of the vector correspond to one layer + std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens] + std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens] + std::vector<struct ggml_tensor *> v_diff_filtered; // vector of matrices of size 
[n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows maybe different for each layer + + // save a tensor into either v_pos or v_neg (decided by is_eval_pos) + void save_tensor_for_layer(struct ggml_tensor * t) { + GGML_ASSERT(t->type == GGML_TYPE_F32); + + if (ctx_ggml == nullptr) { + // alloc a new ctx_ggml if needed + struct ggml_init_params params_ggml = { + /*.mem_size =*/ ggml_tensor_overhead() * n_layers * 3u, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ctx_ggml = ggml_init(params_ggml); + } + + // copy tensor data + auto n_bytes = ggml_nbytes(t); + struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]); + t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow + ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes); + ggml_set_name(t_layer, ggml_get_name(t)); + //print_debug_tensor(t_layer); + + if (is_eval_pos) { + v_pos.push_back(t_layer); + } else { + v_neg.push_back(t_layer); + } + } + + // calculate diff (v_pos - v_neg) and place the result back to v_pos + // all zero rows in the diff tensor will also be removed + // NOTE: final layer is ignored. 
we only have (n_layers - 1) to process + std::vector<struct ggml_tensor *> calc_diff() { + for (float il = 0; il < v_pos.size(); il++) { + float * a = (float *) v_pos[il]->data; + float * b = (float *) v_neg[il]->data; + size_t n_elem = ggml_nelements(v_pos[il]); + for (size_t j = 0; j < n_elem; j++) { + a[j] -= b[j]; + } + //print_debug_tensor(v_pos[i]); + auto diff_filtered = filter_nonzero_rows(v_pos[il]); + v_diff_filtered.push_back(diff_filtered); + } + return v_diff_filtered; // for convinient, we return the result std::vector + } + + // delete zero rows from a given 2D tensor + struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) { + //printf("filter_nonzero_rows\n"); + auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool { + // check if given row containing all zero elements + int n_cols = t->ne[0]; // hint: should be equal to n_embd + for (int col = 0; col < n_cols; ++col) { + if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) { + return false; + } + } + return true; + }; + std::vector<int> rows_to_copy; // the idx of non-zero cols (to be copied to row of diff_filtered) + for (int i_row = 0; i_row < a->ne[1]; i_row++) { + if (!is_row_all_zeros(a, i_row, 1e-6)) { + rows_to_copy.push_back(i_row); + } + } + + // get "n_nonzero_rows" for the output "diff_filtered" + int n_nonzero_rows = rows_to_copy.size(); + //printf("n_nonzero_rows: %d\n", n_nonzero_rows); + int n_embd = a->ne[0]; + GGML_ASSERT(n_nonzero_rows > 0); + + // diff_filtered: [n_embd, n_nonzero_rows] + struct ggml_tensor * diff_filtered = ggml_new_tensor_2d( + ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows); + ggml_format_name(diff_filtered, "diff_filtered_%s", a->name); + diff_filtered->data = malloc(ggml_nbytes(diff_filtered)); + + // copy non-zero rows + for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) { + int src_row = rows_to_copy[dest_row]; + for (int i = 0; i < n_embd; i++) { + float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0); + 
ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem); + } + } + + //print_debug_tensor(diff_filtered); + + return diff_filtered; + } + + // we don't implement destructor, because we want to reuse callback_data. we just want to free the tensors + void reset() { + for (auto ptr : v_pos) free(ptr->data); + for (auto ptr : v_neg) free(ptr->data); + for (auto ptr : v_diff_filtered) free(ptr->data); + v_pos.clear(); + v_neg.clear(); + v_diff_filtered.clear(); + if (ctx_ggml) { + ggml_free(ctx_ggml); + } + ctx_ggml = nullptr; + } +}; + +/** + * process_ctx is used to store the ggml context for pre-post processing the diff vectors + * in short, input => v_diff and output => v_final + */ +struct train_context { + ggml_context * ctx_ggml; + int n_embd; + int n_layers; + + /* pair of prompts to be used for generating final vector */ + std::vector<std::string> positive_entries; + std::vector<std::string> negative_entries; + + // each element of the vector correspond to one layer + // NOTE: the last layer is discard. 
therefore, we will have (n_layers - 1) elements here + // NOTE (2): v_diff is transposed from v_diff_tmp + std::vector<struct ggml_tensor *> v_diff; // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows) + std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file + + // to easily re-alloc when concat v_diff, we temporary store v_diff in a vector instead of a tensor + // v_diff_tmp will get converted unto v_diff later on + std::vector<std::vector<uint8_t>> v_diff_tmp; + + train_context(int n_embd_, int n_layers_) { + n_embd = n_embd_; + n_layers = n_layers_; + struct ggml_init_params params_ggml = { + /*.mem_size =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ctx_ggml = ggml_init(params_ggml); + for (int il = 0; il < n_layers - 1; il++) { + std::vector<uint8_t> empty; + v_diff_tmp.push_back(empty); + auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd); + t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible + v_final.push_back(t); + } + } + + // add new rows into existing tensor in v_diff_tmp + void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) { + GGML_ASSERT((int) diff_filtered.size() == n_layers - 1); + for (int il = 0; il < n_layers - 1; il++) { + auto t = diff_filtered[il]; + auto & diff_tmp = v_diff_tmp[il]; + size_t curr_size = diff_tmp.size(); + diff_tmp.resize(curr_size + ggml_nbytes(t)); + memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t)); + } + } + + // build the v_diff tensors from v_diff_tmp (v_diff need to be transposed) + // TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method + void build_v_diff() { + printf("build_v_diff\n"); + for (int il = 0; il < n_layers - 1; il++) { + auto & diff_tmp = v_diff_tmp[il]; + int n_elem = diff_tmp.size() / sizeof(float); + GGML_ASSERT(n_elem % n_embd == 0); + 
int n_rows = n_elem / n_embd; + struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd); + ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str()); + // copy data & transpose + diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible + float * arr = (float *) diff_tmp.data(); + for (int ir = 0; ir < n_rows; ++ir) { + for (int ic = 0; ic < n_embd; ++ic) { + float f = arr[ir*n_embd + ic]; + ggml_set_f32_nd(diff, ir, ic, 0, 0, f); + } + } + v_diff.push_back(diff); + print_debug_tensor(diff); + // free memory of diff_tmp + diff_tmp.resize(0); + } + } + + ~train_context() { + for (auto ptr : v_final) free(ptr->data); + for (auto ptr : v_diff) free(ptr->data); + // no need to free v_diff_tmp, since we didn't use malloc + ggml_free(ctx_ggml); + } +}; + +struct tokenized_prompt { + std::vector<llama_token> tokens_pos; + std::vector<llama_token> tokens_neg; + size_t max_seq_len; + + tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) { + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + tokens_pos = ::llama_tokenize(ctx, pos, add_bos); + tokens_neg = ::llama_tokenize(ctx, neg, add_bos); + max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); + padding_seq(ctx, tokens_pos, max_seq_len); + padding_seq(ctx, tokens_neg, max_seq_len); + } + + void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) { + // TODO: customize padding token + std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false); + llama_token pad_tok = pad_tokens.back(); + while (tokens.size() < len) { + tokens.push_back(pad_tok); + } + } +}; + +////////////////////////////////////////////////// + +template <typename T> +static std::string to_string(const T & val) { + std::stringstream ss; + ss << val; + return ss.str(); +} + +static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) { + 
std::vector<std::string> output; + std::ifstream file(path); + if (!file.is_open()) { + fprintf(stderr, "error: unable to open file: %s\n", path.c_str()); + exit(1); + } + std::string line; + while (std::getline(file, line)) { + bool is_skip = skip_empty_lines && line.empty(); + if (!is_skip) { + string_process_escapes(line); + output.push_back(line); + } + } + file.close(); + return output; +} + +////////////////////////////////////////////////// + +static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { + auto * cb_data = (callback_data *) user_data; + static const char * l_out_name = "l_out"; + const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0; + + if (ask) { + return is_l_out; + } + + if (!is_l_out || t->ne[1] != cb_data->n_tokens) { + return true; + } + + // save the tensor to current context + cb_data->save_tensor_for_layer(t); + return true; +} + +static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) { + llama_kv_cache_clear(ctx); + if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + return true; +} + +static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) { + struct gguf_context * ctx = gguf_init_empty(); + + const std::string arch = "controlvector"; + gguf_set_val_str(ctx, "general.architecture", arch.c_str()); + gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str()); + gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size()); + + for (size_t i = 0; i < v_ctrl.size(); ++i) { + gguf_add_tensor(ctx, v_ctrl[i]); + print_debug_tensor(v_ctrl[i]); + printf("Added tensor: %s\n", v_ctrl[i]->name); + } + + printf("%s: writing file...\n", __func__); + gguf_write_to_file(ctx, fname.c_str(), false); + printf("%s: wrote file '%s'\n", __func__, fname.c_str()); + gguf_free(ctx); +} + +/** 
+ * Load prompt files and completion file. + * Then format each pair of prompt + completion to make an entry. + */ +static int prepare_entries(gpt_params & params, train_context & ctx_train) { + // load prompts + std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true); + std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true); + if (positive_prompts.size() != negative_prompts.size()) { + fprintf(stderr, "number of positive and negative prompts must be equal\n"); + return 1; + } + if (positive_prompts.empty()) { + fprintf(stderr, "must provide at least one prompt pair\n"); + return 1; + } + + // create templated prompts + std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false); + auto format_template = [](std::string persona, std::string suffix) { + // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. 
[/INST]" + return persona + " " + suffix; + }; + for (size_t i = 0; i < positive_prompts.size(); ++i) { + for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) { + // TODO replicate the truncations done by the python implementation + ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j])); + ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j])); + } + } + return 0; +} + +int main(int argc, char ** argv) { + gpt_params params; + + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); + return 1; + } + + if (params.n_pca_iterations % params.n_pca_batch != 0) { + fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n"); + return 1; + } + + + callback_data cb_data; + + // pass the callback to the backend scheduler + // it will be executed for each node during the graph computation + params.cb_eval = cb_eval; + params.cb_eval_user_data = &cb_data; + params.warmup = false; + + print_build_info(); + llama_backend_init(); + llama_numa_init(params.numa); + + // load the model to get hparams + llama_model * model; + llama_context * ctx; + std::tie(model, ctx) = llama_init_from_gpt_params(params); + + // int n_ctx = llama_n_ctx(ctx); + int n_layers = llama_n_layer(model); + int n_embd = llama_n_embd(model); + // get model hint param (a.k.a model arch name) + char model_hint[128]; + llama_model_meta_val_str(model, "general.architecture", model_hint, 128); + + // init train_context + train_context ctx_train(n_embd, n_layers); + + // load and prepare entries for training + prepare_entries(params, ctx_train); + + // we have to pretokenize everything because otherwise we don't know how much overhead to allocate ctx_diffs_wrapped + std::vector<tokenized_prompt> tokenized_prompts; + size_t n_total_tokens = 0; + for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) { + tokenized_prompt t(ctx, ctx_train.positive_entries[i], 
ctx_train.negative_entries[i]); + n_total_tokens += 2 * t.max_seq_len; + tokenized_prompts.push_back(std::move(t)); + } + + std::cout << "n_total_tokens: " << n_total_tokens << std::endl; + + for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) { + bool success = false; + tokenized_prompt t = tokenized_prompts[i]; + cb_data.n_layers = n_layers; + cb_data.n_tokens = t.max_seq_len; + + printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n", + (int) i+1, (int) ctx_train.positive_entries.size(), + tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(), + tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(), + (int) t.max_seq_len); + + cb_data.is_eval_pos = true; + success = get_hidden_layers(ctx, t.tokens_pos); + if (!success) break; + + cb_data.is_eval_pos = false; + success = get_hidden_layers(ctx, t.tokens_neg); + if (!success) break; + + // calculate diff and remove all zero rows + auto v_diff_filtered = cb_data.calc_diff(); + + // save & concat the filtered v_diff to ctx_train + ctx_train.concat_diff_tmp(v_diff_filtered); + + // reset for next iteration + cb_data.reset(); + } + + // done with the model, we can now free it to make gain some memory + printf("Done evaluate prompts, unload model...\n"); + llama_free(ctx); + llama_free_model(model); + + // prepare ctx_train for PCA + ctx_train.build_v_diff(); + + // run PCA + PCA::pca_params pca_params; + pca_params.n_threads = params.n_threads; + pca_params.n_batch = params.n_pca_batch; + pca_params.n_iterations = params.n_pca_iterations; + PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final); + + // write output vectors to gguf + export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint); + + llama_backend_free(); + + return 0; +} |