Diffstat (limited to 'llama.cpp')
-rw-r--r--  llama.cpp  128
1 file changed, 128 insertions, 0 deletions
diff --git a/llama.cpp b/llama.cpp
index fc5dd5cb..52bd718b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1894,6 +1894,31 @@ struct llama_kv_cache {
}
};
+struct llama_control_vector {
+ std::vector<struct ggml_tensor *> tensors; // per layer
+ std::vector<struct ggml_context *> ctxs;
+ std::vector<ggml_backend_buffer_t> bufs;
+
+ int32_t layer_start = -1;
+ int32_t layer_end = -1;
+
+ ggml_tensor * tensor_for(int il) const {
+ if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
+ return nullptr;
+ }
+ return tensors[il];
+ }
+
+ ~llama_control_vector() {
+ for (struct ggml_context * ctx : ctxs) {
+ ggml_free(ctx);
+ }
+ for (ggml_backend_buffer_t buf : bufs) {
+ ggml_backend_buffer_free(buf);
+ }
+ }
+};
+
struct llama_vocab {
using id = int32_t;
using token = std::string;
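A quick note on the struct introduced above: `tensor_for` is the only lookup the graph build uses, and it is deliberately forgiving. The following is an illustration of its contract with hypothetical state, not code taken from the patch:

    // Suppose a later init/apply left the control vector in this state:
    //   cvec.tensors     = { nullptr, t1, t2, t3 }   // layer 0 never gets a tensor
    //   cvec.layer_start = 1;
    //   cvec.layer_end   = 2;
    //
    // then:
    //   cvec.tensor_for(0)  == nullptr   // below layer_start
    //   cvec.tensor_for(2)  == t2        // inside the active window
    //   cvec.tensor_for(3)  == nullptr   // past layer_end
    //   cvec.tensor_for(-1) == nullptr   // negative layer indices are rejected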
@@ -2108,6 +2133,9 @@ struct llama_context {
struct ggml_tensor * inp_s_mask; // F32 [1, kv_size]
struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch]
+ // control vectors
+ struct llama_control_vector cvec;
+
#ifdef GGML_USE_MPI
ggml_mpi_context * ctx_mpi = NULL;
#endif
@@ -5931,6 +5959,12 @@ struct llm_build_context {
}
cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+ if (layer_dir != nullptr) {
+ cur = ggml_add(ctx0, cur, layer_dir);
+ }
cb(cur, "l_out", il);
// input for next layer
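For context on the hunk above: at this point `cur` holds the layer's output activations of shape [n_embd, n_tokens], while each control-vector tensor is a single direction of shape [n_embd], so the `ggml_add` broadcasts the same bias onto every token position of the selected layers. A minimal standalone sketch of that broadcast, using illustrative shape variables rather than names from the patch:

    // cur       : [n_embd, n_tokens]  activations for one layer
    // layer_dir : [n_embd]            one control direction
    // ggml_add broadcasts layer_dir across the token dimension, so every
    // column of cur is shifted by the same vector.
    struct ggml_tensor * cur       = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
    struct ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd);
    cur = ggml_add(ctx0, cur, layer_dir);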
@@ -13366,6 +13400,10 @@ int32_t llama_n_embd(const struct llama_model * model) {
return model->hparams.n_embd;
}
+int32_t llama_n_layer(const struct llama_model * model) {
+ return model->hparams.n_layer;
+}
+
float llama_rope_freq_scale_train(const struct llama_model * model) {
return model->hparams.rope_freq_scale_train;
}
@@ -13465,6 +13503,96 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
}
}
+static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+ GGML_ASSERT(cvec.tensors.empty());
+ GGML_ASSERT(cvec.ctxs.empty());
+ GGML_ASSERT(cvec.bufs.empty());
+
+ // count layer buffer types
+ std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
+ for (int64_t i = 0; i < model.hparams.n_layer; i++) {
+ buft_layer_count[model.buft_layer[i].buft]++;
+ }
+
+ // allocate contexts
+ std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+ for (auto & it : buft_layer_count) {
+ int n_layers = it.second;
+ struct ggml_init_params params = {
+ /*.mem_size =*/ n_layers * ggml_tensor_overhead(),
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+ ggml_context * ctx = ggml_init(params);
+ if (!ctx) {
+ LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
+ return false;
+ }
+ ctx_map[it.first] = ctx;
+ }
+
+ // make tensors
+ cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+ for (size_t il = 1; il < model.hparams.n_layer; il++) {
+ struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
+ ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
+ cvec.tensors.push_back(tensor);
+ }
+
+ // allocate tensors / buffers and zero
+ for (auto it : ctx_map) {
+ ggml_backend_buffer_type_t buft = it.first;
+ ggml_context * ctx = it.second;
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+ if (!buf) {
+ LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
+ return false;
+ }
+ ggml_backend_buffer_clear(buf, 0);
+ cvec.ctxs.push_back(ctx);
+ cvec.bufs.push_back(buf);
+ }
+
+ return true;
+}
+
+int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
+ const llama_model & model = lctx->model;
+ llama_control_vector & cvec = lctx->cvec;
+
+ if (data == nullptr) {
+ // disable the current control vector (but leave allocated for later)
+ cvec.layer_start = -1;
+ cvec.layer_end = -1;
+ return 0;
+ }
+
+ if (n_embd != (int) model.hparams.n_embd) {
+ LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
+ return 1;
+ }
+
+ if (cvec.tensors.empty()) {
+ if (!llama_control_vector_init(cvec, model)) {
+ return 1;
+ }
+ }
+
+ cvec.layer_start = il_start;
+ cvec.layer_end = il_end;
+
+ for (size_t il = 1; il < model.hparams.n_layer; il++) {
+ assert(cvec.tensors[il] != nullptr);
+
+ const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
+ if (off + n_embd <= len) {
+ ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
+ }
+ }
+
+ return 0;
+}
+
struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
struct llama_kv_cache_view result = {
/*.n_cells = */ 0,
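Taken together, the new public surface is `llama_n_layer` plus `llama_control_vector_apply` (this file only shows the definitions; the matching declarations are assumed to live in llama.h as part of the same change). Below is a usage sketch from the application side, with illustrative names; the expected data layout follows the code above: one n_embd-sized direction per layer, concatenated starting at layer 1, since layer 0 never gets a tensor.

    #include <vector>
    #include "llama.h"

    static void apply_control_vector_example(struct llama_context * ctx, const struct llama_model * model) {
        const int32_t n_embd  = llama_n_embd(model);
        const int32_t n_layer = llama_n_layer(model);

        // one direction per layer 1 .. n_layer-1, flattened in layer order
        std::vector<float> dirs((size_t) n_embd * (n_layer - 1), 0.0f);
        // ... fill `dirs` with the control directions (e.g. loaded from disk) ...

        // apply to every layer that can carry a direction; returns 0 on success
        if (llama_control_vector_apply(ctx, dirs.data(), dirs.size(), n_embd,
                                       /*il_start=*/ 1, /*il_end=*/ n_layer - 1)) {
            // handle error (n_embd mismatch or failed allocation)
        }

        // passing nullptr disables the control vector again, but the buffers
        // stay allocated so a later apply is cheap
        llama_control_vector_apply(ctx, nullptr, 0, n_embd, 0, 0);
    }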