diff options
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 128 |
1 files changed, 128 insertions, 0 deletions
@@ -1894,6 +1894,31 @@ struct llama_kv_cache { } }; +struct llama_control_vector { + std::vector<struct ggml_tensor *> tensors; // per layer + std::vector<struct ggml_context *> ctxs; + std::vector<ggml_backend_buffer_t> bufs; + + int32_t layer_start = -1; + int32_t layer_end = -1; + + ggml_tensor * tensor_for(int il) const { + if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) { + return nullptr; + } + return tensors[il]; + } + + ~llama_control_vector() { + for (struct ggml_context * ctx : ctxs) { + ggml_free(ctx); + } + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } + } +}; + struct llama_vocab { using id = int32_t; using token = std::string; @@ -2108,6 +2133,9 @@ struct llama_context { struct ggml_tensor * inp_s_mask; // F32 [1, kv_size] struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch] + // control vectors + struct llama_control_vector cvec; + #ifdef GGML_USE_MPI ggml_mpi_context * ctx_mpi = NULL; #endif @@ -5931,6 +5959,12 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = ggml_add(ctx0, cur, layer_dir); + } cb(cur, "l_out", il); // input for next layer @@ -13366,6 +13400,10 @@ int32_t llama_n_embd(const struct llama_model * model) { return model->hparams.n_embd; } +int32_t llama_n_layer(const struct llama_model * model) { + return model->hparams.n_layer; +} + float llama_rope_freq_scale_train(const struct llama_model * model) { return model->hparams.rope_freq_scale_train; } @@ -13465,6 +13503,96 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const } } +static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) { + GGML_ASSERT(cvec.tensors.empty()); + GGML_ASSERT(cvec.ctxs.empty()); + GGML_ASSERT(cvec.bufs.empty()); + + // count layer buffer types + std::map<ggml_backend_buffer_type_t, int> buft_layer_count; + for (int64_t i = 0; i < model.hparams.n_layer; i++) { + buft_layer_count[model.buft_layer[i].buft]++; + } + + // allocate contexts + std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map; + for (auto & it : buft_layer_count) { + int n_layers = it.second; + struct ggml_init_params params = { + /*.mem_size =*/ n_layers * ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ggml_context * ctx = ggml_init(params); + if (!ctx) { + LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__); + return 1; + } + ctx_map[it.first] = ctx; + } + + // make tensors + cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0 + for (size_t il = 1; il < model.hparams.n_layer; il++) { + struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft); + ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd); + cvec.tensors.push_back(tensor); + } + + // allocate tensors / buffers and zero + for (auto it : ctx_map) { + ggml_backend_buffer_type_t buft = it.first; + ggml_context * ctx = it.second; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__); + return false; + } + ggml_backend_buffer_clear(buf, 0); + cvec.ctxs.push_back(ctx); + cvec.bufs.push_back(buf); + } + + return true; +} + +int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) { + const llama_model & model = lctx->model; + llama_control_vector & cvec = lctx->cvec; + + if (data == nullptr) { + // disable the current control vector (but leave allocated for later) + cvec.layer_start = -1; + cvec.layer_end = -1; + return 0; + } + + if (n_embd != (int) model.hparams.n_embd) { + LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__); + return 1; + } + + if (cvec.tensors.empty()) { + if (!llama_control_vector_init(cvec, model)) { + return 1; + } + } + + cvec.layer_start = il_start; + cvec.layer_end = il_end; + + for (size_t il = 1; il < model.hparams.n_layer; il++) { + assert(cvec.tensors[il] != nullptr); + + const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present + if (off + n_embd <= len) { + ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il])); + } + } + + return 0; +} + struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) { struct llama_kv_cache_view result = { /*.n_cells = */ 0, |