diff options
author | slaren <slarengh@gmail.com> | 2023-07-26 15:56:53 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-07-26 15:56:53 +0200 |
commit | 5488fb789ea5692268309baa76f67598155060be (patch) | |
tree | 0609924b82a47ce97806b9201b75ec49bdf76c31 /llama.cpp | |
parent | eb542d39324574a6778fad9ba9e34ba7a14a82a3 (diff) |
ggml : allocate graphs in a context (#2392)
* ggml : graph allocation in contexts
* allocate work buffer as a ggml_object in ggml_graph_compute_with_ctx
* llama.cpp : allocate graph in the context
* add GGML_PAD
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 30 |
1 files changed, 15 insertions, 15 deletions
@@ -1424,7 +1424,7 @@ static bool llama_eval_internal(
 struct ggml_context * ctx0 = ggml_init(params);
- ggml_cgraph gf = {};
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
 // for big prompts, if BLAS is enabled, it is better to use only one thread
 // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
@@ -1541,8 +1541,8 @@ static bool llama_eval_internal(
 ggml_set_name(v, "v");
 // important: storing RoPE-ed version of K in the KV cache!
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
 }
 struct ggml_tensor * Q =
@@ -1712,21 +1712,21 @@ static bool llama_eval_internal(
 //cur = ggml_soft_max_inplace(ctx0, cur);
 // run the computation
- ggml_build_forward_expand(&gf, cur);
+ ggml_build_forward_expand(gf, cur);
 // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
 #if GGML_USE_MPI
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
+ ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
 #endif
 #ifdef GGML_USE_METAL
 if (lctx.ctx_metal && N == 1) {
 if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
- ggml_metal_graph_find_concurrency(lctx.ctx_metal,&gf);
+ ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
 }
 ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
- ggml_metal_graph_compute(lctx.ctx_metal, &gf);
+ ggml_metal_graph_compute(lctx.ctx_metal, gf);
 ggml_metal_get_tensor (lctx.ctx_metal, cur);
 } else {
 // IMPORTANT:
@@ -1745,34 +1745,34 @@ static bool llama_eval_internal(
 ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
 }
- ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
+ ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
 }
 #else
- ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
+ ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
 #endif
 #if GGML_USE_MPI
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
+ ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
 #endif
 // update kv token count
 lctx.kv_self.n = n_past + N;
- struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
 if (cgraph_fname) {
- ggml_graph_export(&gf, cgraph_fname);
+ ggml_graph_export(gf, cgraph_fname);
 }
 #ifdef GGML_PERF
 // print timing information per ggml operation (for debugging purposes)
 // requires GGML_PERF to be defined
- ggml_graph_print(&gf);
+ ggml_graph_print(gf);
 #endif
 // plot the computation graph in dot format (for debugging purposes)
 //if (n_past%100 == 0) {
- // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
+ // ggml_graph_dump_dot(gf, NULL, "llama.dot");
 //}
 // extract logits
@@ -3177,7 +3177,7 @@ struct llama_context * llama_new_context_with_model(
 ctx->embedding.resize(hparams.n_embd);
 }
- ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+ ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
 ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
 ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));