From 1d656d6360359cfdaaf5d64ed9690047b600dbcb Mon Sep 17 00:00:00 2001
From: Qingyou Meng
Date: Sat, 8 Jul 2023 00:24:01 +0800
Subject: ggml : change ggml_graph_compute() API to not require context (#1999)

* ggml_graph_compute: deprecate using ggml_context, try to resolve issue #287

* rewrite: no longer consider backward compatibility; plan and make_plan

* minor: rename ctx as plan; const

* remove ggml_graph_compute from tests/test-grad0.c, but current change breaks backward

* add static ggml_graph_compute_sugar()

* minor: update comments

* reusable buffers

* ggml : more consistent naming + metal fixes

* ggml : fix docs

* tests : disable grad / opt + minor naming changes

* ggml : add ggml_graph_compute_with_ctx()

- backwards compatible API
- deduplicates a lot of copy-paste

* ci : enable test-grad0

* examples : factor out plan allocation into a helper function

* llama : factor out plan stuff into a helper function

* ci : fix env

* llama : fix duplicate symbols + refactor example benchmark

* ggml : remove obsolete assert + refactor n_tasks section

* ggml : fix indentation in switch

* llama : avoid unnecessary bool

* ggml : remove comments from source file and match order in header

---------

Co-authored-by: Georgi Gerganov
---
 llama.cpp | 54 +++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 39 insertions(+), 15 deletions(-)

(limited to 'llama.cpp')

diff --git a/llama.cpp b/llama.cpp
index 02afdeb1..ee6ec092 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -79,6 +79,25 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
+//
+// ggml helpers
+//
+
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
+//
+// memory sizes
+//
+
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -321,6 +340,9 @@ struct llama_context {
     // input embedding (1-dimensional array: [n_embd])
     std::vector<float> embedding;
 
+    // reusable buffer for `struct ggml_graph_plan.work_data`
+    std::vector<uint8_t> work_buffer;
+
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;
@@ -758,7 +780,6 @@ struct llama_model_loader {
 
 };
 
-
 //
 // kv cache
 //
@@ -1265,7 +1286,7 @@ static bool llama_eval_internal(
           const float * embd,
             const int   n_tokens,
             const int   n_past,
-            const int   n_threads,
+                  int   n_threads,
            const char * cgraph_fname) {
 
     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
@@ -1306,10 +1327,11 @@ static bool llama_eval_internal(
 
     struct ggml_context * ctx0 = ggml_init(params);
 
+    ggml_cgraph gf = {};
+
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
@@ -1593,6 +1615,7 @@ static bool llama_eval_internal(
 
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
+        ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, &gf);
         ggml_metal_get_tensor   (lctx.ctx_metal, cur);
     } else {
@@ -1612,10 +1635,10 @@ static bool llama_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
         }
 
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
     }
 #else
-    ggml_graph_compute(ctx0, &gf);
+    ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
 #endif
 
     if (cgraph_fname) {
@@ -2575,8 +2598,8 @@ void llama_free_model(struct llama_model * model) {
 }
 
 struct llama_context * llama_new_context_with_model(
-                 struct llama_model * model,
-                 struct llama_context_params params) {
+        struct llama_model * model,
+        struct llama_context_params params) {
 
     if (!model) {
         return nullptr;
@@ -2645,7 +2668,7 @@ struct llama_context * llama_new_context_with_model(
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        ctx->ctx_metal = ggml_metal_init();
+        ctx->ctx_metal = ggml_metal_init(1);
 
         void * data_ptr  = NULL;
         size_t data_size = 0;
@@ -2802,6 +2825,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     // read tensors and apply
     bool warned = false;
     int n_tensors = 0;
+
+    std::vector<uint8_t> work_buffer;
+
     while (true) {
         int32_t n_dims;
         int32_t length;
@@ -2966,8 +2992,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         }
 
         struct ggml_cgraph gf = ggml_build_forward(r);
-        gf.n_threads = n_threads;
-        ggml_graph_compute(lora_ctx, &gf);
+
+        ggml_graph_compute_helper(work_buffer, &gf, n_threads);
 
         // we won't need these tensors again, reset the context to save memory
         ggml_free(lora_ctx);
@@ -3120,7 +3146,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
         ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;
 
         ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kout3d->data = out;
@@ -3140,7 +3165,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
 
-        ggml_graph_compute(cpy_ctx, &gf);
+        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
         ggml_free(cpy_ctx);
     }
@@ -3226,7 +3251,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;
 
         ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kin3d->data = (void *) inp;
@@ -3246,7 +3270,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
 
-        ggml_graph_compute(cpy_ctx, &gf);
+        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
         ggml_free(cpy_ctx);
     }
-- 
cgit v1.2.3
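
For readers unfamiliar with the new API, the following is a minimal sketch (not part of the patch) of how a caller drives the plan-based ggml_graph_compute() that this commit switches llama.cpp to, mirroring the ggml_graph_compute_helper() added above. The toy graph, thread count, and buffer handling are illustrative assumptions, not code from the commit.

// example_plan_api.cpp -- illustrative sketch only; assumes the post-#1999 ggml API
// (ggml_graph_plan() + ggml_graph_compute(graph, plan))
#include "ggml.h"

#include <cstdio>
#include <vector>

int main() {
    // small context for tensor metadata and data
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16u*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // toy graph: c = a + b
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    ggml_set_f32(a, 1.0f);
    ggml_set_f32(b, 2.0f);

    struct ggml_cgraph gf = ggml_build_forward(c);

    // new API: ask ggml how much work memory the graph needs for the chosen
    // thread count, hand it a caller-owned buffer, then compute -- the
    // ggml_context is no longer involved in graph execution
    const int n_threads = 4;
    struct ggml_cplan plan = ggml_graph_plan(&gf, n_threads);

    std::vector<uint8_t> work_buffer;
    if (plan.work_size > 0) {
        work_buffer.resize(plan.work_size);
        plan.work_data = work_buffer.data();
    }

    ggml_graph_compute(&gf, &plan);

    printf("c[0] = %f\n", ggml_get_f32_1d(c, 0)); // expected: 3.000000

    ggml_free(ctx);
    return 0;
}

Callers that do not want to manage the work buffer themselves can instead use the backwards-compatible ggml_graph_compute_with_ctx(ctx, &gf, n_threads) mentioned in the commit message, which allocates the work data inside the ggml_context; the reusable work_buffer approach above is what llama.cpp itself adopts in this patch to avoid reallocating the buffer on every eval.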