diff options
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 40 |
1 file changed, 21 insertions, 19 deletions
@@ -91,6 +91,8 @@ #define LLAMA_ATTRIBUTE_FORMAT(...) #endif +#define LLAMA_MAX_NODES 4096 + // // logging // @@ -3618,7 +3620,7 @@ struct llm_build_context { } struct ggml_cgraph * build_llama() { - struct ggml_cgraph * gf = ggml_new_graph(ctx0); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -3730,7 +3732,7 @@ struct llm_build_context { } struct ggml_cgraph * build_baichuan() { - struct ggml_cgraph * gf = ggml_new_graph(ctx0); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -3850,7 +3852,7 @@ struct llm_build_context { } struct ggml_cgraph * build_falcon() { - struct ggml_cgraph * gf = ggml_new_graph(ctx0); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -3972,7 +3974,7 @@ struct llm_build_context { } struct ggml_cgraph * build_starcoder() { - struct ggml_cgraph * gf = ggml_new_graph(ctx0); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_tensor * cur; struct ggml_tensor * pos; @@ -4071,7 +4073,7 @@ struct llm_build_context { } struct ggml_cgraph * build_persimmon() { - struct ggml_cgraph * gf = ggml_new_graph(ctx0); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); const int64_t n_rot = n_embd_head / 2; @@ -4281,7 +4283,7 @@ struct llm_build_context { } struct ggml_cgraph * build_refact() { - struct ggml_cgraph * gf = ggml_new_graph(ctx0); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -4372,7 +4374,7 @@ struct llm_build_context { } struct ggml_cgraph * build_bloom() { - struct ggml_cgraph * gf = ggml_new_graph(ctx0); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_tensor * cur; struct ggml_tensor * 
inpL; @@ -4466,7 +4468,7 @@ struct llm_build_context { } struct ggml_cgraph * build_mpt() { - struct ggml_cgraph * gf = ggml_new_graph(ctx0); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -8208,7 +8210,7 @@ struct llama_context * llama_new_context_with_model( { static const size_t tensor_alignment = 32; // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data - ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead()); + ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead()); // create measure allocator ctx->alloc = ggml_allocr_new_measure(tensor_alignment); @@ -8597,8 +8599,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat if (kv_buf_size) { const size_t elt_size = ggml_element_size(kv_self.k); - ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); - ggml_cgraph gf{}; + ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); + ggml_cgraph * gf = ggml_new_graph(cpy_ctx); ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer); std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0); @@ -8616,9 +8618,9 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat kv_head, n_embd, n_layer, elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d)); - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); - ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d)); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d)); + ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); ggml_free(cpy_ctx); @@ -8725,8 +8727,8 @@ size_t 
llama_set_state_data(struct llama_context * ctx, uint8_t * src) { const size_t elt_size = ggml_element_size(kv_self.k); - ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); - ggml_cgraph gf{}; + ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); + ggml_cgraph * gf = ggml_new_graph(cpy_ctx); ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer); kin3d->data = (void *) inp; @@ -8744,9 +8746,9 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { kv_head, n_embd, n_layer, elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d)); - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); - ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d)); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d)); + ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); ggml_free(cpy_ctx); } |