Diffstat (limited to 'llama.cpp')
 llama.cpp | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index a5f3876c..76ee4ea2 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -91,6 +91,8 @@
#define LLAMA_ATTRIBUTE_FORMAT(...)
#endif
+#define LLAMA_MAX_NODES 4096
+
//
// logging
//
@@ -3618,7 +3620,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_llama() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
GGML_ASSERT(n_embd_head == hparams.n_rot);
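This same one-line change repeats below for every build_* entry point (baichuan, falcon, starcoder, persimmon, refact, bloom, mpt). ggml_new_graph(ctx) is shorthand for ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false), so the switch only raises the node capacity of the graph to LLAMA_MAX_NODES; the final argument still disables gradient storage. A minimal standalone sketch of the call, assuming only ggml.h from this tree (the context size here is illustrative):

    #include "ggml.h"

    #define LLAMA_MAX_NODES 4096

    int main(void) {
        // metadata-only context: holds tensor/graph structs, no tensor data
        struct ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        struct ggml_context * ctx0 = ggml_init(params);

        // room for up to LLAMA_MAX_NODES nodes, no gradient bookkeeping
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
        (void) gf; // a real build_*() would add nodes via ggml_build_forward_expand()

        ggml_free(ctx0);
        return 0;
    }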
@@ -3730,7 +3732,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_baichuan() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -3850,7 +3852,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_falcon() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -3972,7 +3974,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_starcoder() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_tensor * cur;
struct ggml_tensor * pos;
@@ -4071,7 +4073,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_persimmon() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_rot = n_embd_head / 2;
@@ -4281,7 +4283,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_refact() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -4372,7 +4374,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_bloom() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -4466,7 +4468,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_mpt() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -8208,7 +8210,7 @@ struct llama_context * llama_new_context_with_model(
{
static const size_t tensor_alignment = 32;
// the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
- ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+ ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
// create measure allocator
ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
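The multiplier change here keeps the compute buffer in sync with the new graph capacity: the buffer stores only metadata, one ggml_tensor_overhead()-sized record per tensor created during graph build plus ggml_graph_overhead() for the graph structure, and it was previously sized with ggml's internal GGML_MAX_NODES rather than a limit llama.cpp controls. A quick standalone check of the arithmetic (not code from this patch; exact byte counts depend on the ggml build):

    #include <stdio.h>
    #include "ggml.h"

    #define LLAMA_MAX_NODES 4096

    int main(void) {
        printf("per-tensor overhead : %zu bytes\n", ggml_tensor_overhead());
        printf("graph overhead      : %zu bytes\n", ggml_graph_overhead());
        printf("compute buffer size : %zu bytes\n",
               ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
        return 0;
    }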
@@ -8597,8 +8599,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
if (kv_buf_size) {
const size_t elt_size = ggml_element_size(kv_self.k);
- ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
- ggml_cgraph gf{};
+ ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+ ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
@@ -8616,9 +8618,9 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
kv_head, n_embd, n_layer,
elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
- ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
+ ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
ggml_free(cpy_ctx);
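Two fixes land in this state-serialization path. The context budget is now exact instead of a guessed 4096 bytes: the copy creates six tensor objects (kout3d, vout3d, k3d, v3d, and one result tensor per ggml_cpy call), hence 6*ggml_tensor_overhead(), plus ggml_graph_overhead() for the graph. And the graph itself moves off the stack into cpy_ctx via ggml_new_graph. The same pattern in miniature (tensor names and sizes here are illustrative, not from the patch):

    #include "ggml.h"

    int main(void) {
        float src_data[16], dst_data[16];
        for (int i = 0; i < 16; i++) { src_data[i] = (float) i; dst_data[i] = 0.0f; }

        // exact budget: src + dst + the tensor created by ggml_cpy, plus the graph
        struct ggml_init_params params = {
            /*.mem_size   =*/ 3*ggml_tensor_overhead() + ggml_graph_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true, // tensor data lives outside the context
        };
        struct ggml_context * cpy_ctx = ggml_init(params);
        struct ggml_cgraph  * gf      = ggml_new_graph(cpy_ctx);

        struct ggml_tensor * src = ggml_new_tensor_1d(cpy_ctx, GGML_TYPE_F32, 16);
        struct ggml_tensor * dst = ggml_new_tensor_1d(cpy_ctx, GGML_TYPE_F32, 16);
        src->data = src_data; // no_alloc: point the tensors at external storage
        dst->data = dst_data;

        ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, src, dst));
        // llama.cpp runs the graph with its internal
        // ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
        // the compute step is omitted from this sketch

        ggml_free(cpy_ctx);
        return 0;
    }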
@@ -8725,8 +8727,8 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
const size_t elt_size = ggml_element_size(kv_self.k);
- ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
- ggml_cgraph gf{};
+ ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+ ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
kin3d->data = (void *) inp;
@@ -8744,9 +8746,9 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
kv_head, n_embd, n_layer,
elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
- ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
+ ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
ggml_free(cpy_ctx);
}
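The llama_set_state_data hunks mirror the copy path above: the same exact six-tensor budget and the same move of the graph from the stack into cpy_ctx. The stack allocation likely had to go because, after ggml's graph-allocation rework, ggml_cgraph is no longer a self-contained fixed-size struct: its node and leaf arrays are carved out of the owning context, so a value-initialized ggml_cgraph gf{} has no storage behind it and graphs must come from ggml_new_graph/ggml_new_graph_custom.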