diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2023-11-13 14:16:23 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-11-13 14:16:23 +0200 |
commit | 4760e7cc0b68570d58f55e8dda469805d1759d0d (patch) | |
tree | cd983b1f2833f0094c0539f7943703c6787bf12b /examples | |
parent | bb50a792ec2a49944470c82694fa364345e95170 (diff) |
sync : ggml (backend v2) (#3912)
* sync : ggml (backend v2) (wip)
* sync : migrate examples and llama.cpp to dynamic graphs (wip)
* sync : update tests + fix max op params to 64
ggml-ci
* sync : ggml-cuda
ggml-ci
* llama : fix save/load state context size
ggml-ci
* sync : try to fix build on tvOS
* sync : pass custom graph sizes in training examples
* sync : update graph copies to new ggml API
* sync : update sync-ggml.sh with new files
* scripts : fix header in sync script
* train : fix context size calculations
* llama : increase inference graph size up to 4096 nodes
* train : allocate grads for backward graphs
* train : allocate grads for gb_tmp
Diffstat (limited to 'examples')
-rw-r--r-- | examples/benchmark/benchmark-matmult.cpp | 21 | ||||
-rw-r--r-- | examples/export-lora/export-lora.cpp | 4 | ||||
-rw-r--r-- | examples/finetune/finetune.cpp | 23 | ||||
-rw-r--r-- | examples/llava/clip.cpp | 2 | ||||
-rw-r--r-- | examples/metal/metal.cpp | 10 | ||||
-rw-r--r-- | examples/train-text-from-scratch/train-text-from-scratch.cpp | 23 |
6 files changed, 42 insertions, 41 deletions
diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 76e3f57c..284733b1 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -171,7 +171,8 @@ int main(int argc, char ** argv) { struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2); // printf("Creating compute graph\n"); - struct ggml_cgraph gf = ggml_build_forward(m11xm2); + struct ggml_cgraph * gf = ggml_new_graph(ctx); + ggml_build_forward_expand(gf, m11xm2); printf("n_threads=%i\n", benchmark_params.n_threads); @@ -180,9 +181,9 @@ int main(int argc, char ** argv) { std::vector<uint8_t> work_buffer; - ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads); + ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads); - TENSOR_DUMP(gf.nodes[0]); + TENSOR_DUMP(gf->nodes[0]); printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype)); @@ -200,7 +201,8 @@ int main(int argc, char ** argv) { struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2); // printf("Creating compute graph\n"); - struct ggml_cgraph gf31 = ggml_build_forward(q31); + struct ggml_cgraph * gf31 = ggml_new_graph(ctx); + ggml_build_forward_expand(gf31, q31); // Set up a second graph computation to make sure we override the CPU cache lines // printf("Creating new tensor q12 & Running quantize\n"); @@ -211,7 +213,8 @@ int main(int argc, char ** argv) { struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2); //printf("Creating compute graph\n"); - struct ggml_cgraph gf32 = ggml_build_forward(q32); + struct ggml_cgraph * gf32 = ggml_new_graph(ctx); + ggml_build_forward_expand(gf32, q32); printf("n_threads=%i\n", benchmark_params.n_threads); const int dimx = sizex; @@ -223,7 +226,7 @@ int main(int argc, char ** argv) { // Let's use the F32 result from above as a reference for the quantized multiplication - float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]); + float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]); printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n"); printf("=====================================================================================\n"); @@ -233,7 +236,7 @@ int main(int argc, char ** argv) { long long int start = ggml_time_us(); //printf("Running ggml_graph_compute\n"); - ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads); + ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads); long long int stop = ggml_time_us(); long long int usec = stop-start; @@ -251,7 +254,7 @@ int main(int argc, char ** argv) { // Check that the matrix multiplication result is in the right ballpark // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different - float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]); + float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]); float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference); float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6 @@ -266,7 +269,7 @@ int main(int argc, char ** argv) { } // Running a different graph computation to make sure we override the CPU cache lines - ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads); + ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads); } printf("\n"); printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations)); diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index d803cfd5..c8754ce7 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -240,7 +240,7 @@ static struct lora_data * load_lora(struct lora_info * info) { } struct ggml_init_params params_ggml; - params_ggml.mem_size = ggml_tensor_overhead() * GGML_MAX_NODES; + params_ggml.mem_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE; params_ggml.mem_buffer = NULL; params_ggml.no_alloc = true; result->ctx = ggml_init(params_ggml); @@ -334,7 +334,7 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r; struct ggml_init_params params; - params.mem_size = GGML_OBJECT_SIZE + GGML_GRAPH_SIZE + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5; + params.mem_size = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5; params.mem_buffer = NULL; params.no_alloc = true; struct ggml_context * ctx = NULL; diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index fa7dbe49..5a6cf22c 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -772,7 +772,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( if (enable_checkpointing) { ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size()); } else { - *gb = *gf; + ggml_graph_cpy(gf, gb); ggml_build_backward_expand(ctx, gf, gb, true); } @@ -1615,6 +1615,7 @@ int main(int argc, char ** argv) { opt->params = ggml_opt_default_params(GGML_OPT_ADAM); opt->params.print_forward_graph = false; opt->params.print_backward_graph = false; + opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; opt->params.n_threads = params.common.n_threads; opt->params.past = params.common.opt_past; opt->params.delta = params.common.opt_delta; @@ -1741,11 +1742,9 @@ int main(int argc, char ** argv) { ggml_allocr_free(alloc); // context for compute tensors without their data - size_t estimated_compute_size_wo_data = ( - ggml_tensor_overhead()*GGML_MAX_NODES*2 - + (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*( - params.common.use_checkpointing ? 3 : 2 - ) + const size_t estimated_compute_size_wo_data = ( + 2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() + + (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true)) ); struct ggml_init_params ctx_compute_params = { estimated_compute_size_wo_data, // mem_size @@ -1768,11 +1767,11 @@ int main(int argc, char ** argv) { for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { ctx_compute = ggml_init(ctx_compute_params); alloc = ggml_allocr_new_measure(tensor_alignment); - gf = ggml_new_graph(ctx_compute); + gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf->order = (enum ggml_cgraph_eval_order) order; - gb = ggml_new_graph(ctx_compute); + gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gb_tmp = params.common.use_checkpointing - ? ggml_new_graph(ctx_compute) + ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true) : NULL; loss = llama_build_lora_finetune_graphs( &model, &lora, alloc, ctx_compute, @@ -1801,11 +1800,11 @@ int main(int argc, char ** argv) { mem_compute_data.resize(max_compute_size); ctx_compute = ggml_init(ctx_compute_params); alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); - gf = ggml_new_graph(ctx_compute); + gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf->order = best_order; - gb = ggml_new_graph(ctx_compute); + gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gb_tmp = params.common.use_checkpointing - ? ggml_new_graph(ctx_compute) + ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true) : NULL; loss = llama_build_lora_finetune_graphs( &model, &lora, alloc, ctx_compute, diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 3c909c7d..c26ee495 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -664,7 +664,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { // measure mem requirement and allocate { static const size_t tensor_alignment = 32; - new_clip->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead()); + new_clip->buf_compute.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead()); new_clip->alloc = ggml_allocr_new_measure(tensor_alignment); clip_image_f32_batch batch; batch.size = 1; diff --git a/examples/metal/metal.cpp b/examples/metal/metal.cpp index c05a4fa9..16c1146f 100644 --- a/examples/metal/metal.cpp +++ b/examples/metal/metal.cpp @@ -34,7 +34,7 @@ int main(int argc, char ** argv) { struct ggml_context * ctx_data = NULL; struct ggml_context * ctx_eval = NULL; - struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); + struct ggml_cgraph * gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); // this allocates all Metal resources and memory buffers auto * ctx_metal = ggml_metal_init(1); @@ -46,13 +46,13 @@ int main(int argc, char ** argv) { // main { - struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd"); + struct ggml_tensor * input = ggml_graph_get_tensor(gf, "embd"); *(int32_t *) input->data = 1; // BOS ggml_metal_set_tensor(ctx_metal, input); // warmup - ggml_metal_graph_compute(ctx_metal, &gf); + ggml_metal_graph_compute(ctx_metal, gf); const int n_iter = 16; @@ -60,7 +60,7 @@ int main(int argc, char ** argv) { // the actual inference happens here for (int i = 0; i < n_iter; ++i) { - ggml_metal_graph_compute(ctx_metal, &gf); + ggml_metal_graph_compute(ctx_metal, gf); } const int64_t t1 = ggml_time_us(); @@ -70,7 +70,7 @@ int main(int argc, char ** argv) { // debug output { - struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1]; + struct ggml_tensor * logits = gf->nodes[gf->n_nodes - 1]; ggml_metal_get_tensor(ctx_metal, logits); float * ptr = (float *) ggml_get_data(logits); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 2a257e63..f049a392 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -436,7 +436,7 @@ static struct ggml_tensor * llama_build_train_graphs( if (enable_checkpointing) { ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size()); } else { - *gb = *gf; + ggml_graph_cpy(gf, gb); ggml_build_backward_expand(ctx, gf, gb, true); } @@ -1006,6 +1006,7 @@ int main(int argc, char ** argv) { opt->params = ggml_opt_default_params(GGML_OPT_ADAM); opt->params.print_forward_graph = false; opt->params.print_backward_graph = false; + opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; opt->params.n_threads = params.common.n_threads; opt->params.past = params.common.opt_past; opt->params.delta = params.common.opt_delta; @@ -1108,11 +1109,9 @@ int main(int argc, char ** argv) { ggml_allocr_free(alloc); // context for compute tensors without their data - size_t estimated_compute_size_wo_data = ( - ggml_tensor_overhead()*GGML_MAX_NODES*2 - + (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*( - params.common.use_checkpointing ? 3 : 2 - ) + const size_t estimated_compute_size_wo_data = ( + 2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() + + (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true)) ); struct ggml_init_params ctx_compute_params = { estimated_compute_size_wo_data, // mem_size @@ -1135,11 +1134,11 @@ int main(int argc, char ** argv) { for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { ctx_compute = ggml_init(ctx_compute_params); alloc = ggml_allocr_new_measure(tensor_alignment); - gf = ggml_new_graph(ctx_compute); + gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf->order = (enum ggml_cgraph_eval_order) order; - gb = ggml_new_graph(ctx_compute); + gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gb_tmp = params.common.use_checkpointing - ? ggml_new_graph(ctx_compute) + ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true) : NULL; loss = llama_build_train_graphs( &model, alloc, ctx_compute, @@ -1168,11 +1167,11 @@ int main(int argc, char ** argv) { mem_compute_data.resize(max_compute_size); ctx_compute = ggml_init(ctx_compute_params); alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); - gf = ggml_new_graph(ctx_compute); + gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf->order = best_order; - gb = ggml_new_graph(ctx_compute); + gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gb_tmp = params.common.use_checkpointing - ? ggml_new_graph(ctx_compute) + ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true) : NULL; loss = llama_build_train_graphs( &model, alloc, ctx_compute, |