author    | Georgi Gerganov <ggerganov@gmail.com> | 2023-11-13 14:16:23 +0200
committer | GitHub <noreply@github.com>           | 2023-11-13 14:16:23 +0200
commit    | 4760e7cc0b68570d58f55e8dda469805d1759d0d (patch)
tree      | cd983b1f2833f0094c0539f7943703c6787bf12b /ggml-backend.c
parent    | bb50a792ec2a49944470c82694fa364345e95170 (diff)
sync : ggml (backend v2) (#3912)
* sync : ggml (backend v2) (wip)
* sync : migrate examples and llama.cpp to dynamic graphs (wip)
* sync : update tests + fix max op params to 64
ggml-ci
* sync : ggml-cuda
ggml-ci
* llama : fix save/load state context size
ggml-ci
* sync : try to fix build on tvOS
* sync : pass custom graph sizes in training examples
* sync : update graph copies to new ggml API
* sync : update sync-ggml.sh with new files
* scripts : fix header in sync script
* train : fix context size calculations
* llama : increase inference graph size up to 4096 nodes
* train : allocate grads for backward graphs
* train : allocate grads for gb_tmp
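
The dynamic-graph migration means callers now size their graphs explicitly instead of relying on a fixed-size cgraph. A rough caller-side sketch, assuming only the ggml_new_graph_custom API that the diff below itself uses; LLAMA_MAX_NODES and build_graph are illustrative names, not part of this commit:

    // sketch: allocate a dynamically sized graph; the 4096-node figure comes
    // from the commit message, the constant name is hypothetical
    #define LLAMA_MAX_NODES 4096

    static struct ggml_cgraph * build_graph(struct ggml_context * ctx) {
        // grads = false for inference; training code passes true so that
        // backward graphs get gradient storage allocated
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, LLAMA_MAX_NODES, false);
        // ... build the forward pass and call ggml_build_forward_expand(gf, ...) ...
        return gf;
    }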
Diffstat (limited to 'ggml-backend.c')
-rw-r--r-- | ggml-backend.c | 591
1 file changed, 578 insertions, 13 deletions
diff --git a/ggml-backend.c b/ggml-backend.c
index ca8d83da..f6e5fcee 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1,7 +1,9 @@
-#include "ggml-backend.h"
+#include "ggml-backend-impl.h"
 #include "ggml-alloc.h"
+#include "ggml-impl.h"
 
 #include <assert.h>
+#include <limits.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -33,6 +35,10 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
 }
 
 void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
+    if (buffer == NULL) {
+        return;
+    }
+
     if (buffer->iface.free_buffer != NULL) {
         buffer->iface.free_buffer(buffer);
     }
@@ -43,15 +49,20 @@ size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
     return ggml_backend_get_alignment(buffer->backend);
 }
 
-void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_base(buffer);
-}
-
 size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
     return buffer->size;
 }
 
+void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
+    void * base = buffer->iface.get_base(buffer);
+
+    GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
+
+    return base;
+}
+
 size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    // get_alloc_size is optional, defaults to ggml_nbytes
     if (buffer->iface.get_alloc_size) {
         return buffer->iface.get_alloc_size(buffer, tensor);
     }
@@ -59,12 +70,14 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
 }
 
 void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    // init_tensor is optional
     if (buffer->iface.init_tensor) {
         buffer->iface.init_tensor(buffer, tensor);
     }
 }
 
 void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    // free_tensor is optional
    if (buffer->iface.free_tensor) {
         buffer->iface.free_tensor(buffer, tensor);
     }
@@ -73,14 +86,21 @@ void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_t
 // backend
 
 ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) {
-    return tensor->buffer->backend;
+    return tensor->buffer ? tensor->buffer->backend : NULL;
 }
 
 const char * ggml_backend_name(ggml_backend_t backend) {
+    if (backend == NULL) {
+        return "NULL";
+    }
     return backend->iface.get_name(backend);
 }
 
 void ggml_backend_free(ggml_backend_t backend) {
+    if (backend == NULL) {
+        return;
+    }
+
     backend->iface.free(backend);
 }
 
@@ -101,13 +121,23 @@ void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * dat
 }
 
 void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
-    ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
+    ggml_backend_t backend = ggml_get_backend(tensor);
+
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(backend != NULL && "tensor backend not set");
+
+    backend->iface.set_tensor_async(backend, tensor, data, offset, size);
+    backend->iface.synchronize(backend);
 }
 
 void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
-    ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
+    ggml_backend_t backend = ggml_get_backend(tensor);
+
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(backend != NULL && "tensor backend not set");
+
+    backend->iface.get_tensor_async(backend, tensor, data, offset, size);
+    backend->iface.synchronize(backend);
 }
 
 void ggml_backend_synchronize(ggml_backend_t backend) {
@@ -156,7 +186,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
     //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
 
     GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
 
-    // printf("cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
+    // fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
 
     if (src == dst) {
         return;
@@ -234,6 +264,8 @@ static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backen
     size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
     void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
 
+    GGML_ASSERT(data != NULL && "failed to allocate buffer");
+
     return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
 }
 
@@ -271,8 +303,7 @@ static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml
 }
 
 static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
-    // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends
-    ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
+    ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
 
     UNUSED(backend);
 }
@@ -383,3 +414,537 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
 ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) {
     return ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size);
 }
+
+// scheduler
+
+#define GGML_MAX_BACKENDS 4
+#define GGML_MAX_SPLITS 256
+#define GGML_MAX_SPLIT_INPUTS 16
+
+struct ggml_backend_sched_split {
+    ggml_tallocr_t tallocr;
+    int i_start;
+    int i_end;
+    struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
+    int n_inputs;
+    struct ggml_cgraph * graph;
+};
+
+struct ggml_backend_sched {
+    int n_backends;
+    ggml_backend_t backends[GGML_MAX_BACKENDS];
+    ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
+
+    ggml_gallocr_t galloc;
+
+    struct ggml_hash_set hash_set;
+    ggml_tallocr_t * node_talloc;                            // [hash_set.size]
+    struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // [hash_set.size][GGML_MAX_BACKENDS]
+
+    struct ggml_cgraph * graph;
+    struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
+    int n_splits;
+
+    struct ggml_context * ctx;
+
+    // align context_buffer to GGML_MEM_ALIGN
+    #ifdef _MSC_VER
+    __declspec(align(GGML_MEM_ALIGN))
+    #else
+    __attribute__((aligned(GGML_MEM_ALIGN)))
+    #endif
+    char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + GGML_MAX_SPLITS*sizeof(struct ggml_cgraph)];
+};
+
+#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
+#define node_allocr(node) sched->node_talloc[hash_id(node)]
+
+static bool ggml_is_view_op(enum ggml_op op) {
+    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
+}
+
+// returns the priority of the backend, lower is better
+static int sched_backend_prio(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    for (int i = 0; i < sched->n_backends; i++) {
+        if (sched->backends[i] == backend) {
+            return i;
+        }
+    }
+    return INT_MAX;
+}
+
+static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
+    for (int i = 0; i < sched->n_backends; i++) {
+        if (sched->tallocs[i] == allocr) {
+            return i;
+        }
+    }
+    return INT_MAX;
+}
+
+// returns the backend that should be used for the node based on the current locations
+char causes[GGML_DEFAULT_GRAPH_SIZE*4 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
+static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+    // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there
+    // ie. kv cache updates
+    // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
+    // dst
+    ggml_backend_t cur_backend = ggml_get_backend(node);
+    if (cur_backend != NULL) {
+        sprintf(causes[hash_id(node)], "1.dst");
+        return cur_backend;
+    }
+
+    // view_src
+    if (node->view_src != NULL && ggml_get_backend(node->view_src) != NULL) {
+        sprintf(causes[hash_id(node)], "1.vsrc");
+        return ggml_get_backend(node->view_src);
+    }
+
+    // src
+    int cur_prio = INT_MAX;
+    size_t cur_size = 0;
+
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        const struct ggml_tensor * src = node->src[i];
+        if (src == NULL) {
+            break;
+        }
+        ggml_backend_t src_backend = ggml_get_backend(src);
+        if (src_backend != NULL) {
+            int src_prio = sched_backend_prio(sched, src_backend);
+            size_t src_size = ggml_nbytes(src);
+            if (src_prio < cur_prio && src_size >= cur_size) {
+                cur_prio = src_prio;
+                cur_size = src_size;
+                cur_backend = src_backend;
+                sprintf(causes[hash_id(node)], "1.src%d", i);
+            }
+        }
+    }
+    return cur_backend;
+}
+
+static char * fmt_size(size_t size) {
+    static char buffer[128];
+    if (size >= 1024*1024) {
+        sprintf(buffer, "%zuM", size/1024/1024);
+    } else {
+        sprintf(buffer, "%zuK", size/1024);
+    }
+    return buffer;
+}
+
+static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    int cur_split = 0;
+    for (int i = 0; i < graph->n_nodes; i++) {
+        if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
+            ggml_backend_t split_backend = ggml_tallocr_get_buffer(sched->splits[cur_split].tallocr)->backend;
+            fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend), sched->splits[cur_split].n_inputs);
+            for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
+                fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name, fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
+            }
+            fprintf(stderr, "\n");
+            cur_split++;
+        }
+        struct ggml_tensor * node = graph->nodes[i];
+        if (ggml_is_view_op(node->op)) {
+            continue;
+        }
+        ggml_tallocr_t node_allocr = node_allocr(node);
+        ggml_backend_t node_backend = node_allocr ? ggml_tallocr_get_buffer(node_allocr)->backend : NULL;
+        fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, ggml_op_name(node->op), node->name, fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", causes[hash_id(node)]);
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                break;
+            }
+            ggml_tallocr_t src_allocr = node_allocr(src);
+            ggml_backend_t src_backend = src_allocr ? ggml_tallocr_get_buffer(src_allocr)->backend : NULL;
+            fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name, fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", causes[hash_id(src)]);
+        }
+        fprintf(stderr, "\n");
+    }
+}
+
+// creates a copy of the tensor with the same memory layout
+static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
+    struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        dup->nb[i] = tensor->nb[i];
+    }
+    return dup;
+}
+
+// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
+// TODO: merge passes
+static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    // reset state
+    size_t hash_size = sched->hash_set.size;
+    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
+    memset(sched->node_talloc,   0, sizeof(sched->node_talloc[0])   * hash_size);
+    memset(sched->node_copies,   0, sizeof(sched->node_copies[0])   * hash_size);
+    sched->n_splits = 0;
+
+    struct ggml_init_params params = {
+        /*.mem_size =   */ sizeof(sched->context_buffer),
+        /*.mem_buffer = */ sched->context_buffer,
+        /*.no_alloc =   */ true
+    };
+
+    if (sched->ctx != NULL) {
+        ggml_free(sched->ctx);
+    }
+
+    sched->ctx = ggml_init(params);
+
+    // pass 1: assign backends to ops with allocated inputs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        if (node_allocr(leaf) != NULL) {
+            // do not overwrite user assignments
+            continue;
+        }
+        ggml_backend_t leaf_backend = ggml_get_backend(leaf);
+        if (leaf_backend == NULL && leaf->view_src != NULL) {
+            leaf_backend = ggml_get_backend(leaf->view_src);
+        }
+        if (leaf_backend != NULL) {
+            node_allocr(leaf) = ggml_backend_sched_get_tallocr(sched, leaf_backend);
+        }
+    }
+
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        if (node_allocr(node) != NULL) {
+            // do not overwrite user assignments
+            continue;
+        }
+        ggml_backend_t node_backend = sched_backend_from_cur(sched, node);
+        if (node_backend != NULL) {
+            node_allocr(node) = ggml_backend_sched_get_tallocr(sched, node_backend);
+        }
+    }
+    //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+
+    // pass 2: assign backends to ops from current assignments
+    // TODO:
+    //  - reuse sched_backend_from_cur
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        ggml_tallocr_t node_allocr = node_allocr(node);
+        if (node_allocr == NULL) {
+            int    cur_prio = INT_MAX;
+            size_t cur_size = 0;
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * src = node->src[j];
+                if (src == NULL) {
+                    break;
+                }
+                ggml_tallocr_t src_allocr = node_allocr(src);
+                if (src_allocr != NULL) {
+                    int    src_prio = sched_allocr_prio(sched, src_allocr);
+                    size_t src_size = ggml_nbytes(src);
+                    if (src_prio < cur_prio && src_size >= cur_size) {
+                        cur_prio = src_prio;
+                        cur_size = src_size;
+                        node_allocr = src_allocr;
+                        sprintf(causes[hash_id(node)], "2.src%d", j);
+                    }
+                }
+            }
+            if (node_allocr != NULL) {
+                node_allocr(node) = node_allocr;
+            }
+        }
+    }
+    //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+
+    // pass 3: assign backends to remaining src from dst (should only be leafs)
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        ggml_tallocr_t node_allocr = node_allocr(node);
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                break;
+            }
+            ggml_tallocr_t src_allocr = node_allocr(src);
+            if (src_allocr == NULL) {
+                node_allocr(src) = node_allocr;
+            }
+        }
+    }
+    //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+
+    // pass 4: split graph, find tensors that need to be copied
+    // TODO:
+    //  - when switching from a less preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost
+    // find first backend
+    int cur_split = 0;
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        if (node->view_src == NULL) {
+            sched->splits[0].tallocr = node_allocr(node);
+            break;
+        }
+    }
+    sched->splits[0].i_start = 0;
+    sched->splits[0].n_inputs = 0;
+    memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
+    ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
+    size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+
+        if (ggml_is_view_op(node->op)) {
+            continue;
+        }
+
+        ggml_tallocr_t node_allocr = node_allocr(node);
+
+        if (node_allocr != cur_allocr) {
+            sched->splits[cur_split].i_end = i;
+            cur_split++;
+            GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
+            sched->splits[cur_split].tallocr = node_allocr;
+            sched->splits[cur_split].i_start = i;
+            sched->splits[cur_split].n_inputs = 0;
+            memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK
+            cur_allocr = node_allocr;
+            cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+        }
+
+        // find inputs that are not on the same backend
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                break;
+            }
+            ggml_tallocr_t src_allocr = node_allocr(src);
+            if (src_allocr != node_allocr) {
+                int n_inputs = sched->splits[cur_split].n_inputs++;
+                GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+                sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
+
+                // create copies
+                size_t id = hash_id(src);
+                if (sched->node_copies[id][cur_backend_id] == NULL) {
+                    struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                    sched->node_copies[id][cur_backend_id] = tensor_copy;
+                    node_allocr(tensor_copy) = cur_allocr;
+                    ggml_backend_t backend = ggml_tallocr_get_buffer(cur_allocr)->backend;
+                    ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
+                }
+                node->src[j] = sched->node_copies[id][cur_backend_id];
+            }
+        }
+    }
+    sched->splits[cur_split].i_end = graph->n_nodes;
+    sched->n_splits = cur_split + 1;
+
+    //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout);
+
+#if 1
+    // sanity check: all sources should have the same backend as the node
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        ggml_tallocr_t node_allocr = node_allocr(node);
+        if (node_allocr == NULL) {
+            fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
+        }
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                break;
+            }
+            ggml_tallocr_t src_allocr = node_allocr(src);
+            if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now
+                fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
+                    node->name, node_allocr ? ggml_backend_name(ggml_tallocr_get_buffer(node_allocr)->backend) : "NULL",
+                    j, src->name, src_allocr ? ggml_backend_name(ggml_tallocr_get_buffer(src_allocr)->backend) : "NULL");
+            }
+        }
+    }
+#endif
+
+    // create copies of the graph for each split
+    // FIXME: avoid this copy, pass split inputs to ggml_gallocr_alloc_graph_n in some other way
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_MAX_SPLIT_INPUTS, false);
+    for (int i = 0; i < sched->n_splits; i++) {
+        struct ggml_backend_sched_split * split = &sched->splits[i];
+        split->graph = ggml_graph_view(sched->ctx, graph, split->i_start, split->i_end);
+
+        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
+        for (int j = 0; j < split->n_inputs; j++) {
+            struct ggml_tensor * input = split->inputs[j];
+            struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
+            input_cpy->src[0] = input;
+            graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
+        }
+
+        for (int j = split->i_start; j < split->i_end; j++) {
+            graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
+        }
+    }
+    sched->graph = graph_copy;
+}
+
+static void sched_alloc_splits(ggml_backend_sched_t sched) {
+    ggml_gallocr_alloc_graph_n(
+        sched->galloc,
+        sched->graph,
+        sched->hash_set,
+        sched->node_talloc);
+}
+
+static void sched_compute_splits(ggml_backend_sched_t sched) {
+    uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
+    uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
+
+    struct ggml_backend_sched_split * splits = sched->splits;
+
+    for (int i = 0; i < sched->n_splits; i++) {
+        struct ggml_backend_sched_split * split = &splits[i];
+        ggml_backend_t split_backend = ggml_tallocr_get_buffer(split->tallocr)->backend;
+        int split_backend_id = sched_backend_prio(sched, split_backend);
+
+        // copy the input tensors to the split backend
+        uint64_t copy_start_us = ggml_time_us();
+        for (int j = 0; j < split->n_inputs; j++) {
+            struct ggml_tensor * input_cpy = sched->node_copies[hash_id(split->inputs[j])][sched_backend_prio(sched, split_backend)];
+            if (split->inputs[j]->buffer == NULL) {
+                if (split->inputs[j]->view_src == NULL) {
+                    fprintf(stderr, "input %s has no buffer and no view_src\n", split->inputs[j]->name);
+                    exit(1);
+                }
+                struct ggml_tensor * view = split->inputs[j];
+                view->backend = view->view_src->backend;
+                view->buffer  = view->view_src->buffer;
+                view->data    = (char *)view->view_src->data + view->view_offs;
+                ggml_backend_buffer_init_tensor(ggml_backend_sched_get_buffer(sched, view->buffer->backend), view);
+            }
+            if (input_cpy->buffer == NULL) {
+                fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
+                exit(1);
+            }
+            GGML_ASSERT(split->inputs[j]->buffer->backend != input_cpy->buffer->backend);
+            GGML_ASSERT(input_cpy->buffer->backend == split_backend);
+            ggml_backend_tensor_copy(split->inputs[j], input_cpy);
+        }
+        // ggml_backend_synchronize(split_backend);
+        int64_t copy_end_us = ggml_time_us();
+        copy_us[split_backend_id] += copy_end_us - copy_start_us;
+
+#if 0
+        char split_filename[GGML_MAX_NAME];
+        snprintf(split_filename, GGML_MAX_NAME, "split_%i_%s.dot", i, ggml_backend_name(split_backend));
+        ggml_graph_dump_dot(split->graph, NULL, split_filename);
+#endif
+
+        uint64_t compute_start_us = ggml_time_us();
+        ggml_backend_graph_compute(split_backend, split->graph);
+        // ggml_backend_synchronize(split_backend);
+        uint64_t compute_end_us = ggml_time_us();
+        compute_us[split_backend_id] += compute_end_us - compute_start_us;
+    }
+
+#if 0
+    // per-backend timings
+    fprintf(stderr, "sched_compute_splits times (%d splits):\n", sched->n_splits);
+    for (int i = 0; i < sched->n_backends; i++) {
+        if (copy_us[i] > 0 || compute_us[i] > 0) {
+            fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]);
+        }
+    }
+#endif
+}
+
+static void sched_reset(ggml_backend_sched_t sched) {
+    for (int i = 0; i < sched->n_backends; i++) {
+        ggml_tallocr_reset(sched->tallocs[i]);
+    }
+}
+
+ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
+    GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
+
+    struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched));
+    memset(sched, 0, sizeof(struct ggml_backend_sched));
+
+    fprintf(stderr, "ggml_backend_sched size: %lu KB\n", sizeof(struct ggml_backend_sched)/1024);
+
+    sched->n_backends = n_backends;
+    for (int i = 0; i < n_backends; i++) {
+        sched->backends[i] = backends[i];
+    }
+
+    sched->galloc = ggml_gallocr_new();
+
+    // init measure allocs for each backend
+    for (int i = 0; i < n_backends; i++) {
+        sched->tallocs[i] = ggml_tallocr_new_measure_from_backend(backends[i]);
+    }
+
+    return sched;
+}
+
+void ggml_backend_sched_free(ggml_backend_sched_t sched) {
+    if (sched == NULL) {
+        return;
+    }
+    for (int i = 0; i < sched->n_backends; i++) {
+        ggml_tallocr_free(sched->tallocs[i]);
+    }
+    ggml_gallocr_free(sched->galloc);
+    free(sched->hash_set.keys);
+    free(sched->node_talloc);
+    free(sched->node_copies);
+    free(sched);
+}
+
+void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+    // initialize hash tables
+    size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;
+    sched->hash_set.size = hash_size;
+    sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
+    sched->node_talloc   = malloc(sizeof(sched->node_talloc[0])   * hash_size);
+    sched->node_copies   = malloc(sizeof(sched->node_copies[0])   * hash_size);
+
+    sched_split_graph(sched, measure_graph);
+    sched_alloc_splits(sched);
+
+    // allocate buffers and reset allocators
+    for (int i = 0; i < sched->n_backends; i++) {
+        size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
+        ggml_tallocr_free(sched->tallocs[i]);
+        sched->tallocs[i] = ggml_tallocr_new_from_backend(sched->backends[i], size);
+    }
+
+    sched_reset(sched);
+}
+
+void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+
+    sched_split_graph(sched, graph);
+    sched_alloc_splits(sched);
+    sched_compute_splits(sched);
+    sched_reset(sched);
+}
+
+ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    int backend_index = sched_backend_prio(sched, backend);
+    return sched->tallocs[backend_index];
+}
+
+ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    int backend_index = sched_backend_prio(sched, backend);
+    return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
+}
+
+void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
+    int backend_index = sched_backend_prio(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+    node_allocr(node) = sched->tallocs[backend_index];
+}
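
For reference, a minimal usage sketch of the scheduler API added above (ggml_backend_sched_new, ggml_backend_sched_init_measure, ggml_backend_sched_graph_compute, ggml_backend_sched_free). run_with_sched and its graph arguments are hypothetical helpers, and only the CPU backend is constructed so the sketch does not assume any particular GPU backend init signature:

    #include "ggml.h"
    #include "ggml-backend.h"

    // sketch of driving the new scheduler; measure_graph and graph are assumed
    // to have been built elsewhere (e.g. with ggml_new_graph_custom)
    static void run_with_sched(struct ggml_cgraph * measure_graph, struct ggml_cgraph * graph) {
        ggml_backend_t backends[2];
        int n_backends = 0;

        // backends are listed in priority order; a GPU backend would be added
        // here first, before the CPU fallback
        backends[n_backends++] = ggml_backend_cpu_init();

        ggml_backend_sched_t sched = ggml_backend_sched_new(backends, n_backends);

        // measure pass: assigns backends, splits the graph and sizes the
        // per-backend allocators
        ggml_backend_sched_init_measure(sched, measure_graph);

        // per iteration: split, allocate and compute the graph across backends
        ggml_backend_sched_graph_compute(sched, graph);

        // the scheduler does not own the backends; free them separately
        ggml_backend_sched_free(sched);
        for (int i = 0; i < n_backends; i++) {
            ggml_backend_free(backends[i]);
        }
    }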