Diffstat (limited to 'ggml-alloc.c')
-rw-r--r-- | ggml-alloc.c | 1053
1 file changed, 563 insertions, 490 deletions
diff --git a/ggml-alloc.c b/ggml-alloc.c index f9be6e1c..c28c37c4 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -17,6 +17,50 @@ //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__) #define AT_PRINTF(...) + +static bool ggml_is_view(const struct ggml_tensor * t) { + return t->view_src != NULL; +} + +static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { + if (a->type != b->type) { + return false; + } + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (a->ne[i] != b->ne[i]) { + return false; + } + if (a->nb[i] != b->nb[i]) { + return false; + } + } + return true; +} + +static bool ggml_op_can_inplace(enum ggml_op op) { + switch (op) { + case GGML_OP_SCALE: + case GGML_OP_DIAG_MASK_ZERO: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_ADD: + case GGML_OP_ADD1: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_UNARY: + case GGML_OP_ROPE: + case GGML_OP_RMS_NORM: + case GGML_OP_SOFT_MAX: + return true; + + default: + return false; + } +} + // TODO: GGML_PAD ? static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { assert(alignment && !(alignment & (alignment - 1))); // power of 2 @@ -24,66 +68,102 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen return offset + align; } +// tallocr +struct ggml_tallocr { + ggml_backend_buffer_t buffer; + void * base; + size_t alignment; + size_t offset; +}; + +ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) { + ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr)); + if (talloc == NULL) { + return NULL; + } + + void * base = ggml_backend_buffer_get_base(buffer); + size_t align = ggml_backend_buffer_get_alignment(buffer); + + assert(align && !(align & (align - 1))); // power of 2 + + *talloc = (struct ggml_tallocr) { + /*.buffer = */ buffer, + /*.base = */ base, + /*.alignment = */ align, + /*.offset = */ aligned_offset(base, 0, align), + }; + return talloc; +} + +void ggml_tallocr_free(ggml_tallocr_t talloc) { + free(talloc); +} + +void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) { + size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor); + size = GGML_PAD(size, talloc->alignment); + + if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) { + fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n", + __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset); + GGML_ASSERT(!"not enough space in the buffer"); + return; + } + + void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset; + talloc->offset += size; + + assert(((uintptr_t)addr % talloc->alignment) == 0); + + ggml_backend_tensor_alloc(talloc->buffer, tensor, addr); +} + +// dynamic tensor allocator + struct free_block { - void * addr; + size_t offset; size_t size; }; -struct ggml_tallocr { - struct ggml_backend_buffer * buffer; - bool buffer_owned; - void * base; +struct ggml_dyn_tallocr { size_t alignment; - int n_free_blocks; struct free_block free_blocks[MAX_FREE_BLOCKS]; - size_t max_size; - bool measure; - #ifdef GGML_ALLOCATOR_DEBUG - struct ggml_tensor * allocated_tensors[1024]; + struct { + const struct ggml_tensor * tensor; + size_t offset; + } allocated_tensors[1024]; #endif }; #ifdef GGML_ALLOCATOR_DEBUG -static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { +static void 
add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) { for (int i = 0; i < 1024; i++) { - if (alloc->allocated_tensors[i] == NULL) { - alloc->allocated_tensors[i] = tensor; + if (alloc->allocated_tensors[i].tensor == NULL) { + alloc->allocated_tensors[i].tensor = tensor; + alloc->allocated_tensors[i].offset = offset; return; } } GGML_ASSERT(!"out of allocated_tensors"); } -static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { +static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) { for (int i = 0; i < 1024; i++) { - if (alloc->allocated_tensors[i] == tensor || - (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) { - alloc->allocated_tensors[i] = NULL; + if (alloc->allocated_tensors[i].offset == offset) { + alloc->allocated_tensors[i].tensor = NULL; return; } } - printf("tried to free tensor %s not found\n", tensor->name); + fprintf(stderr, "tried to free tensor %s not found\n", tensor->name); GGML_ASSERT(!"tensor not found"); } #endif -// check if a tensor is allocated by this buffer -static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) { - return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer); -} - -static bool ggml_is_view(struct ggml_tensor * t) { - return t->view_src != NULL; -} - -void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { - GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources - GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated - - size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor); +static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) { size = aligned_offset(NULL, size, alloc->alignment); AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); @@ -109,16 +189,17 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { if (block->size >= size) { best_fit_block = alloc->n_free_blocks - 1; } else { - fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n", - __func__, tensor->name, size, max_avail); + // this should never happen + fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n", + __func__, size, max_avail); GGML_ASSERT(!"not enough space in the buffer"); - return; + GGML_UNREACHABLE(); } } struct free_block * block = &alloc->free_blocks[best_fit_block]; - void * addr = block->addr; - block->addr = (char*)block->addr + size; + size_t offset = block->offset; + block->offset = offset + size; block->size -= size; if (block->size == 0) { // remove block if empty @@ -128,59 +209,63 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { } } - AT_PRINTF("block %d, addr %p\n", best_fit_block, addr); - - tensor->data = addr; - tensor->buffer = alloc->buffer; - if (!alloc->measure) { - ggml_backend_buffer_init_tensor(alloc->buffer, tensor); - } + AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset); #ifdef GGML_ALLOCATOR_DEBUG - add_allocated_tensor(alloc, tensor); - size_t cur_max = (char*)addr - (char*)alloc->base + size; + add_allocated_tensor(alloc, offset, tensor); + size_t cur_max = offset + size; if (cur_max > 
alloc->max_size) { - printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); + // sort allocated_tensors by offset + for (int i = 0; i < 1024; i++) { + for (int j = i + 1; j < 1024; j++) { + if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) { + const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor; + size_t tmp_offset = alloc->allocated_tensors[i].offset; + alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor; + alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset; + alloc->allocated_tensors[j].tensor = tmp_tensor; + alloc->allocated_tensors[j].offset = tmp_offset; + } + } + } + fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); for (int i = 0; i < 1024; i++) { - if (alloc->allocated_tensors[i]) { - printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0); + if (alloc->allocated_tensors[i].tensor) { + fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name, + alloc->allocated_tensors[i].offset, + alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor), + ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0); } } - printf("\n"); + fprintf(stderr, "\n"); } #endif - alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->base + size); -} + alloc->max_size = MAX(alloc->max_size, offset + size); -// this is a very naive implementation, but for our case the number of free blocks should be very small -static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { - if (ggml_tallocr_is_own(alloc, tensor) == false) { - // the tensor was not allocated in this buffer - // this can happen because the graph allocator will try to free weights and other tensors from different buffers - // the easiest way to deal with this is just to ignore it - // AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer); - return; - } + return offset; - void * ptr = tensor->data; + GGML_UNUSED(tensor); +} - size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor); +// this is a very naive implementation, but for our case the number of free blocks should be very small +static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) { size = aligned_offset(NULL, size, alloc->alignment); - AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks); + + AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks); #ifdef GGML_ALLOCATOR_DEBUG - remove_allocated_tensor(alloc, tensor); + remove_allocated_tensor(alloc, offset, tensor); #endif // see if we can merge with an existing block for (int i = 0; i < alloc->n_free_blocks; i++) { struct free_block * block = &alloc->free_blocks[i]; // check if ptr is at the end of the block - if ((char*)block->addr + block->size == ptr) { + if (block->offset + block->size == offset) { block->size += size; // check if we can merge with the next block - if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) { + if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) { block->size += alloc->free_blocks[i+1].size; 
alloc->n_free_blocks--; for (int j = i+1; j < alloc->n_free_blocks; j++) { @@ -190,11 +275,11 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * return; } // check if ptr is at the beginning of the block - if ((char*)ptr + size == block->addr) { - block->addr = ptr; + if (offset + size == block->offset) { + block->offset = offset; block->size += size; // check if we can merge with the previous block - if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) { + if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) { alloc->free_blocks[i-1].size += block->size; alloc->n_free_blocks--; for (int j = i; j < alloc->n_free_blocks; j++) { @@ -208,7 +293,7 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks"); // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster) int insert_pos = 0; - while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) { + while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) { insert_pos++; } // shift all blocks from insert_pos onward to make room for the new block @@ -216,337 +301,271 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * alloc->free_blocks[i] = alloc->free_blocks[i-1]; } // insert the new block - alloc->free_blocks[insert_pos].addr = ptr; + alloc->free_blocks[insert_pos].offset = offset; alloc->free_blocks[insert_pos].size = size; alloc->n_free_blocks++; + + GGML_UNUSED(tensor); } -void ggml_tallocr_reset(ggml_tallocr_t alloc) { +static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) { alloc->n_free_blocks = 1; - size_t align_offset = aligned_offset(alloc->base, 0, alloc->alignment); - alloc->free_blocks[0].addr = (char *)alloc->base + align_offset; - - if (alloc->measure) { - alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows - } else { - alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset; - ggml_backend_buffer_reset(alloc->buffer); - } + alloc->free_blocks[0].offset = 0; + alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows + alloc->max_size = 0; } -ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) { - struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size); - - ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr)); +static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) { + struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr)); - *alloc = (struct ggml_tallocr) { - /*.buffer = */ buffer, - /*.buffer_owned = */ true, - /*.base = */ ggml_backend_buffer_get_base(buffer), + *alloc = (struct ggml_dyn_tallocr) { /*.alignment = */ alignment, /*.n_free_blocks = */ 0, /*.free_blocks = */ {{0}}, /*.max_size = */ 0, - /*.measure = */ false, #ifdef GGML_ALLOCATOR_DEBUG - /*.allocated_tensors = */ {0}, + /*.allocated_tensors = */ {{0}}, #endif }; - ggml_tallocr_reset(alloc); - - return alloc; -} - -ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) { - ggml_tallocr_t alloc = ggml_tallocr_new((void *)0x1000, SIZE_MAX/2, alignment); - alloc->measure = true; + 
ggml_dyn_tallocr_reset(alloc); return alloc; } -ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) { - // create a backend buffer to get the correct tensor allocation sizes - ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1); - - // TODO: move alloc initialization to a common ggml_tallocr_new_impl function - ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer); - alloc->buffer_owned = true; - alloc->measure = true; - ggml_tallocr_reset(alloc); - return alloc; -} - -ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) { - return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend)); -} - -ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) { - // create a backend buffer to get the correct tensor allocation sizes - ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size); - ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer); - alloc->buffer_owned = true; - return alloc; -} - -ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) { - return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size); -} - -ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) { - ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr)); - - *alloc = (struct ggml_tallocr) { - /*.buffer = */ buffer, - /*.buffer_owned = */ false, - /*.base = */ ggml_backend_buffer_get_base(buffer), - /*.alignment = */ ggml_backend_buffer_get_alignment(buffer), - /*.n_free_blocks = */ 0, - /*.free_blocks = */ {{0}}, - /*.max_size = */ 0, - /*.measure = */ false, -#ifdef GGML_ALLOCATOR_DEBUG - /*.allocated_tensors = */ {0}, -#endif - }; - - ggml_tallocr_reset(alloc); - - return alloc; -} - -struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) { - return alloc->buffer; -} - -void ggml_tallocr_free(ggml_tallocr_t alloc) { - if (alloc == NULL) { - return; - } - - if (alloc->buffer_owned) { - ggml_backend_buffer_free(alloc->buffer); - } +static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) { free(alloc); } -bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) { - return alloc->measure; +static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) { + return alloc->max_size; } -size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) { - // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail - // to avoid this, we add a 10% margin to the buffer size - return alloc->max_size + alloc->max_size/10; -} + +///////////////////////////////////// // graph allocator struct hash_node { int n_children; int n_views; + int buffer_id; + size_t offset; // offset within the buffer + bool allocated; +}; + +// +struct tensor_alloc { + size_t offset; + size_t size_max; // 0 = pre-allocated, unused, or view +}; + +struct node_alloc { + int buffer_id; + struct tensor_alloc dst; + struct tensor_alloc src[GGML_MAX_SRC]; }; struct ggml_gallocr { - ggml_tallocr_t talloc; + ggml_backend_buffer_type_t * bufts; // [n_buffers] + ggml_backend_buffer_t * buffers; // [n_buffers] + struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers] + int n_buffers; + struct ggml_hash_set hash_set; - struct hash_node * hash_values; - size_t hash_values_size; - ggml_tallocr_t * hash_allocs; - int * parse_seq; - int parse_seq_len; + struct hash_node * hash_values; // [hash_set.size] + + struct node_alloc * 
node_allocs; // [n_nodes] + int n_nodes; }; -ggml_gallocr_t ggml_gallocr_new(void) { - ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr)); - - *galloc = (struct ggml_gallocr) { - /*.talloc = */ NULL, - /*.hash_set = */ {0}, - /*.hash_values = */ NULL, - /*.hash_values_size = */ 0, - /*.hash_allocs = */ NULL, - /*.parse_seq = */ NULL, - /*.parse_seq_len = */ 0, - }; +ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) { + ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1); + GGML_ASSERT(galloc != NULL); + + galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1); + GGML_ASSERT(galloc->bufts != NULL); + + galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1); + GGML_ASSERT(galloc->buffers != NULL); + + galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1); + GGML_ASSERT(galloc->buf_tallocs != NULL); + + for (int i = 0; i < n_bufs; i++) { + galloc->bufts[i] = bufts[i]; + galloc->buffers[i] = NULL; + size_t alignment = ggml_backend_buft_get_alignment(bufts[i]); + galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment); + } + galloc->n_buffers = n_bufs; return galloc; } +ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) { + return ggml_gallocr_new_n(&buft, 1); +} + void ggml_gallocr_free(ggml_gallocr_t galloc) { if (galloc == NULL) { return; } - if (galloc->hash_set.keys != NULL) { - free(galloc->hash_set.keys); - } - if (galloc->hash_values != NULL) { - free(galloc->hash_values); - } - if (galloc->hash_allocs != NULL) { - free(galloc->hash_allocs); - } - if (galloc->parse_seq != NULL) { - free(galloc->parse_seq); + for (int i = 0; i < galloc->n_buffers; i++) { + if (galloc->buffers != NULL) { + ggml_backend_buffer_free(galloc->buffers[i]); + } + if (galloc->buf_tallocs != NULL) { + ggml_dyn_tallocr_free(galloc->buf_tallocs[i]); + } } + + free(galloc->hash_set.keys); + free(galloc->hash_values); + free(galloc->bufts); + free(galloc->buffers); + free(galloc->buf_tallocs); + free(galloc->node_allocs); free(galloc); } -void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) { - free(galloc->parse_seq); - galloc->parse_seq = malloc(sizeof(int) * n); +typedef struct ggml_gallocr * ggml_gallocr_t; - for (int i = 0; i < n; i++) { - galloc->parse_seq[i] = list[i]; - } - galloc->parse_seq_len = n; -} - -static struct hash_node * hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) { +static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) { size_t i = ggml_hash_find_or_insert(galloc->hash_set, t); return &galloc->hash_values[i]; } -static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { - if (a->type != b->type) { - return false; - } - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if (a->ne[i] != b->ne[i]) { - return false; - } - if (a->nb[i] != b->nb[i]) { - return false; - } - } - return true; +static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) { + return ggml_gallocr_hash_get(galloc, t)->allocated; } -static bool ggml_op_can_inplace(enum ggml_op op) { - switch (op) { - case GGML_OP_SCALE: - case GGML_OP_DIAG_MASK_ZERO: - case GGML_OP_DIAG_MASK_INF: - case GGML_OP_ADD: - case GGML_OP_ADD1: - case GGML_OP_SUB: - case GGML_OP_MUL: - case GGML_OP_DIV: - case GGML_OP_SQR: - case GGML_OP_SQRT: - case GGML_OP_LOG: - case GGML_OP_UNARY: - case GGML_OP_ROPE: - case GGML_OP_RMS_NORM: - case GGML_OP_SOFT_MAX: - return true; - - 
default: - return false; - } +static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) { + struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); + hn->buffer_id = buffer_id; + hn->offset = offset; + hn->allocated = true; } -static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * node) { - if (galloc->talloc != NULL) { - return galloc->talloc; - } - - return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)]; +static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) { + return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated; } -static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) { - ggml_tallocr_t alloc = node_tallocr(galloc, view); - - GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL); - if (update_backend) { - view->backend = view->view_src->backend; - } - // views are initialized in the alloc buffer rather than the view_src buffer - view->buffer = alloc->buffer; - view->data = (char *)view->view_src->data + view->view_offs; +static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) { + struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); - assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft); + if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) { + hn->allocated = true; + assert(hn->offset == 0); - if (!alloc->measure) { - ggml_backend_buffer_init_tensor(alloc->buffer, view); - } -} + // try to reuse a parent's buffer (inplace) + if (ggml_op_can_inplace(node->op)) { + for (int i = 0; i < GGML_MAX_SRC; i++) { + struct ggml_tensor * parent = node->src[i]; + if (parent == NULL) { + break; + } -static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) { - ggml_tallocr_t alloc = node_tallocr(galloc, node); + // if the node's data is external, then we cannot re-use it + if (!ggml_gallocr_is_own(galloc, parent)) { + AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); + continue; + } - if (node->data == NULL) { - if (ggml_is_view(node)) { - init_view(galloc, node, true); - } else { - // see if we can reuse a parent's buffer (inplace) - if (ggml_op_can_inplace(node->op)) { - for (int i = 0; i < GGML_MAX_SRC; i++) { - struct ggml_tensor * parent = node->src[i]; - if (parent == NULL) { - break; - } + // outputs cannot be reused + if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) { + AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name); + continue; + } - // if the node's data is external, then we cannot re-use it - if (ggml_tallocr_is_own(alloc, parent) == false) { - AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); - continue; - } + if (!ggml_are_same_layout(node, parent)) { + AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name); + continue; + } - struct hash_node * p_hn = hash_get(galloc, parent); - if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) { - if (ggml_is_view(parent)) { - struct ggml_tensor * view_src = parent->view_src; - struct hash_node * view_src_hn = hash_get(galloc, view_src); - if (view_src_hn->n_views == 1 && 
view_src_hn->n_children == 0 && view_src->data == parent->data) { - // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite - // the parent's data that it will need later (same layout requirement). the problem is that then - // we cannot free the tensor because the original address of the allocation is lost. - // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views - // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data) - AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); - node->view_src = view_src; - view_src_hn->n_views += 1; - init_view(galloc, node, false); - return; - } - } else { - AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); - node->view_src = parent; - p_hn->n_views += 1; - init_view(galloc, node, false); + struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent); + if (p_hn->n_children == 1 && p_hn->n_views == 0) { + if (ggml_is_view(parent)) { + struct ggml_tensor * view_src = parent->view_src; + struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src); + if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { + AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); + assert(view_src_hn->offset == p_hn->offset); + hn->buffer_id = p_hn->buffer_id; + hn->offset = p_hn->offset; + p_hn->allocated = false; // avoid freeing the parent + view_src_hn->allocated = false; return; } + } else { + AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); + hn->buffer_id = p_hn->buffer_id; + hn->offset = p_hn->offset; + p_hn->allocated = false; // avoid freeing the parent + return; } } } - ggml_tallocr_alloc(alloc, node); } + // allocate tensor from the buffer + struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; + ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; + size_t size = ggml_backend_buft_get_alloc_size(buft, node); + size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node); + hn->buffer_id = buffer_id; + hn->offset = offset; + return; } } -static void free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) { - ggml_tallocr_t alloc = node_tallocr(galloc, node); +static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) { + // graph outputs are never freed + if (node->flags & GGML_TENSOR_FLAG_OUTPUT) { + AT_PRINTF("not freeing output %s\n", node->name); + return; + } - ggml_tallocr_free_tensor(alloc, node); + struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; + ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; + struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); + size_t offset = hn->offset; + size_t size = ggml_backend_buft_get_alloc_size(buft, node); + ggml_dyn_tallocr_free_tensor(alloc, offset, size, node); + hn->allocated = false; } -static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * gf) { - const int * parse_seq = galloc->parse_seq; - int parse_seq_len = galloc->parse_seq_len; +static int get_node_buffer_id(const int * node_buffer_ids, int i) { + return node_buffer_ids ? 
node_buffer_ids[i] : 0; +} + +static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) { + // clear hash tables + memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *)); + memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node)); + + // allocate all graph inputs first to avoid overwriting them + for (int i = 0; i < graph->n_nodes; i++) { + if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) { + ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i)); + } + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (graph->nodes[i]->src[j] == NULL) { + break; + } + if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) { + ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i)); + } + } + } // count number of children and views - for (int i = 0; i < gf->n_nodes; i++) { - struct ggml_tensor * node = gf->nodes[i]; + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view(node)) { struct ggml_tensor * view_src = node->view_src; - hash_get(galloc, view_src)->n_views += 1; - if (node->buffer == NULL && node->data != NULL) { - // view of a pre-allocated tensor, didn't call init_view() yet - init_view(galloc, node, true); - } + ggml_gallocr_hash_get(galloc, view_src)->n_views += 1; } for (int j = 0; j < GGML_MAX_SRC; j++) { @@ -554,227 +573,283 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr if (parent == NULL) { break; } - hash_get(galloc, parent)->n_children += 1; - if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) { - init_view(galloc, parent, true); - } + ggml_gallocr_hash_get(galloc, parent)->n_children += 1; } } // allocate tensors - // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers - int last_barrier_pos = 0; - int n_nodes = parse_seq_len ? parse_seq_len : gf->n_nodes; - - for (int ind = 0; ind < n_nodes; ind++) { - // allocate a node if there is no parse_seq or this is not a barrier - if (parse_seq_len == 0 || parse_seq[ind] != -1) { - int i = parse_seq_len ? 
parse_seq[ind] : ind; - struct ggml_tensor * node = gf->nodes[i]; - - // allocate parents (leafs) - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - allocate_node(galloc, parent); + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + int buffer_id = get_node_buffer_id(node_buffer_ids, i); + + // allocate parents (only leafs need to be allocated at this point) + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; } + ggml_gallocr_allocate_node(galloc, parent, buffer_id); + } - // allocate node - allocate_node(galloc, node); + // allocate node + ggml_gallocr_allocate_node(galloc, node, buffer_id); - AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name); - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - AT_PRINTF("%s", parent->name); - if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { - AT_PRINTF(", "); - } + AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name); + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + AT_PRINTF("%s", parent->name); + if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { + AT_PRINTF(", "); } - AT_PRINTF("\n"); } + AT_PRINTF("\n"); // update parents - // update immediately if there is no parse_seq - // update only at barriers if there is parse_seq - if ((parse_seq_len == 0) || parse_seq[ind] == -1) { - int update_start = parse_seq_len ? last_barrier_pos : ind; - int update_end = parse_seq_len ? ind : ind + 1; - for (int i = update_start; i < update_end; i++) { - int node_i = parse_seq_len ? 
parse_seq[i] : i; - struct ggml_tensor * node = gf->nodes[node_i]; - - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - struct hash_node * p_hn = hash_get(galloc, parent); - p_hn->n_children -= 1; - - //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views); - - if (p_hn->n_children == 0 && p_hn->n_views == 0) { - if (ggml_is_view(parent)) { - struct ggml_tensor * view_src = parent->view_src; - struct hash_node * view_src_hn = hash_get(galloc, view_src); - view_src_hn->n_views -= 1; - AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); - if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0) { - free_node(galloc, view_src); - } - } - else { - free_node(galloc, parent); - } + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent); + p_hn->n_children -= 1; + + AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n", + parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated); + + if (p_hn->n_children == 0 && p_hn->n_views == 0) { + if (ggml_is_view(parent)) { + struct ggml_tensor * view_src = parent->view_src; + struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src); + view_src_hn->n_views -= 1; + AT_PRINTF("view_src %s: %d children, %d views\n", + view_src->name, view_src_hn->n_children, view_src_hn->n_views); + if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) { + ggml_gallocr_free_node(galloc, view_src, buffer_id); } } + else if (p_hn->allocated) { + ggml_gallocr_free_node(galloc, parent, buffer_id); + } } AT_PRINTF("\n"); - if (parse_seq_len) { - last_barrier_pos = ind + 1; - } } } } -size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph) { +bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) { size_t hash_size = graph->visited_hash_table.size; - // check if the hash table is initialized and large enough + // initialize hash table if (galloc->hash_set.size < hash_size) { - if (galloc->hash_set.keys != NULL) { - free(galloc->hash_set.keys); - } - if (galloc->hash_values != NULL) { - free(galloc->hash_values); - } - galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size); + free(galloc->hash_set.keys); + free(galloc->hash_values); galloc->hash_set.size = hash_size; - galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size); + galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size); + galloc->hash_values = calloc(sizeof(struct hash_node), hash_size); + GGML_ASSERT(galloc->hash_set.keys != NULL); + GGML_ASSERT(galloc->hash_values != NULL); + } else { + // reset hash table + memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size); + memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size); } - // reset hash table - memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * hash_size); - memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size); - - galloc->talloc = talloc; - ggml_tallocr_alloc_graph_impl(galloc, graph); - galloc->talloc = NULL; - - size_t max_size = ggml_tallocr_max_size(talloc); - - return max_size; -} - -void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct 
ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) { - const size_t hash_size = hash_set.size; - - GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs)); + // reset allocators + for (int i = 0; i < galloc->n_buffers; i++) { + ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]); + } - galloc->talloc = NULL; + // allocate in hash table + ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids); - // alloc hash_values if needed - if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) { - free(galloc->hash_values); - galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size); - galloc->hash_values_size = hash_size; + // set the node_allocs from the hash table + if (galloc->n_nodes < graph->n_nodes) { + free(galloc->node_allocs); + galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes); + GGML_ASSERT(galloc->node_allocs != NULL); } - - // free hash_set.keys if needed - if (galloc->hash_set.keys != NULL) { - free(galloc->hash_set.keys); + galloc->n_nodes = graph->n_nodes; + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + struct node_alloc * node_alloc = &galloc->node_allocs[i]; + node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i); + if (node->view_src || node->data) { + node_alloc->dst.offset = SIZE_MAX; + node_alloc->dst.size_max = 0; + } else { + struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); + node_alloc->dst.offset = hn->offset; + node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node); + } + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (!src || src->view_src || src->data) { + node_alloc->src[j].offset = SIZE_MAX; + node_alloc->src[j].size_max = 0; + } else { + struct hash_node * hn = ggml_gallocr_hash_get(galloc, src); + node_alloc->src[j].offset = hn->offset; + node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src); + } + } } - galloc->hash_set = hash_set; - // reset hash values - memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size); + // reallocate buffers if needed + for (int i = 0; i < galloc->n_buffers; i++) { + size_t cur_size = galloc->buffers[i] ? 
ggml_backend_buffer_get_size(galloc->buffers[i]) : 0; + size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]); - galloc->hash_allocs = hash_node_talloc; - - ggml_tallocr_alloc_graph_impl(galloc, graph); + if (new_size > cur_size) { +#ifndef NDEBUG + fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); +#endif + ggml_backend_buffer_free(galloc->buffers[i]); + galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size); + if (galloc->buffers[i] == NULL) { + fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); + return false; + } + } + } - // remove unowned resources - galloc->hash_set.keys = NULL; - galloc->hash_allocs = NULL; + return true; } -// legacy API wrapper - -struct ggml_allocr { - ggml_tallocr_t talloc; - ggml_gallocr_t galloc; -}; - -static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) { - ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr)); - *alloc = (struct ggml_allocr) { - /*.talloc = */ talloc, - /*.galloc = */ ggml_gallocr_new(), - }; - return alloc; +bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { + return ggml_gallocr_reserve_n(galloc, graph, NULL); } -ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment) { - return ggml_allocr_new_impl(ggml_tallocr_new(data, size, alignment)); -} +static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) { + assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max); -ggml_allocr_t ggml_allocr_new_measure(size_t alignment) { - return ggml_allocr_new_impl(ggml_tallocr_new_measure(alignment)); -} + if (node->view_src != NULL) { + if (node->buffer == NULL) { + assert(tensor_alloc->offset == SIZE_MAX); + if (node->view_src->buffer == NULL) { + // this tensor was allocated without ggml-backend + return; + } + ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node); + } + } else { + if (node->data == NULL) { + assert(tensor_alloc->offset != SIZE_MAX); + assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max); + void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]); + void * addr = (char *)base + tensor_alloc->offset; + ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr); + } else { + if (node->buffer == NULL) { + // this tensor was allocated without ggml-backend + return; + } -ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) { - return ggml_allocr_new_impl(ggml_tallocr_new_from_buffer(buffer)); +#ifndef NDEBUG + size_t offset = + (char *)node->data - + (char *)ggml_backend_buffer_get_base(node->buffer); + size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node); + assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset); + assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max); +#endif + } + } } -ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size) { - return ggml_allocr_new_impl(ggml_tallocr_new_from_backend(backend, size)); +static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct 
ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) { + ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id]; + size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node); + return talloc->size_max >= node_size; } -ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend) { - return ggml_allocr_new_impl(ggml_tallocr_new_measure_from_backend(backend)); -} +static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) { + if (galloc->n_nodes != graph->n_nodes) { +#ifndef NDEBUG + fprintf(stderr, "%s: graph has different number of nodes\n", __func__); +#endif + return true; + } -struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc) { - return ggml_tallocr_get_buffer(alloc->talloc); -} + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + struct node_alloc * node_alloc = &galloc->node_allocs[i]; -void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) { - ggml_gallocr_set_parse_seq(alloc->galloc, list, n); -} + if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) { +#ifndef NDEBUG + fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name); +#endif + return true; + } -void ggml_allocr_free(ggml_allocr_t alloc) { - if (alloc == NULL) { - return; + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) { +#ifndef NDEBUG + fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name); +#endif + return true; + } + } } - ggml_gallocr_free(alloc->galloc); - ggml_tallocr_free(alloc->talloc); - free(alloc); + return false; } -bool ggml_allocr_is_measure(ggml_allocr_t alloc) { - return ggml_tallocr_is_measure(alloc->talloc); -} +bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) { + if (ggml_gallocr_needs_realloc(galloc, graph)) { + if (galloc->n_buffers == 1) { +#ifndef NDEBUG + fprintf(stderr, "%s: reallocating buffers automatically\n", __func__); +#endif + if (!ggml_gallocr_reserve(galloc, graph)) { + return false; + } + } else { +#ifndef NDEBUG + fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__); +#endif + return false; + } + } -void ggml_allocr_reset(ggml_allocr_t alloc) { - ggml_tallocr_reset(alloc->talloc); -} + // reset buffers + for (int i = 0; i < galloc->n_buffers; i++) { + // zero size buffers are not allocated + if (galloc->buffers[i] != NULL) { + ggml_backend_buffer_reset(galloc->buffers[i]); + } + } -void ggml_allocr_alloc(ggml_allocr_t alloc, struct ggml_tensor * tensor) { - ggml_tallocr_alloc(alloc->talloc, tensor); -} + // allocate the graph tensors from the previous assignments + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + struct node_alloc * node_alloc = &galloc->node_allocs[i]; + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]); + } + ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst); + } -size_t ggml_allocr_max_size(ggml_allocr_t alloc) { - return ggml_tallocr_max_size(alloc->talloc); + return true; } -size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct 
ggml_cgraph * graph) { - return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph); +size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { + GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers); + + if (galloc->buffers[buffer_id] == NULL) { + return 0; + } + return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]); } // utils @@ -795,17 +870,17 @@ static bool alloc_tensor_range(struct ggml_context * ctx, return false; } - ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer); + struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer); for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) { if (t->data == NULL) { if (t->view_src == NULL) { ggml_tallocr_alloc(tallocr, t); - } else { + } else if (t->buffer == NULL) { ggml_backend_view_init(buffer, t); } } else { - if (t->view_src != NULL) { + if (t->view_src != NULL && t->buffer == NULL) { // view of a pre-allocated tensor ggml_backend_view_init(buffer, t); } @@ -838,7 +913,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } if (this_size > max_size) { - // tensor is too large to fit in a single buffer fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n", __func__, t->name, ggml_backend_buft_name(buft), @@ -870,7 +944,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } if (n_buffers == 0) { - // all the tensors in the context are already allocated #ifndef NDEBUG fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__); #endif |
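
For reference, the new graph-allocator API added by this change is driven roughly as follows. This is a minimal single-buffer sketch, not part of the commit: the CPU buffer type, tensor shapes, and memory-size arithmetic are illustrative assumptions, and evaluating the graph with a backend is omitted.

#include <stdio.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

int main(void) {
    // metadata-only context: tensor data is assigned later by the graph allocator
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    // mark graph inputs so the allocator places them first and never reuses their memory
    a->flags |= GGML_TENSOR_FLAG_INPUT;
    b->flags |= GGML_TENSOR_FLAG_INPUT;
    struct ggml_tensor * c = ggml_mul(ctx, a, b);

    struct ggml_cgraph * graph = ggml_new_graph(ctx);
    ggml_build_forward_expand(graph, c);

    // one allocator per buffer type; the single-buffer constructor wraps ggml_gallocr_new_n()
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

    // reserve once (e.g. with a worst-case graph) to size the backend buffer,
    // then assign tensor addresses for each graph that is evaluated
    ggml_gallocr_reserve(galloc, graph);
    ggml_gallocr_alloc_graph(galloc, graph);

    fprintf(stderr, "compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));

    ggml_gallocr_free(galloc);
    ggml_free(ctx);
    return 0;
}

ggml_gallocr_reserve() is only strictly required for multi-buffer setups created with ggml_gallocr_new_n(); with a single buffer, ggml_gallocr_alloc_graph() reallocates the buffer on demand, as handled by ggml_gallocr_needs_realloc() above.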