author     Georgi Gerganov <ggerganov@gmail.com>  2023-12-07 22:26:54 +0200
committer  GitHub <noreply@github.com>            2023-12-07 22:26:54 +0200
commit     fe680e3d1080a765e5d3150ffd7bab189742898d (patch)
tree       cd8be8bf5722d10596923aef7fb44bf8a58378d7 /ggml-alloc.c
parent     bcc0eb4591bec5ec02fad3f2bdcb1b265052ea56 (diff)
sync : ggml (new ops, tests, backend, etc.) (#4359)
* sync : ggml (part 1)
* sync : ggml (part 2, CUDA)
* sync : ggml (part 3, Metal)
* ggml : build fixes

ggml-ci

* cuda : restore lost changes
* cuda : restore lost changes (StableLM rope)
* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize
* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1c4ca67f9685690160b35bc885a89ac.

* cuda : remove assert for rope
* tests : add test-backend-ops
* ggml : fix bug in ggml_concat
* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`
* ci : try to fix macOS
* ggml-backend : remove backend self-registration
* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call
* metal : fix assert
* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
Diffstat (limited to 'ggml-alloc.c')
-rw-r--r--  ggml-alloc.c  49
1 file changed, 42 insertions(+), 7 deletions(-)
diff --git a/ggml-alloc.c b/ggml-alloc.c
index 0d4e12ae..d3049efb 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -168,10 +168,6 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
     size = aligned_offset(NULL, size, alloc->alignment);
     AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
 
-    if (!alloc->measure) {
-        ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
-    }
-
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, tensor);
 #endif
@@ -237,7 +233,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
 }
 
 ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
-    struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
+    struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size);
 
     ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
 
@@ -449,7 +445,6 @@ static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * n
 static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
     ggml_tallocr_t alloc = node_tallocr(galloc, view);
 
-    //printf("init_view: %s from src %s\n", view->name, view->view_src->name);
     GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
     if (update_backend) {
         view->backend = view->view_src->backend;
@@ -459,7 +454,7 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
 
     // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
     // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
-    assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
+    assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
 
     if (!alloc->measure) {
         ggml_backend_buffer_init_tensor(alloc->buffer, view);
@@ -765,3 +760,43 @@ size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
 size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
     return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
 }
+
+// utils
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
+
+    size_t alignment = ggml_backend_buft_get_alignment(buft);
+
+    size_t nbytes = 0;
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        if (t->data == NULL && t->view_src == NULL) {
+            nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
+        }
+    }
+
+    if (nbytes == 0) {
+        fprintf(stderr, "%s: no tensors to allocate\n", __func__);
+        return NULL;
+    }
+
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+    ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
+
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        if (t->data == NULL) {
+            if (t->view_src == NULL) {
+                ggml_tallocr_alloc(tallocr, t);
+            } else {
+                ggml_backend_view_init(buffer, t);
+            }
+        }
+    }
+
+    ggml_tallocr_free(tallocr);
+
+    return buffer;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
+    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
+}
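The two helpers added above give a one-call path from a metadata-only ggml_context to a fully allocated backend buffer. The following is a minimal usage sketch, not part of this commit, assuming the CPU backend and the declarations from this sync in ggml.h, ggml-backend.h and ggml-alloc.h; error handling is omitted.

#include <stdio.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

int main(void) {
    // metadata-only context: no_alloc must be true for ggml_backend_alloc_ctx_tensors
    struct ggml_init_params params = {
        /*.mem_size   =*/ 2*ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);

    // one buffer of the backend's default buffer type holds both tensors
    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    printf("buffer size: %zu bytes, a->data=%p, b->data=%p\n",
           ggml_backend_buffer_get_size(buf), a->data, b->data);

    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    ggml_free(ctx);
    return 0;
}

Since ggml_backend_alloc_ctx_tensors simply forwards to ggml_backend_alloc_ctx_tensors_from_buft with the backend's default buffer type, the same pattern should carry over to other backends by swapping the backend init call.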