author     slaren <slarengh@gmail.com>   2023-12-21 21:07:46 +0100
committer  GitHub <noreply@github.com>   2023-12-21 21:07:46 +0100
commit     d232aca5a73b290e218a2e48b91023d5e994203f (patch)
tree       e763648880fad8ef44be54c9cb59c9c7dbda4168 /ggml-alloc.c
parent     31f27758faf4a4bd08101a57c7ec3a473f771f86 (diff)
llama : initial ggml-backend integration (#4520)
* llama : initial ggml-backend integration
* add ggml-metal
* cuda backend can be used through ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
  access all tensor data with ggml_backend_tensor_get/set
* add ggml_backend_buffer_clear
  zero-init KV cache buffer
* add ggml_backend_buffer_is_host, used to avoid copies if possible when accessing tensor data
* disable gpu backends with ngl 0
* more accurate mlock
* unmap offloaded part of the model
* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap
* update quantize and lora
* update session copy/set to use ggml-backend
  ggml-ci
* use posix_fadvise instead of posix_fadvise64
* ggml_backend_alloc_ctx_tensors_from_buft : remove old print
* llama_mmap::align_offset : use pointers instead of references for out parameters
* restore progress_callback behavior
* move final progress_callback call to load_all_data
* cuda : fix fprintf format string (minor)
* do not offload scales
* llama_mmap : avoid unmapping the same fragments again in the destructor
* remove unnecessary unmap
* metal : add default log function that prints to stderr, cleanup code
  ggml-ci
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
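The message above mentions hinting the kernel with posix_fadvise(.., POSIX_FADV_SEQUENTIAL) so that faulting in an mmap'd model file reads faster. As a rough, self-contained sketch of that pattern only (not the llama.cpp implementation; the file name "model.gguf" and the error handling are placeholders):

/* sketch: advise sequential access before reading an mmap'd file */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void) {
    int fd = open("model.gguf", O_RDONLY);
    if (fd < 0) { perror("open"); return 1; }

    struct stat st;
    if (fstat(fd, &st) != 0) { perror("fstat"); close(fd); return 1; }

    // tell the kernel the upcoming page faults will walk the file in order
    posix_fadvise(fd, 0, st.st_size, POSIX_FADV_SEQUENTIAL);

    void * data = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
    if (data == MAP_FAILED) { perror("mmap"); close(fd); return 1; }

    /* ... read tensor data sequentially from `data` ... */

    munmap(data, st.st_size);
    close(fd);
    return 0;
}

The advice is only a hint; a kernel that ignores it simply falls back to its default readahead behavior.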
Diffstat (limited to 'ggml-alloc.c')
-rw-r--r--  ggml-alloc.c  16
1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/ggml-alloc.c b/ggml-alloc.c
index d3049efb..a97436b1 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
     if (update_backend) {
         view->backend = view->view_src->backend;
     }
-    view->buffer = view->view_src->buffer;
+    // views are initialized in the alloc buffer rather than the view_src buffer
+    view->buffer = alloc->buffer;
     view->data = (char *)view->view_src->data + view->view_offs;
-    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
-    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
     assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
     if (!alloc->measure) {
@@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
 }
 void ggml_allocr_free(ggml_allocr_t alloc) {
+    if (alloc == NULL) {
+        return;
+    }
+
     ggml_gallocr_free(alloc->galloc);
     ggml_tallocr_free(alloc->talloc);
     free(alloc);
@@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
     }
     if (nbytes == 0) {
-        fprintf(stderr, "%s: no tensors to allocate\n", __func__);
+        // all the tensors in the context are already allocated
         return NULL;
     }
@@ -789,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
             } else {
                 ggml_backend_view_init(buffer, t);
             }
+        } else {
+            if (t->view_src != NULL) {
+                // view of a pre-allocated tensor
+                ggml_backend_view_init(buffer, t);
+            }
         }
     }
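The ggml_allocr_free hunk above makes the function tolerate a NULL allocator, mirroring free(NULL), so callers can free unconditionally on error and cleanup paths. A minimal sketch of the same defensive pattern, using a hypothetical allocator type rather than ggml code:

#include <stdlib.h>

/* toy_allocr is a made-up type used only to illustrate the NULL-tolerant
   free pattern added to ggml_allocr_free above */
struct toy_allocr {
    void * pool;
};

void toy_allocr_free(struct toy_allocr * a) {
    if (a == NULL) {
        return; // freeing NULL is a no-op, like free(NULL)
    }
    free(a->pool);
    free(a);
}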