author    | slaren <slarengh@gmail.com> | 2024-01-12 20:07:38 +0100
committer | GitHub <noreply@github.com> | 2024-01-12 20:07:38 +0100
commit    | e7e4df031b9e29d4b55a4e0b0295187f6b213db1 (patch)
tree      | 93211b7800be3c2c5f9eb1d55f3b7b3acdc56c9b /ggml-alloc.c
parent    | 584d674be622fbf1578694ada6e62eebedbfd377 (diff)
llama : ggml-backend integration (#4766)
* llama : ggml-backend integration
* ggml-backend : add names to buffers
* fix unmap after loading
* batched-bench : add tensor_split param
* llama : check for null tensor_split
* ggml-backend : increase GGML_MAX_BACKENDS
* improve graph splitting, partial fix for --no-kv-offload
* cuda : add ggml-backend split buffer support
* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)
* ggml : fix null backend dereference (#4807)
* ggml : fix null backend dereference
* ggml : also check ggml_backend_is_cpu
* test-backend-ops : check buffer allocation failures
* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row); see the usage sketch after this commit message
* ggml : fix mul_mat_id work size
* llama : rewrite session kv load/set without graphs
* minor
* llama : only initialize used backends, free backends on context free
* llama : abort ctx if cuda backend init fails
* llama : rewrite lora with ggml-backend and compute on CPU
ggml-ci
* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer
* opencl : add ggml-backend buffer type
* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)
* llama : on Metal, by default offload the full model
ggml-ci
* metal : page align the data ptr (#4854)
* Apply suggestions from code review
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
* cuda : fix split buffer free
* address review comments
* llama-bench : add split-mode parameter
* fix whitespace
* opencl : fix double initialization
* server : add --split-mode parameter
* use async copy and compute to improve multi-gpu performance
ggml-ci
* use async memcpys to copy the graph outputs to the CPU
* fix opencl
* use a host buffer for the cpu compute buffer for faster copies to the gpu
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
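Below is a short sketch of how the new split mode might be selected programmatically when loading a model. The field and enum names (split_mode, LLAMA_SPLIT_NONE/LAYER/ROW) are assumptions taken from the PR description, not verified against llama.h at this exact revision; on the command line the same choice is exposed as --split-mode / -sm with values none, layer or row.

```c
#include "llama.h"

// hedged sketch: select row-wise tensor splitting across GPUs when loading a model
// (field and enum names assumed from the PR description, not verified for this revision)
static struct llama_model * load_with_row_split(const char * path) {
    struct llama_model_params mparams = llama_model_default_params();

    mparams.n_gpu_layers = 99;              // offload all layers
    mparams.split_mode   = LLAMA_SPLIT_ROW; // assumed enum; alternatives: LLAMA_SPLIT_NONE, LLAMA_SPLIT_LAYER
    mparams.main_gpu     = 0;               // GPU used for small tensors and intermediate results

    return llama_load_model_from_file(path, mparams);
}
```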
Diffstat (limited to 'ggml-alloc.c')
-rw-r--r-- | ggml-alloc.c | 34
1 file changed, 28 insertions, 6 deletions
diff --git a/ggml-alloc.c b/ggml-alloc.c
index a27dd54b..89b85d34 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -102,8 +102,6 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         }
     }
 
-    AT_PRINTF("block %d\n", best_fit_block);
-
     if (best_fit_block == -1) {
         // the last block is our last resort
         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
@@ -117,6 +115,7 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
             return;
         }
     }
+
     struct free_block * block = &alloc->free_blocks[best_fit_block];
     void * addr = block->addr;
     block->addr = (char*)block->addr + size;
@@ -129,6 +128,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         }
     }
 
+    AT_PRINTF("block %d, addr %p\n", best_fit_block, addr);
+
     tensor->data = addr;
     tensor->buffer = alloc->buffer;
     if (!alloc->measure) {
@@ -229,6 +230,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
         alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
     } else {
         alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
+        ggml_backend_buffer_reset(alloc->buffer);
     }
 }
 
@@ -263,9 +265,9 @@ ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
     return alloc;
 }
 
-ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
+ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
     // create a backend buffer to get the correct tensor allocation sizes
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, 1);
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);
 
     // TODO: move alloc initialization to a common ggml_tallocr_new_impl function
     ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
@@ -275,13 +277,22 @@ ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backe
     return alloc;
 }
 
-ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size);
+ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
+    return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
+}
+
+ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
+    // create a backend buffer to get the correct tensor allocation sizes
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
     ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
     alloc->buffer_owned = true;
     return alloc;
 }
 
+ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
+    return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
+}
+
 ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
     ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
 
@@ -779,10 +790,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
 
     if (nbytes == 0) {
         // all the tensors in the context are already allocated
+#ifndef NDEBUG
+        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+#endif
         return NULL;
     }
 
     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+    if (buffer == NULL) {
+        // failed to allocate buffer
+#ifndef NDEBUG
+        fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+#endif
+        return NULL;
+    }
+
     ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
 
     for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
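For context, here is a minimal sketch (not part of the commit) of how ggml_backend_alloc_ctx_tensors_from_buft is typically used, including a check for the NULL return that this change adds on buffer allocation failure. The CPU buffer type and the tensor shapes are used purely as an example.

```c
#include <stdio.h>

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

int main(void) {
    // no_alloc context: only tensor metadata is stored here,
    // the tensor data will live in a backend buffer
    struct ggml_init_params params = {
        /*.mem_size   =*/ 4*ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    (void) a; (void) b;

    // allocate all tensors of the context in a single buffer of the given type
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
    if (buf == NULL) {
        // with this change, NULL is returned both when the allocation fails
        // and when all tensors in the context are already allocated
        fprintf(stderr, "failed to allocate buffer\n");
        ggml_free(ctx);
        return 1;
    }

    // ... build graphs and compute with a and b ...

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    return 0;
}
```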