From ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Fri, 29 Mar 2024 17:29:21 +0100
Subject: Vulkan k-quant mmq and ggml-backend offload functionality (#6155)

* Fix Vulkan no kv offload incoherence

* Add k-quant mul mat mat shaders

* Rework working buffer allocation, reduces vram use noticeably

Clean up cpu assist code, replaced with ggml-backend offload function

* Default to all dedicated GPUs

* Add fallback for integrated GPUs if no dedicated GPUs are found

* Add debug info which device is allocating memory

* Fix Intel dequant issue

Fix validation issue

* Fix Vulkan GGML_OP_GET_ROWS implementation

* Clean up merge artifacts

* Remove Vulkan warning
---
 llama.cpp | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'llama.cpp')

diff --git a/llama.cpp b/llama.cpp
index 97408ba1..21e7a067 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2121,10 +2121,6 @@ struct llama_context {
             ggml_backend_free(backend);
         }
 
-#ifdef GGML_USE_VULKAN
-        ggml_vk_free_cpu_assist();
-#endif
-
         ggml_backend_buffer_free(buf_output);
     }
 
@@ -14131,7 +14127,20 @@ struct llama_context * llama_new_context_with_model(
             }
         }
 #elif defined(GGML_USE_VULKAN)
-        if (model->n_gpu_layers > 0) {
+        if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
+            ggml_backend_t backend = ggml_backend_vk_init(0);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        } else {
             for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
                 ggml_backend_t backend = ggml_backend_vk_init(device);
                 if (backend == nullptr) {
--
cgit v1.2.3
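
Caller-side view of the second hunk: with the Vulkan backend, llama_model_params.split_mode now selects between a single Vulkan device (LLAMA_SPLIT_MODE_NONE, device 0), a layer split over all detected GPUs (the default, preferring dedicated GPUs and falling back to integrated ones per the commit message), and a hard failure for LLAMA_SPLIT_MODE_ROW. The sketch below is not part of the patch; it assumes the llama.h API of this period (llama_backend_init, llama_load_model_from_file, llama_new_context_with_model) and uses a hypothetical model path.

// Hypothetical caller-side sketch showing how Vulkan backend selection
// behaves per split_mode after this change (not part of the patch).
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;                    // offload all layers
    // LLAMA_SPLIT_MODE_LAYER (default): one Vulkan backend per detected GPU.
    // LLAMA_SPLIT_MODE_NONE: a single Vulkan backend on device 0.
    // LLAMA_SPLIT_MODE_ROW: unsupported; context creation fails below.
    mparams.split_mode = LLAMA_SPLIT_MODE_NONE;

    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // hypothetical path
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == nullptr) {
        // With split_mode == LLAMA_SPLIT_MODE_ROW this is the expected path:
        // "Row split not supported. Failed to initialize Vulkan backend"
        fprintf(stderr, "failed to create context\n");
        llama_free_model(model);
        return 1;
    }

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}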