Diffstat (limited to 'ggml/src/ggml-vulkan.cpp')
-rw-r--r--  ggml/src/ggml-vulkan.cpp | 965
1 file changed, 669 insertions(+), 296 deletions(-)
diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index 6bcd81a7..86732837 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -177,24 +177,33 @@ struct vk_device_struct {
vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
+ vk_pipeline pipeline_add_f32, pipeline_add_f16_f32_f16;
vk_pipeline pipeline_mul_f32;
vk_pipeline pipeline_div_f32;
- vk_pipeline pipeline_add_f32;
+ vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32;
+ vk_pipeline pipeline_upscale_f32;
vk_pipeline pipeline_scale_f32;
vk_pipeline pipeline_sqr_f32;
vk_pipeline pipeline_clamp_f32;
+ vk_pipeline pipeline_pad_f32;
vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
vk_pipeline pipeline_norm_f32;
+ vk_pipeline pipeline_group_norm_f32;
vk_pipeline pipeline_rms_norm_f32;
vk_pipeline pipeline_gelu_f32;
+ vk_pipeline pipeline_gelu_quick_f32;
vk_pipeline pipeline_silu_f32;
vk_pipeline pipeline_relu_f32;
+ vk_pipeline pipeline_leaky_relu_f32;
+ vk_pipeline pipeline_tanh_f32;
vk_pipeline pipeline_diag_mask_inf_f32;
vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
vk_pipeline pipeline_argsort_f32;
vk_pipeline pipeline_sum_rows_f32;
+ vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
+ vk_pipeline pipeline_timestep_embedding_f32;
std::vector<vk_pipeline_ref> pipelines;
@@ -236,8 +245,8 @@ struct vk_device_struct {
};
struct vk_buffer_struct {
- vk::Buffer buffer;
- vk::DeviceMemory device_memory;
+ vk::Buffer buffer = VK_NULL_HANDLE;
+ vk::DeviceMemory device_memory = VK_NULL_HANDLE;
vk::MemoryPropertyFlags memory_property_flags;
void * ptr;
size_t size = 0;
@@ -259,6 +268,10 @@ struct vk_subbuffer {
vk_buffer buffer;
uint64_t offset;
uint64_t size;
+
+ operator vk::DescriptorBufferInfo() const {
+ return { buffer->buffer, offset, size };
+ }
};
struct vk_semaphore {
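The conversion operator added above is what later hunks rely on when passing vk_subbuffer values directly into the braced descriptor lists of ggml_vk_dispatch_pipeline. A minimal standalone sketch of the equivalence (the vk_buffer named buf is hypothetical, not taken from this diff):

    // sketch only: implicit conversion from vk_subbuffer to vk::DescriptorBufferInfo
    vk_subbuffer sub { buf, /*offset=*/0, /*size=*/VK_WHOLE_SIZE };
    vk::DescriptorBufferInfo info = sub;  // uses the operator defined above
    // info.buffer == buf->buffer, info.offset == 0, info.range == VK_WHOLE_SIZE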
@@ -320,7 +333,7 @@ struct vk_op_binary_push_constants {
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
uint32_t d_offset;
- float param1; float param2;
+ float param1; float param2; int32_t param3;
};
struct vk_op_diag_mask_push_constants {
@@ -358,6 +371,25 @@ struct vk_op_argsort_push_constants {
int32_t order;
};
+struct vk_op_im2col_push_constants {
+ uint32_t batch_offset; uint32_t offset_delta;
+ uint32_t IC;
+ uint32_t IW; uint32_t IH;
+ uint32_t OW; uint32_t OH;
+ uint32_t KW; uint32_t KH;
+ uint32_t pelements;
+ uint32_t CHW;
+ int32_t s0; int32_t s1;
+ int32_t p0; int32_t p1;
+ int32_t d0; int32_t d1;
+};
+
+struct vk_op_timestep_embedding_push_constants {
+ uint32_t nb1;
+ uint32_t dim;
+ uint32_t max_period;
+};
+
// Allow pre-recording command buffers
struct vk_staging_memcpy {
vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
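The new im2col push constants carry the usual convolution bookkeeping. A hedged sketch of how the fields relate in a 2D case (all concrete values are hypothetical, the host-side fill code is outside this diff, and batch_offset/offset_delta are omitted):

    // sketch only: hypothetical 3x3, stride-1, pad-1, dilation-1 2D convolution
    vk_op_im2col_push_constants pc {};
    pc.IC = 16; pc.IW = 64; pc.IH = 64;                                  // input channels / width / height
    pc.KW = 3;  pc.KH = 3;                                               // kernel size
    pc.s0 = 1;  pc.s1 = 1;  pc.p0 = 1;  pc.p1 = 1;  pc.d0 = 1;  pc.d1 = 1;
    pc.OW = (pc.IW + 2 * pc.p0 - pc.d0 * (pc.KW - 1) - 1) / pc.s0 + 1;   // 64
    pc.OH = (pc.IH + 2 * pc.p1 - pc.d1 * (pc.KH - 1) - 1) / pc.s1 + 1;   // 64
    pc.CHW = pc.IC * pc.KH * pc.KW;                                      // length of one im2col row
    pc.pelements = pc.OW * pc.KW * pc.KH;                                // matches elements[0] in the dispatch code further down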
@@ -367,28 +399,32 @@ struct vk_staging_memcpy {
size_t n;
};
-struct vk_context {
- size_t idx;
+struct vk_op_upscale_push_constants {
+ uint32_t ne; uint32_t d_offset;
+ uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
+ uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
+ float sf0; float sf1; float sf2; float sf3;
+};
+struct vk_context_struct {
vk_submission * s;
std::vector<vk_sequence> seqs;
- ggml_tensor * exit_tensor;
+ int exit_tensor_idx;
std::vector<vk_staging_memcpy> in_memcpys;
std::vector<vk_staging_memcpy> out_memcpys;
vk_queue * q;
};
+typedef std::shared_ptr<vk_context_struct> vk_context;
+typedef std::weak_ptr<vk_context_struct> vk_context_ref;
struct ggml_tensor_extra_gpu {
- size_t ctx_idx;
-
vk_buffer_ref buffer_gpu;
uint64_t offset;
void reset() {
- ctx_idx = 0;
buffer_gpu.reset();
offset = 0;
}
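For the upscale push constants, the scale factors are the per-dimension ratios between destination and source extents. A hedged illustration only (dst and src0 stand for the usual ggml_tensor pointers; the real fill code is outside this section):

    // sketch only: per-dimension scale factors for GGML_OP_UPSCALE
    vk_op_upscale_push_constants pc {};
    pc.ne  = (uint32_t)ggml_nelements(dst);
    pc.sf0 = (float)dst->ne[0] / src0->ne[0];
    pc.sf1 = (float)dst->ne[1] / src0->ne[1];
    pc.sf2 = (float)dst->ne[2] / src0->ne[2];
    pc.sf3 = (float)dst->ne[3] / src0->ne[3];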
@@ -459,8 +495,10 @@ struct ggml_backend_vk_context {
vk_buffer buffer_pool[MAX_VK_BUFFERS];
- vk_context * compute_ctx;
- vk_context * transfer_ctx;
+ vk_context_ref compute_ctx;
+ vk_context_ref transfer_ctx;
+
+ std::vector<vk_context_ref> tensor_ctxs;
};
#ifdef GGML_VULKAN_MEMORY_DEBUG
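Because compute_ctx and transfer_ctx are now weak references, call sites must lock them before use and recreate the context once the reference has expired. A minimal sketch of that pattern (condensed and hypothetical, not copied from this diff):

    // sketch only: obtaining a live vk_context from a vk_context_ref
    vk_context transfer_ctx;
    if (ctx->transfer_ctx.expired()) {
        transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
        ctx->transfer_ctx = transfer_ctx;            // store back only a weak reference
        ggml_vk_ctx_begin(ctx->device, transfer_ctx);
    } else {
        transfer_ctx = ctx->transfer_ctx.lock();
    }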
@@ -510,12 +548,12 @@ static vk_instance_t vk_instance;
static size_t vk_skip_checks;
static size_t vk_output_tensor;
-static void ggml_vk_print_tensor(ggml_backend * ctx, const ggml_tensor * tensor, const char * name);
-static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * tensor);
-static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * tensor);
+static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name);
+static void ggml_vk_check_results_0(ggml_tensor * tensor);
+static void ggml_vk_check_results_1(ggml_tensor * tensor);
#endif
-typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
@@ -708,11 +746,11 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s
return s;
}
-static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
- VK_LOG_DEBUG("ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")");
+static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
if (ctx->seqs.empty()) {
return;
}
+ VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")");
std::vector<std::vector<uint64_t>> tl_wait_vals;
std::vector<std::vector<uint64_t>> tl_signal_vals;
@@ -844,21 +882,17 @@ static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_
q.stage_flags = stage_flags;
}
-static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
- VK_LOG_DEBUG("ggml_vk_create_context()");
- ctx->gc.contexts.emplace_back();
- vk_context * result = &ctx->gc.contexts[ctx->gc.contexts.size() - 1];
- memset((void *) result, 0, sizeof(vk_context));
- result->idx = ctx->gc.contexts.size() - 1;
+static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
+ vk_context result = std::make_shared<vk_context_struct>();
+ VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")");
+ ctx->gc.contexts.emplace_back(result);
result->q = &q;
return result;
}
-static vk_context * ggml_vk_create_temporary_context(vk_queue& q) {
- VK_LOG_DEBUG("ggml_vk_create_temporary_context()");
- vk_context * result = new vk_context;
- memset((void *) result, 0, sizeof(vk_context));
- result->idx = 0;
+static vk_context ggml_vk_create_temporary_context(vk_queue& q) {
+ vk_context result = std::make_shared<vk_context_struct>();
+ VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")");
result->q = &q;
return result;
}
@@ -915,6 +949,10 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr
static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")");
+ if (size > device->max_memory_allocation_size) {
+ throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit");
+ }
+
std::lock_guard<std::mutex> guard(device->mutex);
vk_buffer buf = std::make_shared<vk_buffer_struct>();
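Callers that already guard allocations against device-memory exhaustion pick up this new failure path automatically, since vk::OutOfDeviceMemoryError derives from vk::SystemError. A hedged sketch of such a call site (hypothetical, not from this diff):

    // sketch only: an oversize request now surfaces as a regular allocation failure
    try {
        vk_buffer buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
        // ... use buf ...
    } catch (const vk::SystemError& e) {
        std::cerr << "ggml_vulkan: buffer allocation failed: " << e.what() << std::endl;
    }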
@@ -1027,21 +1065,22 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
return { buf, 0, VK_WHOLE_SIZE };
}
-static void ggml_vk_sync_buffers(vk_context * ctx) {
+static void ggml_vk_sync_buffers(vk_context& ctx) {
VK_LOG_DEBUG("ggml_vk_sync_buffers()");
- const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } };
-
ctx->s->buffer.pipelineBarrier(
ctx->q->stage_flags,
ctx->q->stage_flags,
{},
- mem_barriers,
+ { {
+ {vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite},
+ {vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite}
+ } },
{},
{}
);
}
-static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& events) {
+static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events) {
VK_LOG_DEBUG("ggml_vk_wait_events()");
if (events.empty()) {
return;
@@ -1598,6 +1637,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
@@ -1605,20 +1645,31 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
-
ggml_vk_create_pipeline(device, device->pipeline_div_f32, "div_f32", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_concat_f32, "concat_f32", concat_f32_len, concat_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
+
+ ggml_vk_create_pipeline(device, device->pipeline_upscale_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {}, 1);
+
ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
ggml_vk_create_pipeline(device, device->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_gelu_quick_f32, "gelu_quick_f32", gelu_quick_f32_len, gelu_quick_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_relu_f32, "relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_tanh_f32, "tanh_f32", tanh_f32_len, tanh_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);
@@ -1634,6 +1685,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+
+ ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
+
+ ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
}
static vk_device ggml_vk_get_device(size_t idx) {
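For readers unfamiliar with the helper, the argument roles in the new pipeline registrations above are restated as a commented sketch (same call as above, comments added for illustration only):

    // sketch only: argument roles of ggml_vk_create_pipeline
    ggml_vk_create_pipeline(device,
        device->pipeline_im2col_f32,            // pipeline slot being filled
        "im2col_f32",                           // debug name
        im2col_f32_len, im2col_f32_data,        // embedded SPIR-V blob
        "main",                                 // shader entry point
        2,                                      // descriptor count (src and dst buffers)
        sizeof(vk_op_im2col_push_constants),    // push constant block size
        {256, 1, 1},                            // workgroup denominators
        {},                                     // specialization constants
        1);                                     // alignment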
@@ -1961,7 +2017,7 @@ void ggml_vk_instance_init() {
// Make sure at least one device exists
if (devices.empty()) {
std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
// Default to using all dedicated GPUs
@@ -2057,9 +2113,9 @@ void ggml_vk_instance_init() {
}
static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
- GGML_ASSERT(idx < vk_instance.device_indices.size());
VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << idx << ")");
ggml_vk_instance_init();
+ GGML_ASSERT(idx < vk_instance.device_indices.size());
ctx->name = GGML_VK_NAME + std::to_string(idx);
@@ -2077,9 +2133,6 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
ctx->staging_size = 0;
ctx->staging_offset = 0;
- ctx->compute_ctx = nullptr;
- ctx->transfer_ctx = nullptr;
-
#ifdef GGML_VULKAN_CHECK_RESULTS
const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks));
@@ -2112,7 +2165,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
}
static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
- VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline()");
+ VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline(" << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")");
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
return ctx->device->pipeline_matmul_f32;
}
@@ -2126,7 +2179,9 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
return ctx->device->pipeline_matmul_f16;
}
- GGML_ASSERT(src1_type == GGML_TYPE_F32);
+ if (src1_type != GGML_TYPE_F32) {
+ return nullptr;
+ }
switch (src0_type) {
case GGML_TYPE_Q4_0:
@@ -2370,28 +2425,23 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo
return s;
}
-static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, std::vector<vk_subbuffer>&& buffers, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
+
+
+static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
- for (auto& buffer : buffers) {
- std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
+ for (auto& buffer : descriptor_buffer_infos) {
+ std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), ";
}
std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
- std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
- std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
- GGML_ASSERT(buffers.size() == pipeline->parameter_count);
- vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
- for (uint32_t i = 0; i < pipeline->parameter_count; i++) {
- descriptor_buffer_infos.push_back({buffers[i].buffer->buffer, buffers[i].offset, buffers[i].size});
- }
- for (uint32_t i = 0; i < pipeline->parameter_count; i++) {
- write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]});
- }
+ GGML_ASSERT(descriptor_buffer_infos.size() == pipeline->parameter_count);
- ctx->device->device.updateDescriptorSets(write_descriptor_sets, {});
+ vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
+ vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
+ ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});
subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants);
subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
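The single vk::WriteDescriptorSet above relies on Vulkan's rule that a write with descriptorCount greater than one spills over into the following bindings, which matches the shaders' storage buffers at consecutive bindings starting from 0. The same idea as a standalone sketch (helper name hypothetical):

    // sketch only: update storage-buffer bindings 0..count-1 with one descriptor write
    static void write_storage_buffers(vk::Device dev, vk::DescriptorSet set,
                                      const vk::DescriptorBufferInfo * infos, uint32_t count) {
        vk::WriteDescriptorSet w { set, /*dstBinding=*/0, /*dstArrayElement=*/0, count,
                                   vk::DescriptorType::eStorageBuffer, nullptr, infos };
        dev.updateDescriptorSets({ w }, {});
    }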
@@ -2410,7 +2460,7 @@ static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> w
s.signal_semaphores = std::move(signal_semaphores);
}
-static void ggml_vk_ctx_end(vk_context * ctx) {
+static void ggml_vk_ctx_end(vk_context& ctx) {
VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")");
if (ctx->s == nullptr) {
return;
@@ -2420,7 +2470,7 @@ static void ggml_vk_ctx_end(vk_context * ctx) {
ctx->s = nullptr;
}
-static void ggml_vk_ctx_begin(vk_device& device, vk_context * subctx) {
+static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) {
VK_LOG_DEBUG("ggml_vk_ctx_begin(" << device->name << ")");
if (subctx->s != nullptr) {
ggml_vk_ctx_end(subctx);
@@ -2453,13 +2503,13 @@ static void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size) {
}
}
-static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
+static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context& subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")");
GGML_ASSERT(!ggml_is_contiguous(tensor));
// Buffer is already mapped
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl;
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
// Check if src is pinned memory
vk_buffer buf;
@@ -2527,7 +2577,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
staging = ctx->device->sync_staging;
staging_offset = 0;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
@@ -2558,12 +2608,12 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
}
}
-static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) {
+static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) {
VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
// Buffer is already mapped
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl;
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
// Check if src is pinned memory
vk_buffer buf = nullptr;
@@ -2602,7 +2652,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s
staging_buffer = dst->device->sync_staging;
staging_offset = 0;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
@@ -2623,7 +2673,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s
}
}
-static void ggml_vk_buffer_write_async(vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) {
+static void ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) {
VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")");
return ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, size, size, 1, staging_buffer, staging_offset, sync_staging);
}
@@ -2638,7 +2688,7 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
}
} else {
- vk_context * subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
+ vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
ggml_vk_ctx_begin(dst->device, subctx);
ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, nullptr, 0, true);
ggml_vk_ctx_end(subctx);
@@ -2650,8 +2700,6 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
ggml_vk_submit(subctx, dst->device->fence);
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
dst->device->device.resetFences({ dst->device->fence });
-
- delete subctx;
}
}
@@ -2660,12 +2708,14 @@ static void ggml_vk_buffer_write(vk_buffer& dst, size_t offset, const void * src
ggml_vk_buffer_write_2d(dst, offset, src, 0, size, 1);
}
-static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) {
+static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) {
VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")");
GGML_ASSERT(width > 0);
GGML_ASSERT(height > 0);
GGML_ASSERT(src != nullptr);
+ // TODO: staging_offset is not used
+
// Check if dst is pinned memory
vk_buffer buf = nullptr;
size_t buf_offset;
@@ -2704,7 +2754,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, si
staging_buffer = src->device->sync_staging;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
@@ -2714,18 +2764,18 @@ static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, si
deferred_memcpy(dst, staging_buffer->ptr, copy_size, &subctx->out_memcpys);
}
-static void ggml_vk_buffer_read_async(vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t size, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) {
+static void ggml_vk_buffer_read_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t size, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) {
return ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, size, size, size, 1, staging_buffer, staging_offset, sync_staging);
}
static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size) {
- VK_LOG_DEBUG("ggml_vk_buffer_read(" << offset << ", " << size << ")");
+ VK_LOG_DEBUG("ggml_vk_buffer_read(" << src->buffer << ", " << offset << ", " << size << ")");
if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
memcpy(dst, (uint8_t *) src->ptr + offset, size);
} else {
- vk_context * subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
+ vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
ggml_vk_ctx_begin(src->device, subctx);
ggml_vk_buffer_read_async(subctx, src, offset, dst, size, nullptr, 0, true);
ggml_vk_ctx_end(subctx);
@@ -2737,12 +2787,10 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
for (auto& cpy : subctx->out_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
-
- delete subctx;
}
}
-static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
+static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")");
// Make sure both buffers are on same device
GGML_ASSERT(src->device == dst->device);
@@ -2756,15 +2804,13 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
if (src->device == dst->device) {
VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
// Copy within the device
- vk_context * subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
+ vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
ggml_vk_ctx_begin(src->device, subctx);
ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, src->device->fence);
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
src->device->device.resetFences({ src->device->fence });
-
- delete subctx;
} else {
VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
// Copy device to device
@@ -2783,7 +2829,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
- vk_context * subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
+ vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
ggml_vk_ctx_begin(dst->device, subctx);
subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
ggml_vk_ctx_end(subctx);
@@ -2791,8 +2837,6 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
ggml_vk_submit(subctx, dst->device->fence);
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences");
dst->device->device.resetFences({ dst->device->fence });
-
- delete subctx;
}
static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
@@ -2855,7 +2899,7 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct
}
static void ggml_vk_matmul(
- ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline,
+ ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline,
vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer,
uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
@@ -2879,7 +2923,7 @@ static void ggml_vk_matmul(
}
static void ggml_vk_matmul_id(
- ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline,
+ ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline,
vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& ids,
uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
@@ -2913,10 +2957,10 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
}
std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
-static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
+static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
const int tensor_type_size = ggml_type_size(tensor->type);
@@ -2934,7 +2978,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 });
}
-static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
@@ -3079,7 +3123,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
} else if (qx_needs_dequant) {
const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
+ ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
}
if (y_non_contig) {
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
@@ -3107,7 +3151,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
); // NOLINT
}
-static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
@@ -3268,11 +3312,11 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
};
ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
- { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} },
+ { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} },
sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
}
-static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32(" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
@@ -3340,10 +3384,10 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
// compute
const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
+ ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
}
-static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
@@ -3415,10 +3459,11 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
// compute
const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
+ ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
+ { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
}
-static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")");
if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) {
ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst);
@@ -3431,7 +3476,7 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx,
}
}
-static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
+static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
@@ -3499,7 +3544,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig;
if (mmp == nullptr) {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
// Not implemented
@@ -3590,7 +3635,8 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
} else if (qx_needs_dequant) {
const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
+ ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0,
+ { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
}
if (y_non_contig) {
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
@@ -3618,7 +3664,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
); // NOLINT
}
-static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
+static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
@@ -3790,11 +3836,12 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
};
ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
- { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23}, { d_ids, ids_buf_offset, ids_sz } },
+ { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 },
+ vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } },
sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z });
}
-static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
+static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")");
if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst);
@@ -3803,8 +3850,8 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subct
}
}
-static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- // guaranteed to be an integer due to the check in ggml_can_repeat
+static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ VK_LOG_DEBUG("ggml_vk_op_repeat(" << src0 << ", " << src1 << ", " << dst << ")");
const uint64_t ne0 = dst->ne[0];
const uint64_t ne1 = dst->ne[1];
const uint64_t ne2 = dst->ne[2];
@@ -3825,6 +3872,7 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
const uint64_t nb02 = src0->nb[2];
const uint64_t nb03 = src0->nb[3];
+ // guaranteed to be an integer due to the check in ggml_can_repeat
const uint64_t nr0 = ne0/ne00;
const uint64_t nr1 = ne1/ne01;
const uint64_t nr2 = ne2/ne02;
@@ -3852,8 +3900,8 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
for (uint64_t k1 = 0; k1 < ne01; k1++) {
for (uint64_t i0 = 0; i0 < nr0; i0++) {
copies.push_back({
- src_offset + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
- dst_offset + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
+ src_offset + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
+ dst_offset + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
ne00*nb0,
});
}
@@ -3874,11 +3922,6 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
switch (op) {
- case GGML_OP_ADD:
- if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
- return ctx->device->pipeline_add_f32;
- }
- return nullptr;
case GGML_OP_GET_ROWS:
GGML_ASSERT(src1->type == GGML_TYPE_I32);
if (dst->type == GGML_TYPE_F16) {
@@ -3888,6 +3931,14 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_get_rows_f32[src0->type];
}
return nullptr;
+ case GGML_OP_ADD:
+ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_add_f32;
+ }
+ if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
+ return ctx->device->pipeline_add_f16_f32_f16;
+ }
+ return nullptr;
case GGML_OP_MUL:
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_mul_f32;
@@ -3898,6 +3949,22 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_div_f32;
}
return nullptr;
+ case GGML_OP_CONCAT:
+ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_concat_f32;
+ }
+ if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+ return ctx->device->pipeline_concat_f16;
+ }
+ if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
+ return ctx->device->pipeline_concat_i32;
+ }
+ return nullptr;
+ case GGML_OP_UPSCALE:
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_upscale_f32;
+ }
+ return nullptr;
case GGML_OP_SCALE:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_scale_f32;
@@ -3913,6 +3980,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_clamp_f32;
}
return nullptr;
+ case GGML_OP_PAD:
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_pad_f32;
+ }
+ return nullptr;
case GGML_OP_CPY:
case GGML_OP_CONT:
case GGML_OP_DUP:
@@ -3922,6 +3994,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_norm_f32;
}
return nullptr;
+ case GGML_OP_GROUP_NORM:
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_group_norm_f32;
+ }
+ return nullptr;
case GGML_OP_RMS_NORM:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_rms_norm_f32;
@@ -3939,11 +4016,21 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_gelu_f32;
}
break;
+ case GGML_UNARY_OP_GELU_QUICK:
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_gelu_quick_f32;
+ }
+ break;
case GGML_UNARY_OP_RELU:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_relu_f32;
}
break;
+ case GGML_UNARY_OP_TANH:
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_tanh_f32;
+ }
+ break;
default:
break;
}
@@ -3995,6 +4082,24 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_sum_rows_f32;
}
return nullptr;
+ case GGML_OP_IM2COL:
+ if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_im2col_f32;
+ }
+ if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
+ return ctx->device->pipeline_im2col_f32_f16;
+ }
+ return nullptr;
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_timestep_embedding_f32;
+ }
+ return nullptr;
+ case GGML_OP_LEAKY_RELU:
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_leaky_relu_f32;
+ }
+ return nullptr;
default:
return nullptr;
}
@@ -4018,9 +4123,12 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
case GGML_OP_ADD:
case GGML_OP_MUL:
case GGML_OP_DIV:
+ case GGML_OP_CONCAT:
+ case GGML_OP_UPSCALE:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_CLAMP:
+ case GGML_OP_PAD:
return true;
default:
return false;
@@ -4028,7 +4136,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
}
template<typename PC>
-static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
+static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
if (src1 != nullptr) {
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
@@ -4078,7 +4186,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
std::cerr << " and " << ggml_type_name(src1->type);
}
std::cerr << " to " << ggml_type_name(dst->type) << std::endl;
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
op_func(ctx, subctx, src0, src1, dst);
@@ -4124,7 +4232,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
vk_buffer d_D = extra->buffer_gpu.lock();
// Workaround for tiny tensor inputs on ROPE
- if (use_src1 && y_sz > d_D->size) {
+ if (op == GGML_OP_ROPE && use_src1 && y_sz > d_D->size) {
y_sz = VK_WHOLE_SIZE;
}
@@ -4173,13 +4281,26 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
if (op_supports_incontiguous || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
ggml_pipeline_allocate_descriptor_sets(ctx->device, pipeline, 1);
- switch (dst->op) {
+ switch (op) {
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_SOFT_MAX:
case GGML_OP_SUM_ROWS:
- elements = { (uint32_t)ggml_nrows(src0), 1, 1 };
- break;
+ {
+ const uint32_t nr = ggml_nrows(src0);
+ if (nr > 262144) {
+ elements = { 512, 512, CEIL_DIV(nr, 262144) };
+ } else if (nr > 512) {
+ elements = { 512, CEIL_DIV(nr, 512), 1 };
+ } else {
+ elements = { nr, 1, 1 };
+ }
+ } break;
+ case GGML_OP_GROUP_NORM:
+ {
+ const uint32_t num_groups = dst->op_params[0];
+ elements = { num_groups * (uint32_t)src0->ne[3], 1, 1 };
+ } break;
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_ROPE:
elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
@@ -4190,6 +4311,49 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
case GGML_OP_ARGSORT:
elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
break;
+ case GGML_OP_IM2COL:
+ {
+ const bool is_2D = dst->op_params[6] == 1;
+
+ const uint32_t IC = src1->ne[is_2D ? 2 : 1];
+
+ const uint32_t KH = is_2D ? src0->ne[1] : 1;
+ const uint32_t KW = src0->ne[0];
+
+ const uint32_t OH = is_2D ? dst->ne[2] : 1;
+ const uint32_t OW = dst->ne[1];
+
+ const uint32_t batch = src1->ne[3];
+
+ elements = { OW * KW * KH, OH, batch * IC };
+ } break;
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ {
+ const uint32_t dim = dst->op_params[0];
+ uint32_t half_ceil = (dim + 1) / 2;
+ elements = { half_ceil, (uint32_t)src0->ne[0], 1 };
+ } break;
+ case GGML_OP_ADD:
+ case GGML_OP_DIV:
+ case GGML_OP_MUL:
+ case GGML_OP_SCALE:
+ case GGML_OP_SQR:
+ case GGML_OP_CLAMP:
+ case GGML_OP_PAD:
+ case GGML_OP_CPY:
+ case GGML_OP_CONCAT:
+ case GGML_OP_UPSCALE:
+ case GGML_OP_UNARY:
+ {
+ const uint32_t ne = ggml_nelements(dst);
+ if (ne > 262144) {
+ elements = { 512, 512, CEIL_DIV(ne, 262144) };
+ } else if (ne > 512) {
+ elements = { 512, CEIL_DIV(ne, 512), 1 };
+ } else {
+ elements = { ne, 1, 1 };
+ }
+ } break;
default:
elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
break;
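The 512 and 262144 thresholds come from capping the x and y extents of the dispatch at 512 each (512 * 512 = 262144), so any flat element count fits into a 3D grid. Restated as a standalone helper sketch (name hypothetical; the code above inlines this per op):

    // sketch only: spread a flat element count over a 3D dispatch, x/y capped at 512
    static std::array<uint32_t, 3> split_flat_dispatch(uint32_t n) {
        if (n > 262144) {
            return { 512, 512, CEIL_DIV(n, 262144) };
        }
        if (n > 512) {
            return { 512, CEIL_DIV(n, 512), 1 };
        }
        return { n, 1, 1 };
    }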
@@ -4216,31 +4380,35 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
if (use_src1) {
subbuf_y = { d_Y, y_buf_offset, y_sz };
} else {
- subbuf_y = { d_X, 0, d_X->size };
+ subbuf_y = { d_X, 0, x_sz };
}
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
} else if (op == GGML_OP_ROPE) {
// Empty src2 is possible in rope, but the shader needs a buffer
vk_subbuffer subbuf_z;
if (use_src2) {
subbuf_z = { d_Z, z_buf_offset, z_sz };
} else {
- subbuf_z = { d_X, 0, d_X->size };
+ subbuf_z = { d_X, 0, x_sz };
}
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+ } else if (op == GGML_OP_IM2COL) {
+ // im2col uses only src1 and dst buffers
+ ggml_vk_sync_buffers(subctx);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
} else if (use_src2) {
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
} else if (use_src1) {
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
} else {
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
}
} else {
GGML_ASSERT(op != GGML_OP_SOFT_MAX);
@@ -4249,8 +4417,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
ggml_pipeline_allocate_descriptor_sets(ctx->device, pipeline, ne02 * ne03);
- switch (dst->op) {
+ switch (op) {
case GGML_OP_NORM:
+ case GGML_OP_GROUP_NORM:
case GGML_OP_RMS_NORM:
elements = { (uint32_t)ne01, 1, 1 };
break;
@@ -4276,21 +4445,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
if (use_src1) {
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_Y, y_buf_offset + y_offset, y_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset + x_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset + y_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
} else {
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset + x_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
}
}
}
}
}
-static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
+static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, {});
}
-static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4301,11 +4470,11 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx,
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
- 0.0f, 0.0f,
+ 0.0f, 0.0f, 0,
});
}
-static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4316,11 +4485,11 @@ static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, cons
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
- 0.0f, 0.0f,
+ 0.0f, 0.0f, 0,
});
}
-static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4331,11 +4500,11 @@ static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, cons
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
- 0.0f, 0.0f,
+ 0.0f, 0.0f, 0,
});
}
-static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4346,11 +4515,44 @@ static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context * subctx, cons
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
- 0.0f, 0.0f,
+ 0.0f, 0.0f, 0,
});
}
-static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ int * op_params = (int *)dst->op_params;
+
+ const uint32_t src0_type_size = ggml_type_size(src0->type);
+ const uint32_t src1_type_size = ggml_type_size(src1->type);
+ const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+ ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONCAT, {
+ (uint32_t)ggml_nelements(dst),
+ (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+ (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+ (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+ 0,
+ 0.0f, 0.0f, op_params[0],
+ });
+}
+
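ggml_vk_concat reuses the binary push-constant block and carries the concat axis (op_params[0]) in the trailing int32 parameter. A contiguous-layout CPU sketch of how that axis picks the source tensor for one output coordinate; the names and flat-offset math here are illustrative, not the shader's:

#include <cstdint>

static float concat_fetch(const float * a, const float * b,
                          const uint32_t a_ne[4], const uint32_t b_ne[4],
                          uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3,
                          int axis) {
    uint32_t c[4] = { i0, i1, i2, i3 };
    const float    * src = a;
    const uint32_t * ne  = a_ne;
    if (c[axis] >= a_ne[axis]) {      // past src0 along the concat axis: read from src1
        c[axis] -= a_ne[axis];
        src = b;
        ne  = b_ne;
    }
    // ne[0] is the fastest-varying dimension, as elsewhere in ggml
    const uint64_t off = (((uint64_t)c[3] * ne[2] + c[2]) * ne[1] + c[1]) * ne[0] + c[0];
    return src[off];
}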
+static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+ const uint32_t src0_type_size = ggml_type_size(src0->type);
+
+ const float sf0 = (float)dst->ne[0] / src0->ne[0];
+ const float sf1 = (float)dst->ne[1] / src0->ne[1];
+ const float sf2 = (float)dst->ne[2] / src0->ne[2];
+ const float sf3 = (float)dst->ne[3] / src0->ne[3];
+
+ ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
+ (uint32_t)ggml_nelements(dst), 0,
+ (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+ (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
+ sf0, sf1, sf2, sf3,
+ });
+}
+
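The scale factors are just dst->ne[i] / src0->ne[i] per dimension, which lets the shader map each output coordinate back to a source coordinate; for this op that is effectively nearest-neighbour sampling. A one-line sketch of the inverse mapping, shown only to document the intent:

#include <cstdint>

// Sketch: invert one output coordinate through its scale factor sf = dst_ne / src_ne.
static uint32_t upscale_src_coord(uint32_t out_coord, float sf) {
    return (uint32_t)((float)out_coord / sf);  // truncation acts as floor for non-negative values
}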
+static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
float * op_params = (float *)dst->op_params;
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4364,7 +4566,7 @@ static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, co
});
}
-static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4377,7 +4579,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, cons
});
}
-static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
float * op_params = (float *)dst->op_params;
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4391,7 +4593,20 @@ static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, co
});
}
-static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+ const uint32_t src0_type_size = ggml_type_size(src0->type);
+ const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+ ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, {
+ (uint32_t)ggml_nelements(dst),
+ (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+ (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+ 0,
+ 0.0f, 0.0f,
+ });
+}
+
+static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4406,27 +4621,37 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
});
}
-static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
float * op_params = (float *)dst->op_params;
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
}
-static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+ int * op_params = (int *)dst->op_params;
+
+ uint32_t num_groups = op_params[0];
+ uint32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
+ static const float eps = 1e-6f;
+
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f });
+}
+
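Each group covers ne[0]*ne[1] values for ceil(ne[2] / num_groups) channels, and eps is fixed at 1e-6 rather than read from op_params; with ne = {8, 8, 10, 1} and num_groups = 4, for example, group_size works out to 8*8*3 = 192. A CPU sketch of the normalization one group is assumed to perform:

#include <cmath>
#include <cstdint>

// Mean/variance normalization over one group of group_size values, in place.
static void group_norm_ref(float * x, uint32_t group_size, float eps) {
    double sum = 0.0;
    for (uint32_t i = 0; i < group_size; i++) {
        sum += x[i];
    }
    const double mean = sum / group_size;

    double var = 0.0;
    for (uint32_t i = 0; i < group_size; i++) {
        const double d = x[i] - mean;
        var += d * d;
    }
    var /= group_size;

    const double inv_std = 1.0 / std::sqrt(var + (double)eps);
    for (uint32_t i = 0; i < group_size; i++) {
        x[i] = (float)((x[i] - mean) * inv_std);
    }
}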
+static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
float * op_params = (float *)dst->op_params;
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
}
-static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
}
-static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
int32_t * op_params = (int32_t *)dst->op_params;
ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
}
-static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
float * op_params = (float *)dst->op_params;
float scale = op_params[0];
@@ -4451,7 +4676,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
});
}
-static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
+static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
const int n_dims = ((int32_t *) dst->op_params)[1];
// const int mode = ((int32_t *) dst->op_params)[2];
// const int n_ctx = ((int32_t *) dst->op_params)[3];
@@ -4475,7 +4700,7 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
});
}
-static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
int32_t * op_params = (int32_t *)dst->op_params;
uint32_t ncols = src0->ne[0];
@@ -4494,10 +4719,59 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx,
});
}
-static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, { (uint32_t)src0->ne[0], 0, 0.0f, 0.0f });
}
+static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ const int32_t s0 = dst->op_params[0];
+ const int32_t s1 = dst->op_params[1];
+ const int32_t p0 = dst->op_params[2];
+ const int32_t p1 = dst->op_params[3];
+ const int32_t d0 = dst->op_params[4];
+ const int32_t d1 = dst->op_params[5];
+
+ const bool is_2D = dst->op_params[6] == 1;
+
+ const uint32_t IC = src1->ne[is_2D ? 2 : 1];
+ const uint32_t IH = is_2D ? src1->ne[1] : 1;
+ const uint32_t IW = src1->ne[0];
+
+ const uint32_t KH = is_2D ? src0->ne[1] : 1;
+ const uint32_t KW = src0->ne[0];
+
+ const uint32_t OH = is_2D ? dst->ne[2] : 1;
+ const uint32_t OW = dst->ne[1];
+
+ const uint32_t offset_delta = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+ const uint32_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
+
+ const uint32_t pelements = OW * KW * KH;
+
+ ggml_vk_op_f32<vk_op_im2col_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_IM2COL, {
+ batch_offset, offset_delta,
+ IC, IW, IH, OW, OH, KW, KH,
+ pelements,
+ IC * KH * KW,
+ s0, s1, p0, p1, d0, d1,
+ });
+}
+
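All of the geometry is read back from the tensors rather than recomputed: OW and OH come from dst, which ggml sized with the standard convolution output formula, and offset_delta / batch_offset are byte strides converted to float element counts. For reference, the relation the packed IW/KW/s0/p0/d0 values are expected to satisfy:

#include <cstdint>

// Standard convolution output size; shown only to document how OW relates to
// the other im2col push constants, not code from this backend.
static int64_t conv_out_size(int64_t in, int64_t kernel, int32_t s, int32_t p, int32_t d) {
    return (in + 2 * p - d * (kernel - 1) - 1) / s + 1;
}
// e.g. IW = 28, KW = 3, s0 = 1, p0 = 1, d0 = 1  ->  OW = 28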
+static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+ const uint32_t dim = dst->op_params[0];
+ const uint32_t max_period = dst->op_params[1];
+ const uint32_t nb1 = dst->nb[1] / ggml_type_size(dst->type);
+
+ ggml_vk_op_f32<vk_op_timestep_embedding_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_TIMESTEP_EMBEDDING, {
+ nb1, dim, max_period,
+ });
+}
+
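The dispatch for this op covers (dim + 1) / 2 frequencies per timestep row, with nb1 giving the row stride in elements. A CPU sketch of the sinusoidal embedding the shader is assumed to produce, using the common cos-then-sin row layout; even dim only, the odd-dim padding case is left out:

#include <cmath>
#include <cstdint>

static void timestep_embedding_ref(float * row, float t, uint32_t dim, uint32_t max_period) {
    const uint32_t half = dim / 2;
    for (uint32_t j = 0; j < half; j++) {
        const float freq = std::exp(-std::log((float)max_period) * (float)j / (float)half);
        row[j]        = std::cos(t * freq);  // first half of the row: cos terms
        row[j + half] = std::sin(t * freq);  // second half: sin terms
    }
}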
+static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+ const float * op_params = (const float *)dst->op_params;
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f });
+}
+
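Here op_params[0] carries the negative slope, passed through the float parameter slot of the generic push constants; the element-wise function itself reduces to:

// Sketch of the op, with the slope taken from op_params[0].
static float leaky_relu_ref(float x, float negative_slope) {
    return x > 0.0f ? x : negative_slope * x;
}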
#ifdef GGML_VULKAN_RUN_TESTS
static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) {
if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) {
@@ -4521,7 +4795,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
} else if (type == GGML_TYPE_F16) {
val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
fprintf(stderr, "% 7.2f ", val);
} else {
@@ -4555,7 +4829,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
p = ctx->device->pipeline_matmul_f16->a_s;
shname = "F16_ALIGNED_S";
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
} else if (shader_size == 1) {
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
@@ -4571,7 +4845,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
p = ctx->device->pipeline_matmul_f16->a_m;
shname = "F16_ALIGNED_M";
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
} else if (shader_size == 2) {
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
@@ -4587,7 +4861,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
p = ctx->device->pipeline_matmul_f16->a_l;
shname = "F16_ALIGNED_L";
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
} else {
GGML_ASSERT(0);
@@ -4668,7 +4942,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
} else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
for (size_t i = 0; i < y_ne; i++) {
@@ -4679,14 +4953,14 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
// y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f);
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
- vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+ vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
for (size_t i = 0; i < num_it; i++) {
ggml_vk_ctx_begin(ctx->device, subctx);
ggml_vk_matmul(
@@ -4727,14 +5001,14 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
} else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
src0_type = GGML_TYPE_F16;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
if (std::is_same<float, Y_TYPE>()) {
src1_type = GGML_TYPE_F32;
} else if (std::is_same<ggml_fp16_t, Y_TYPE>()) {
src1_type = GGML_TYPE_F16;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, src0_type, k, m, batch);
@@ -4841,7 +5115,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
} else if (tensor->type == GGML_TYPE_F16) {
val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
fprintf(stderr, "% 7.2f ", val);
} else {
@@ -4894,7 +5168,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
- vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+ vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
ggml_vk_ctx_begin(ctx->device, subctx);
const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
@@ -5027,7 +5301,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
ggml_vk_buffer_write(y_buf, 0, y, y_sz);
- vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+ vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
for (size_t i = 0; i < num_it; i++) {
ggml_vk_ctx_begin(ctx->device, subctx);
ggml_vk_matmul(
@@ -5175,7 +5449,7 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
const bool y_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32 && !y_non_contig;
- bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
+ bool mmp = (use_src0 && use_src1 && (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID)) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
const bool qx_needs_dequant = use_src0 && (!mmp || x_non_contig);
const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);
@@ -5211,24 +5485,33 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_CLAMP:
+ case GGML_OP_PAD:
case GGML_OP_CPY:
case GGML_OP_CONT:
case GGML_OP_DUP:
case GGML_OP_MUL:
case GGML_OP_DIV:
+ case GGML_OP_CONCAT:
+ case GGML_OP_UPSCALE:
case GGML_OP_NORM:
+ case GGML_OP_GROUP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_ROPE:
case GGML_OP_ARGSORT:
case GGML_OP_SUM_ROWS:
+ case GGML_OP_IM2COL:
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ case GGML_OP_LEAKY_RELU:
break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(node)) {
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_GELU:
+ case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_RELU:
+ case GGML_UNARY_OP_TANH:
break;
default:
return;
@@ -5236,6 +5519,13 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
break;
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
+ if (
+ x_sz > ctx->device->max_memory_allocation_size ||
+ y_sz > ctx->device->max_memory_allocation_size ||
+ d_sz > ctx->device->max_memory_allocation_size ||
+ split_k_size > ctx->device->max_memory_allocation_size) {
+ GGML_ABORT("Requested preallocation size is too large");
+ }
if (ctx->prealloc_size_x < x_sz) {
ctx->prealloc_size_x = x_sz;
}
@@ -5391,7 +5681,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
std::cerr << std::endl;
}
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
#endif
if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
@@ -5430,7 +5720,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
}
}
-static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
+static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node){
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
if (ggml_is_empty(node) || extra == nullptr) {
@@ -5457,7 +5747,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
switch (ggml_get_unary_op(node)) {
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_GELU:
+ case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_RELU:
+ case GGML_UNARY_OP_TANH:
break;
default:
return;
@@ -5468,13 +5760,17 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_ADD:
case GGML_OP_MUL:
case GGML_OP_DIV:
+ case GGML_OP_CONCAT:
+ case GGML_OP_UPSCALE:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_CLAMP:
+ case GGML_OP_PAD:
case GGML_OP_CPY:
case GGML_OP_CONT:
case GGML_OP_DUP:
case GGML_OP_NORM:
+ case GGML_OP_GROUP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
@@ -5483,109 +5779,147 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_MUL_MAT_ID:
case GGML_OP_ARGSORT:
case GGML_OP_SUM_ROWS:
+ case GGML_OP_IM2COL:
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ case GGML_OP_LEAKY_RELU:
break;
default:
std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
return;
}
- if (ctx->compute_ctx == nullptr) {
- ctx->compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
- ggml_vk_ctx_begin(ctx->device, ctx->compute_ctx);
+ vk_context compute_ctx;
+
+ if (ctx->compute_ctx.expired()) {
+ compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+ ctx->compute_ctx = compute_ctx;
+ ggml_vk_ctx_begin(ctx->device, compute_ctx);
+ } else {
+ compute_ctx = ctx->compute_ctx.lock();
}
switch (node->op) {
case GGML_OP_REPEAT:
- ggml_vk_repeat(ctx, ctx->compute_ctx, src0, src1, node);
+ ggml_vk_repeat(ctx, compute_ctx, src0, node);
break;
case GGML_OP_GET_ROWS:
- ggml_vk_get_rows(ctx, ctx->compute_ctx, src0, src1, node);
+ ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_ADD:
- ggml_vk_add(ctx, ctx->compute_ctx, src0, src1, node);
+ ggml_vk_add(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_MUL:
- ggml_vk_mul(ctx, ctx->compute_ctx, src0, src1, node);
+ ggml_vk_mul(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_DIV:
- ggml_vk_div(ctx, ctx->compute_ctx, src0, src1, node);
+ ggml_vk_div(ctx, compute_ctx, src0, src1, node);
+
+ break;
+ case GGML_OP_CONCAT:
+ ggml_vk_concat(ctx, compute_ctx, src0, src1, node);
+
+ break;
+ case GGML_OP_UPSCALE:
+ ggml_vk_upscale(ctx, compute_ctx, src0, node);
break;
case GGML_OP_SCALE:
- ggml_vk_scale(ctx, ctx->compute_ctx, src0, node);
+ ggml_vk_scale(ctx, compute_ctx, src0, node);
break;
case GGML_OP_SQR:
- ggml_vk_sqr(ctx, ctx->compute_ctx, src0, node);
+ ggml_vk_sqr(ctx, compute_ctx, src0, node);
break;
case GGML_OP_CLAMP:
- ggml_vk_clamp(ctx, ctx->compute_ctx, src0, node);
+ ggml_vk_clamp(ctx, compute_ctx, src0, node);
+
+ break;
+ case GGML_OP_PAD:
+ ggml_vk_pad(ctx, compute_ctx, src0, node);
break;
case GGML_OP_CPY:
case GGML_OP_CONT:
case GGML_OP_DUP:
- ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node);
+ ggml_vk_cpy(ctx, compute_ctx, src0, node);
break;
case GGML_OP_NORM:
- ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
+ ggml_vk_norm(ctx, compute_ctx, src0, node);
+
+ break;
+ case GGML_OP_GROUP_NORM:
+ ggml_vk_group_norm(ctx, compute_ctx, src0, node);
break;
case GGML_OP_RMS_NORM:
- ggml_vk_rms_norm(ctx, ctx->compute_ctx, src0, node);
+ ggml_vk_rms_norm(ctx, compute_ctx, src0, node);
break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(node)) {
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_GELU:
+ case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_RELU:
- ggml_vk_unary(ctx, ctx->compute_ctx, src0, node);
+ case GGML_UNARY_OP_TANH:
+ ggml_vk_unary(ctx, compute_ctx, src0, node);
break;
default:
return;
}
break;
case GGML_OP_DIAG_MASK_INF:
- ggml_vk_diag_mask_inf(ctx, ctx->compute_ctx, src0, node);
+ ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node);
break;
case GGML_OP_SOFT_MAX:
- ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, node);
+ ggml_vk_soft_max(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_ROPE:
- ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, src2, node);
+ ggml_vk_rope(ctx, compute_ctx, src0, src1, src2, node);
break;
case GGML_OP_ARGSORT:
- ggml_vk_argsort(ctx, ctx->compute_ctx, src0, node);
+ ggml_vk_argsort(ctx, compute_ctx, src0, node);
break;
case GGML_OP_SUM_ROWS:
- ggml_vk_sum_rows(ctx, ctx->compute_ctx, src0, node);
+ ggml_vk_sum_rows(ctx, compute_ctx, src0, node);
+
+ break;
+ case GGML_OP_IM2COL:
+ ggml_vk_im2col(ctx, compute_ctx, src0, src1, node);
+
+ break;
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node);
+
+ break;
+ case GGML_OP_LEAKY_RELU:
+ ggml_vk_leaky_relu(ctx, compute_ctx, src0, node);
break;
case GGML_OP_MUL_MAT:
- ggml_vk_mul_mat(ctx, ctx->compute_ctx, src0, src1, node);
+ ggml_vk_mul_mat(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_MUL_MAT_ID:
- ggml_vk_mul_mat_id(ctx, ctx->compute_ctx, src0, src1, src2, node);
+ ggml_vk_mul_mat_id(ctx, compute_ctx, src0, src1, src2, node);
break;
default:
return;
}
- extra->ctx_idx = ctx->compute_ctx->idx;
+ ctx->tensor_ctxs[node_idx] = compute_ctx;
#ifdef GGML_VULKAN_CHECK_RESULTS
// Force context reset on each node so that each tensor ends up in its own context
@@ -5594,13 +5928,13 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
#endif
if (last_node) {
- ggml_vk_ctx_end(ctx->compute_ctx);
- ctx->compute_ctx->exit_tensor = node;
- ctx->compute_ctx = nullptr;
+ ggml_vk_ctx_end(compute_ctx);
+ compute_ctx->exit_tensor_idx = node_idx;
+ ctx->compute_ctx.reset();
}
}
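The backend now only weakly observes the active compute context: a shared context is created on demand, recorded per node in tensor_ctxs, and dropped from ctx->compute_ctx once the last node is reached, so it lives exactly as long as some node still references it. A self-contained sketch of that ownership pattern, with illustrative names rather than the real types:

#include <memory>

struct demo_context {
    // command buffers, staging memcpy lists, ...
};
using demo_context_ref = std::shared_ptr<demo_context>;

struct demo_backend {
    std::weak_ptr<demo_context> compute_ctx;

    demo_context_ref get_or_create() {
        if (compute_ctx.expired()) {
            auto fresh = std::make_shared<demo_context>();
            compute_ctx = fresh;       // observe without owning
            return fresh;
        }
        return compute_ctx.lock();     // reuse the still-live context
    }
};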
-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){
ggml_tensor_extra_gpu * extra = nullptr;
switch (tensor->op) {
@@ -5608,13 +5942,17 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
case GGML_OP_GET_ROWS:
case GGML_OP_MUL:
case GGML_OP_DIV:
+ case GGML_OP_CONCAT:
+ case GGML_OP_UPSCALE:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_CLAMP:
+ case GGML_OP_PAD:
case GGML_OP_CPY:
case GGML_OP_CONT:
case GGML_OP_DUP:
case GGML_OP_NORM:
+ case GGML_OP_GROUP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
@@ -5626,6 +5964,10 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
case GGML_OP_NONE:
case GGML_OP_ARGSORT:
case GGML_OP_SUM_ROWS:
+ case GGML_OP_IM2COL:
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ case GGML_OP_LEAKY_RELU:
+ case GGML_OP_REPEAT:
extra = (ggml_tensor_extra_gpu *) tensor->extra;
break;
@@ -5633,7 +5975,9 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
switch (ggml_get_unary_op(tensor)) {
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_GELU:
+ case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_RELU:
+ case GGML_UNARY_OP_TANH:
extra = (ggml_tensor_extra_gpu *) tensor->extra;
break;
default:
@@ -5656,31 +6000,31 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
#ifdef GGML_VULKAN_CHECK_RESULTS
- ggml_vk_check_results_0(ctx, tensor);
+ ggml_vk_check_results_0(tensor);
#endif
- vk_context& subctx = ctx->gc.contexts[extra->ctx_idx];
+ vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();
// Only run if ctx hasn't been submitted yet
- if (!subctx.seqs.empty()) {
+ if (!subctx->seqs.empty()) {
// Do staging buffer copies
- for (auto& cpy : subctx.in_memcpys) {
+ for (auto& cpy : subctx->in_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
- ggml_vk_submit(&subctx, ctx->fence);
+ ggml_vk_submit(subctx, ctx->fence);
}
- if (tensor == subctx.exit_tensor) {
+ if (tensor_idx == subctx->exit_tensor_idx) {
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
ctx->device->device.resetFences({ ctx->fence });
// Do staging buffer copies
- for (auto& cpy : subctx.out_memcpys) {
+ for (auto& cpy : subctx->out_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
- subctx.in_memcpys.clear();
- subctx.out_memcpys.clear();
+ subctx->in_memcpys.clear();
+ subctx->out_memcpys.clear();
}
return true;
@@ -5725,8 +6069,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
ctx->staging_offset = 0;
- ctx->compute_ctx = nullptr;
- ctx->transfer_ctx = nullptr;
+ ctx->tensor_ctxs.clear();
ctx->gc.contexts.clear();
}
@@ -6063,15 +6406,20 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
- if (ctx->transfer_ctx == nullptr) {
+ vk_context transfer_ctx;
+
+ if (ctx->transfer_ctx.expired()) {
// Initialize new transfer context
- ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
- ggml_vk_ctx_begin(ctx->device, ctx->transfer_ctx);
+ transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
+ ctx->transfer_ctx = transfer_ctx;
+ ggml_vk_ctx_begin(ctx->device, transfer_ctx);
+ } else {
+ transfer_ctx = ctx->transfer_ctx.lock();
}
vk_buffer buf = extra->buffer_gpu.lock();
- ggml_vk_buffer_write_async(ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size, ctx->staging, ctx->staging_offset);
+ ggml_vk_buffer_write_async(transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size, ctx->staging, ctx->staging_offset);
}
GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6081,15 +6429,20 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
- if (ctx->transfer_ctx == nullptr) {
+ vk_context transfer_ctx;
+
+ if (ctx->transfer_ctx.expired()) {
// Initialize new transfer context
- ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
- ggml_vk_ctx_begin(ctx->device, ctx->transfer_ctx);
+ transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
+ ctx->transfer_ctx = transfer_ctx;
+ ggml_vk_ctx_begin(ctx->device, transfer_ctx);
+ } else {
+ transfer_ctx = ctx->transfer_ctx.lock();
}
vk_buffer buf = extra->buffer_gpu.lock();
- ggml_vk_buffer_read_async(ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size, ctx->staging, ctx->staging_offset);
+ ggml_vk_buffer_read_async(transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size, ctx->staging, ctx->staging_offset);
}
GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6099,16 +6452,21 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
- if (ctx->transfer_ctx == nullptr) {
+ vk_context transfer_ctx;
+
+ if (ctx->transfer_ctx.expired()) {
// Initialize new transfer context
- ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
- ggml_vk_ctx_begin(ctx->device, ctx->transfer_ctx);
+ transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
+ ctx->transfer_ctx = transfer_ctx;
+ ggml_vk_ctx_begin(ctx->device, transfer_ctx);
+ } else {
+ transfer_ctx = ctx->transfer_ctx.lock();
}
vk_buffer src_buf = src_extra->buffer_gpu.lock();
vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
- ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
+ ggml_vk_buffer_copy_async(transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
return true;
}
@@ -6118,25 +6476,27 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
- if(ctx->transfer_ctx == nullptr) {
+ if(ctx->transfer_ctx.expired()) {
return;
}
- ggml_vk_ctx_end(ctx->transfer_ctx);
+ vk_context transfer_ctx = ctx->transfer_ctx.lock();
+
+ ggml_vk_ctx_end(transfer_ctx);
- for (auto& cpy : ctx->transfer_ctx->in_memcpys) {
+ for (auto& cpy : transfer_ctx->in_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
- ggml_vk_submit(ctx->transfer_ctx, ctx->fence);
+ ggml_vk_submit(transfer_ctx, ctx->fence);
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences");
ctx->device->device.resetFences({ ctx->fence });
- for (auto& cpy : ctx->transfer_ctx->out_memcpys) {
+ for (auto& cpy : transfer_ctx->out_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
- ctx->transfer_ctx = nullptr;
+ ctx->transfer_ctx.reset();
}
static bool ggml_vk_is_empty(ggml_tensor * node) {
@@ -6159,8 +6519,11 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
last_node -= 1;
}
+ // Reserve tensor context space for all nodes
+ ctx->tensor_ctxs.resize(cgraph->n_nodes);
+
for (int i = 0; i < cgraph->n_nodes; i++) {
- ggml_vk_build_graph(ctx,cgraph->nodes[i], i == last_node);
+ ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node);
}
for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -6170,13 +6533,17 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
continue;
}
- bool ok = ggml_vk_compute_forward(ctx, node);
+ bool ok = ggml_vk_compute_forward(ctx, node, i);
if (!ok) {
- fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+ if (node->op == GGML_OP_UNARY) {
+ std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
+ } else {
+ std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+ }
}
#ifdef GGML_VULKAN_CHECK_RESULTS
else {
- ggml_vk_check_results_1(ctx, node);
+ ggml_vk_check_results_1(node);
}
#endif
GGML_ASSERT(ok);
@@ -6196,8 +6563,10 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_GELU:
+ case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_RELU:
+ case GGML_UNARY_OP_TANH:
return ggml_is_contiguous(op->src[0]);
default:
return false;
@@ -6270,11 +6639,11 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
}
return false;
} break;
- // case GGML_OP_REPEAT:
- // {
- // ggml_type src0_type = op->src[0]->type;
- // return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
- // } break;
+ case GGML_OP_REPEAT:
+ {
+ ggml_type src0_type = op->src[0]->type;
+ return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
+ } break;
case GGML_OP_ROPE:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_NONE:
@@ -6283,18 +6652,25 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_NORM:
+ case GGML_OP_GROUP_NORM:
+ case GGML_OP_RMS_NORM:
case GGML_OP_ADD:
case GGML_OP_MUL:
case GGML_OP_DIV:
- case GGML_OP_RMS_NORM:
+ case GGML_OP_CONCAT:
+ case GGML_OP_UPSCALE:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_CLAMP:
+ case GGML_OP_PAD:
case GGML_OP_CONT:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_ARGSORT:
case GGML_OP_SUM_ROWS:
+ case GGML_OP_IM2COL:
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ case GGML_OP_LEAKY_RELU:
return true;
default:
return false;
@@ -6498,7 +6874,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
} else if (tensor->type == GGML_TYPE_I32) {
val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
fprintf(stderr, "% 7.2f ", val);
} else {
@@ -6509,10 +6885,12 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
}
}
-static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
+static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) {
void * tensor_data = tensor->data;
- if (ggml_backend_buffer_is_vk(tensor->buffer)) {
+ const bool is_gpu = tensor->buffer != nullptr && ggml_backend_buffer_is_vk(tensor->buffer);
+
+ if (is_gpu) {
const size_t tensor_size = ggml_nbytes(tensor);
tensor_data = malloc(tensor_size);
@@ -6533,13 +6911,10 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
std::cerr << std::endl << "Result:" << std::endl;
ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
std::cerr << std::endl;
- std::cerr << std::endl << "Result:" << std::endl;
- ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0);
- std::cerr << std::endl;
std::vector<const ggml_tensor *> done;
ggml_vk_print_graph_origin(tensor, done);
- if (ggml_backend_buffer_is_vk(tensor->buffer)) {
+ if (is_gpu) {
free(tensor_data);
}
}
@@ -6548,8 +6923,8 @@ void * comp_result;
size_t comp_size;
size_t comp_nb[GGML_MAX_DIMS];
size_t check_counter = 0;
-static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * tensor) {
- if (tensor->op == GGML_OP_TRANSPOSE) {
+static void ggml_vk_check_results_0(ggml_tensor * tensor) {
+ if (tensor->op == GGML_OP_TRANSPOSE) {
return;
}
@@ -6565,7 +6940,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
ggml_tensor * src2 = tensor->src[2];
struct ggml_init_params iparams = {
- /*.mem_size =*/ 1024*1024*1024,
+ /*.mem_size =*/ 2ul*1024ul*1024ul*1024ul,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
};
@@ -6620,11 +6995,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
}
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
- ggml_vk_print_tensor(ctx, src0, "src0");
+ ggml_vk_print_tensor(src0, "src0");
}
}
if (src1 != nullptr) {
@@ -6662,27 +7037,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
}
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
- ggml_vk_print_tensor(ctx, src1, "src1");
- std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
- std::cerr << "src1_clone=" << tensor << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
- if (src1->src[0] != nullptr) {
- std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
- }
- if (src1->src[1] != nullptr) {
- std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
- }
- std::cerr << std::endl << "Result:" << std::endl;
- ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
- std::cerr << std::endl;
- std::cerr << std::endl << "Result:" << std::endl;
- ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 1, 0);
- std::cerr << std::endl;
- std::vector<const ggml_tensor *> done;
- ggml_vk_print_graph_origin(src1_clone, done);
+ ggml_vk_print_tensor(src1, "src1");
}
}
if (src2 != nullptr) {
@@ -6720,27 +7079,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
}
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
- ggml_vk_print_tensor(ctx, src2, "src2");
- std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl;
- std::cerr << "src2_clone=" << tensor << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
- if (src2->src[0] != nullptr) {
- std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
- }
- if (src2->src[1] != nullptr) {
- std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
- }
- std::cerr << std::endl << "Result:" << std::endl;
- ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0);
- std::cerr << std::endl;
- std::cerr << std::endl << "Result:" << std::endl;
- ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 1, 0);
- std::cerr << std::endl;
- std::vector<const ggml_tensor *> done;
- ggml_vk_print_graph_origin(src2_clone, done);
+ ggml_vk_print_tensor(src2, "src2");
}
}
@@ -6752,16 +7095,24 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
tensor_clone = ggml_mul(ggml_ctx, src0_clone, src1_clone);
} else if (tensor->op == GGML_OP_DIV) {
tensor_clone = ggml_div(ggml_ctx, src0_clone, src1_clone);
+ } else if (tensor->op == GGML_OP_CONCAT) {
+ tensor_clone = ggml_concat(ggml_ctx, src0_clone, src1_clone, *(int *)tensor->op_params);
+ } else if (tensor->op == GGML_OP_UPSCALE) {
+ tensor_clone = ggml_upscale_ext(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
} else if (tensor->op == GGML_OP_SCALE) {
tensor_clone = ggml_scale(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0]);
} else if (tensor->op == GGML_OP_SQR) {
tensor_clone = ggml_sqr(ggml_ctx, src0_clone);
} else if (tensor->op == GGML_OP_CLAMP) {
tensor_clone = ggml_clamp(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
+ } else if (tensor->op == GGML_OP_PAD) {
+ tensor_clone = ggml_pad(ggml_ctx, src0_clone, tensor->ne[0] - src0_clone->ne[0], tensor->ne[1] - src0_clone->ne[1], tensor->ne[2] - src0_clone->ne[2], tensor->ne[3] - src0_clone->ne[3]);
} else if (tensor->op == GGML_OP_ADD) {
tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone);
} else if (tensor->op == GGML_OP_NORM) {
tensor_clone = ggml_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
+ } else if (tensor->op == GGML_OP_GROUP_NORM) {
+ tensor_clone = ggml_group_norm(ggml_ctx, src0_clone, *(int *)tensor->op_params);
} else if (tensor->op == GGML_OP_RMS_NORM) {
tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
} else if (tensor->op == GGML_OP_SOFT_MAX) {
@@ -6777,12 +7128,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
const int mode = ((int32_t *) tensor->op_params)[2];
//const int n_ctx_ggml = ((int32_t *) tensor->op_params)[3];
const int n_ctx_orig_ggml = ((int32_t *) tensor->op_params)[4];
- float freq_base = ((float *) tensor->op_params)[5];
- float freq_scale = ((float *) tensor->op_params)[6];
- float ext_factor = ((float *) tensor->op_params)[7];
- float attn_factor = ((float *) tensor->op_params)[8];
- float beta_fast = ((float *) tensor->op_params)[9];
- float beta_slow = ((float *) tensor->op_params)[10];
+ const float freq_base = ((float *) tensor->op_params)[5];
+ const float freq_scale = ((float *) tensor->op_params)[6];
+ const float ext_factor = ((float *) tensor->op_params)[7];
+ const float attn_factor = ((float *) tensor->op_params)[8];
+ const float beta_fast = ((float *) tensor->op_params)[9];
+ const float beta_slow = ((float *) tensor->op_params)[10];
tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
} else if (tensor->op == GGML_OP_UNARY) {
switch (ggml_get_unary_op(tensor)) {
@@ -6792,12 +7143,18 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
case GGML_UNARY_OP_GELU:
tensor_clone = ggml_gelu(ggml_ctx, src0_clone);
break;
+ case GGML_UNARY_OP_GELU_QUICK:
+ tensor_clone = ggml_gelu_quick(ggml_ctx, src0_clone);
+ break;
case GGML_UNARY_OP_RELU:
tensor_clone = ggml_relu(ggml_ctx, src0_clone);
break;
+ case GGML_UNARY_OP_TANH:
+ tensor_clone = ggml_tanh(ggml_ctx, src0_clone);
+ break;
default:
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
} else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
if (src1 == nullptr) {
@@ -6823,9 +7180,26 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
tensor_clone = ggml_argsort(ggml_ctx, src0_clone, (ggml_sort_order) *(int *)tensor->op_params);
} else if (tensor->op == GGML_OP_SUM_ROWS) {
tensor_clone = ggml_sum_rows(ggml_ctx, src0_clone);
+ } else if (tensor->op == GGML_OP_IM2COL) {
+ const int32_t s0 = tensor->op_params[0];
+ const int32_t s1 = tensor->op_params[1];
+ const int32_t p0 = tensor->op_params[2];
+ const int32_t p1 = tensor->op_params[3];
+ const int32_t d0 = tensor->op_params[4];
+ const int32_t d1 = tensor->op_params[5];
+
+ const bool is_2D = tensor->op_params[6] == 1;
+ tensor_clone = ggml_im2col(ggml_ctx, src0_clone, src1_clone, s0, s1, p0, p1, d0, d1, is_2D, tensor->type);
+ } else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) {
+ const int32_t dim = tensor->op_params[0];
+ const int32_t max_period = tensor->op_params[1];
+ tensor_clone = ggml_timestep_embedding(ggml_ctx, src0_clone, dim, max_period);
+ } else if (tensor->op == GGML_OP_LEAKY_RELU) {
+ const float * op_params = (const float *)tensor->op_params;
+ tensor_clone = ggml_leaky_relu(ggml_ctx, src0_clone, op_params[0], false);
} else {
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
@@ -6834,7 +7208,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
- ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
+ ggml_vk_print_tensor(tensor_clone, "tensor_clone");
}
comp_size = ggml_nbytes(tensor_clone);
@@ -6851,9 +7225,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
}
ggml_free(ggml_ctx);
+
+ VK_LOG_DEBUG("END ggml_vk_check_results_0(" << tensor->name << ")");
}
-static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * tensor) {
+static void ggml_vk_check_results_1(ggml_tensor * tensor) {
if (tensor->op == GGML_OP_TRANSPOSE) {
return;
}
@@ -6912,7 +7288,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
}
} else {
std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl;
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
@@ -6935,7 +7311,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
std::cerr << std::endl;
std::vector<const ggml_tensor *> done;
ggml_vk_print_graph_origin(tensor, done);
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
if (first_error[0] == -1 && std::fabs(correct - result) > 0.1f) {
first_error[0] = i0;
@@ -6977,11 +7353,6 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
std::cerr << std::endl << "Correct:" << std::endl;
ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 0, 0);
std::cerr << std::endl;
- std::cerr << std::endl << "Result:" << std::endl;
- ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0);
- std::cerr << std::endl << "Correct:" << std::endl;
- ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 1, 0);
- std::cerr << std::endl;
std::vector<const ggml_tensor *> done;
ggml_vk_print_graph_origin(tensor, done);
}
@@ -7006,7 +7377,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
std::cerr << std::endl;
std::vector<const ggml_tensor *> done;
ggml_vk_print_graph_origin(tensor, done);
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
} else {
std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
}
@@ -7018,5 +7389,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
if (ggml_backend_buffer_is_vk(tensor->buffer)) {
free(tensor_data);
}
+
+ VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")");
}
#endif