diff options
author | Kawrakow <48489457+ikawrakow@users.noreply.github.com> | 2024-08-12 15:14:32 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-08-12 15:14:32 +0200 |
commit | 8f43e551038af2547b5c01d0e9edd641c0e4bd29 (patch) | |
tree | 07a4373620a9381d0b5c7189a475990a6feb48a5 /ggml/src | |
parent | f5d1af61d79fb53ccfbac2e665e43208c07b083d (diff) |
Merge mainline - Aug 12 2024 (#17)
* Merge mainline
* Fix after merge
* Remove CI check
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src')
87 files changed, 3485 insertions, 1484 deletions
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 0b1ad48a..be6c2cc6 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -139,6 +139,17 @@ if (GGML_METAL) ) endif() +if (GGML_MUSA) + set(CMAKE_C_COMPILER clang) + set(CMAKE_C_EXTENSIONS OFF) + set(CMAKE_CXX_COMPILER clang++) + set(CMAKE_CXX_EXTENSIONS OFF) + + set(GGML_CUDA ON) + + list(APPEND GGML_CDEF_PUBLIC GGML_USE_MUSA) +endif() + if (GGML_OPENMP) find_package(OpenMP) if (OpenMP_FOUND) @@ -147,6 +158,11 @@ if (GGML_OPENMP) add_compile_definitions(GGML_USE_OPENMP) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + + if (GGML_MUSA) + set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} "/usr/lib/llvm-10/include/openmp") + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} "/usr/lib/llvm-10/lib/libomp.so") + endif() else() message(WARNING "OpenMP not found") endif() @@ -257,11 +273,16 @@ endif() if (GGML_CUDA) cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES - find_package(CUDAToolkit) - - set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_INCLUDES 0) - set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_LIBRARIES 0) - set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_OBJECTS 0) + if (GGML_MUSA) + list(APPEND CMAKE_MODULE_PATH "/usr/local/musa/cmake/") + find_package(MUSAToolkit) + set(CUDAToolkit_FOUND ${MUSAToolkit_FOUND}) + else() + find_package(CUDAToolkit) + set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_INCLUDES 0) + set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_LIBRARIES 0) + set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_OBJECTS 0) + endif() if (CUDAToolkit_FOUND) message(STATUS "CUDA found") @@ -280,7 +301,11 @@ if (GGML_CUDA) endif() message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") - enable_language(CUDA) + if (GGML_MUSA) + set(CMAKE_CUDA_COMPILER ${MUSAToolkit_MCC_EXECUTABLE}) + else() + enable_language(CUDA) + endif() file(GLOB GGML_HEADERS_CUDA "ggml-cuda/*.cuh") list(APPEND GGML_HEADERS_CUDA "../include/ggml-cuda.h") @@ -344,21 +369,40 @@ if (GGML_CUDA) add_compile_definitions(GGML_CUDA_NO_PEER_COPY) endif() + if (GGML_MUSA) + set_source_files_properties(${GGML_SOURCES_CUDA} PROPERTIES LANGUAGE CXX) + foreach(SOURCE ${GGML_SOURCES_CUDA}) + set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_22") + endforeach() + endif() + if (GGML_STATIC) if (WIN32) # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt) else () - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + if (GGML_MUSA) + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart_static MUSA::mublas_static) + else() + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + endif() endif() else() - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) + if (GGML_MUSA) + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart MUSA::mublas) + else() + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) + endif() endif() if (GGML_CUDA_NO_VMM) # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so) else() - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... + if (GGML_MUSA) + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ... + else() + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... + endif() endif() else() message(WARNING "CUDA not found") @@ -816,11 +860,6 @@ if (GGML_CANN) ${CANN_INSTALL_DIR}/acllib/include ) - # TODO: find libs - link_directories( - ${CANN_INSTALL_DIR}/lib64 - ) - add_subdirectory(ggml-cann/kernels) list(APPEND CANN_LIBRARIES ascendcl @@ -839,6 +878,7 @@ if (GGML_CANN) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CANN_LIBRARIES} ) set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS}) + set(GGML_EXTRA_LIBDIRS ${GGML_EXTRA_LIBDIRS} ${CANN_INSTALL_DIR}/lib64) list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN) endif() else() @@ -869,8 +909,10 @@ function(get_flags CCID CCVER) set(C_FLAGS -Wdouble-promotion) set(CXX_FLAGS -Wno-array-bounds) - if (CCVER VERSION_GREATER_EQUAL 7.1.0) - list(APPEND CXX_FLAGS -Wno-format-truncation) + if (NOT GGML_MUSA) + if (CCVER VERSION_GREATER_EQUAL 7.1.0) + list(APPEND CXX_FLAGS -Wno-format-truncation) + endif() endif() if (CCVER VERSION_GREATER_EQUAL 8.1.0) list(APPEND CXX_FLAGS -Wextra-semi) @@ -1278,6 +1320,7 @@ endif() target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC}) target_include_directories(ggml PUBLIC ../include) target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES}) +target_link_directories(ggml PRIVATE ${GGML_EXTRA_LIBDIRS}) target_compile_features (ggml PRIVATE c_std_11) # don't bump target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS}) diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c index af53dea1..7adaadc9 100644 --- a/ggml/src/ggml-aarch64.c +++ b/ggml/src/ggml-aarch64.c @@ -16,6 +16,8 @@ #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Woverlength-strings" +#elif defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data #endif #define UNUSED GGML_UNUSED @@ -384,8 +386,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) - if (svcntw() == 8) { - GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) && + if (ggml_sve_cnt_b == QK8_0) { + GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif @@ -496,8 +498,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) - if (svcntw() == 8) { - GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) && + if (ggml_sve_cnt_b == QK8_0) { + GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif @@ -614,7 +616,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__)) - if (svcntw() == 8) { + if (ggml_sve_cnt_b == QK8_0) { const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -680,12 +682,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * return; } else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { - GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) && + GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal " "performance"); } else if (ggml_cpu_has_neon()) { - GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) && + GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) && "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 " "quantization format for optimal performance"); } @@ -745,8 +747,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) - if (svcntw() == 8) { - GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) && + if (ggml_sve_cnt_b == QK8_0) { + GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif @@ -1266,8 +1268,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) - if (svcntw() == 8) { - GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) && + if (ggml_sve_cnt_b == QK8_0) { + GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif @@ -1728,7 +1730,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__)) - if (svcntw() == 8) { + if (ggml_sve_cnt_b == QK8_0) { const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -2139,12 +2141,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * return; } else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { - GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) && + GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) && "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal " "performance"); } else if (ggml_cpu_has_neon()) { - GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) && + GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) && "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 " "quantization format for optimal performance"); } diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index e176b883..e485326a 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -91,8 +91,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) { fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n", __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset); - GGML_ASSERT(!"not enough space in the buffer"); - return; + GGML_ABORT("not enough space in the buffer"); } void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset; @@ -133,7 +132,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, return; } } - GGML_ASSERT(!"out of allocated_tensors"); + GGML_ABORT("out of allocated_tensors"); } static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) { for (int i = 0; i < 1024; i++) { @@ -142,8 +141,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs return; } } - fprintf(stderr, "tried to free tensor %s not found\n", tensor->name); - GGML_ASSERT(!"tensor not found"); + GGML_ABORT("tried to free tensor %s not found\n", tensor->name); } #endif @@ -176,8 +174,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz // this should never happen fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n", __func__, size, max_avail); - GGML_ASSERT(!"not enough space in the buffer"); - GGML_UNREACHABLE(); + GGML_ABORT("not enough space in the buffer"); } } @@ -443,7 +440,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) { } } - free(galloc->hash_set.keys); + ggml_hash_set_free(&galloc->hash_set); free(galloc->hash_values); free(galloc->bufts); free(galloc->buffers); @@ -456,7 +453,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) { typedef struct ggml_gallocr * ggml_gallocr_t; static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) { - size_t i = ggml_hash_find_or_insert(galloc->hash_set, t); + size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t); return &galloc->hash_values[i]; } @@ -565,8 +562,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) { static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { // clear hash tables - memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *)); - memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node)); + ggml_hash_set_reset(&galloc->hash_set); + memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size); // allocate leafs // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes @@ -671,21 +668,19 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr } bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { - size_t hash_size = graph->visited_hash_table.size; + size_t min_hash_size = graph->n_nodes + graph->n_leafs; + // add 25% margin to avoid hash collisions + min_hash_size += min_hash_size / 4; // initialize hash table - if (galloc->hash_set.size < hash_size) { - free(galloc->hash_set.keys); - free(galloc->hash_values); - galloc->hash_set.size = hash_size; - galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *)); - galloc->hash_values = calloc(hash_size, sizeof(struct hash_node)); + if (galloc->hash_set.size < min_hash_size) { + ggml_hash_set_free(&galloc->hash_set); + galloc->hash_set = ggml_hash_set_new(min_hash_size); GGML_ASSERT(galloc->hash_set.keys != NULL); + + free(galloc->hash_values); + galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size); GGML_ASSERT(galloc->hash_values != NULL); - } else { - // reset hash table - memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size); - memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size); } // reset allocators @@ -817,8 +812,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * } static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) { - ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL; - size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node); + size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node); return talloc->size_max >= node_size; } diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index d39cfed8..e1651cc6 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -351,15 +351,10 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b } // an async copy would normally happen after all the queued operations on both backends are completed - // sync src, set_async dst - if (ggml_backend_buffer_is_host(src->buffer)) { - ggml_backend_synchronize(backend_src); - ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src)); - } else { - ggml_backend_synchronize(backend_src); - ggml_backend_tensor_copy(src, dst); - ggml_backend_synchronize(backend_dst); - } + // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy + ggml_backend_synchronize(backend_src); + ggml_backend_synchronize(backend_dst); + ggml_backend_tensor_copy(src, dst); } // events @@ -1055,11 +1050,10 @@ struct ggml_backend_sched { ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS]; ggml_gallocr_t galloc; - // hash keys of the nodes in the graph - struct ggml_hash_set hash_set; - // hash values - int * tensor_backend_id; - struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES]; + // hash map of the nodes in the graph + struct ggml_hash_set hash_set; + int * hv_tensor_backend_ids; // [hash_set.size] + struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies] int * node_backend_ids; // [graph_size] int * leaf_backend_ids; // [graph_size] @@ -1068,7 +1062,7 @@ struct ggml_backend_sched { int * prev_leaf_backend_ids; // [graph_size] // copy of the graph with modified inputs - struct ggml_cgraph * graph; + struct ggml_cgraph graph; // graph splits struct ggml_backend_sched_split * splits; @@ -1087,19 +1081,16 @@ struct ggml_backend_sched { ggml_backend_sched_eval_callback callback_eval; void * callback_eval_user_data; - bool debug; + char * context_buffer; + size_t context_buffer_size; - // align context_buffer to GGML_MEM_ALIGN -#ifdef _MSC_VER - __declspec(align(GGML_MEM_ALIGN)) -#else - __attribute__((aligned(GGML_MEM_ALIGN))) -#endif - char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)]; + bool debug; }; -#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor) -#define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)] +#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor) +#define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)] +#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)] +#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id) // returns the priority of the backend, lower id is higher priority static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) { @@ -1169,7 +1160,6 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st return cur_backend_id; } - // assign nodes that use weights to the backend of the weights // operations with weights are preferably run on the same backend as the weights for (int i = 0; i < GGML_MAX_SRC; i++) { const struct ggml_tensor * src = tensor->src[i]; @@ -1275,7 +1265,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->is_reset = false; struct ggml_init_params params = { - /* .mem_size = */ sizeof(sched->context_buffer), + /* .mem_size = */ sched->context_buffer_size, /* .mem_buffer = */ sched->context_buffer, /* .no_alloc = */ true }; @@ -1284,39 +1274,43 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->ctx = ggml_init(params); if (sched->ctx == NULL) { - fprintf(stderr, "%s: failed to initialize context\n", __func__); - GGML_ASSERT(false); + GGML_ABORT("%s: failed to initialize context\n", __func__); } // pass 1: assign backends to ops with pre-allocated inputs for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; int * leaf_backend_id = &tensor_backend_id(leaf); - if (*leaf_backend_id != -1) { - // do not overwrite user assignments - continue; + // do not overwrite user assignments + if (*leaf_backend_id == -1) { + *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf); } - *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf); } for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; int * node_backend_id = &tensor_backend_id(node); - if (*node_backend_id != -1) { - // do not overwrite user assignments - continue; - } - *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node); - // src - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { + // do not overwrite user assignments + if (*node_backend_id == -1) { + *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node); + +#if 0 + // src + if (node->op == GGML_OP_NONE) { continue; } - int * src_backend_id = &tensor_backend_id(src); - if (*src_backend_id == -1) { - *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src); + + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + int * src_backend_id = &tensor_backend_id(src); + if (*src_backend_id == -1) { + *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src); + } } +#endif } } @@ -1488,12 +1482,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - // pass 4: split graph, find tensors that need to be copied + // pass 5: split graph, find tensors that need to be copied { int i_split = 0; struct ggml_backend_sched_split * split = &sched->splits[0]; // find the backend of the first split, skipping view ops - for (int i = 0; i < graph->n_nodes; i++) { + int i = 0; + for (; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (!ggml_is_view_op(node->op)) { split->backend_id = tensor_backend_id(node); @@ -1502,9 +1497,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } split->i_start = 0; split->n_inputs = 0; - memset(split->inputs, 0, sizeof(split->inputs)); //HACK int cur_backend_id = split->backend_id; - for (int i = 0; i < graph->n_nodes; i++) { + for (; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { @@ -1513,7 +1507,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg const int node_backend_id = tensor_backend_id(node); - GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now + assert(node_backend_id != -1); // all nodes should be assigned by now // check if we should start a new split based on the sources of the current node bool need_new_split = false; @@ -1527,7 +1521,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // by starting a new split, the memory of the previously offloaded weights can be reused if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { int src_backend_id = tensor_backend_id(src); - if (src_backend_id != -1 && src_backend_id != cur_backend_id) { + if (src_backend_id != cur_backend_id) { need_new_split = true; break; } @@ -1536,9 +1530,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // FIXME: count the number of inputs instead of only checking when full if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) { const size_t id = hash_id(src); - int src_backend_id = sched->tensor_backend_id[id]; + int src_backend_id = sched->hv_tensor_backend_ids[id]; bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); - if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) { + if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) { //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name); need_new_split = true; break; @@ -1570,12 +1564,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg continue; } - const int src_backend_id = tensor_backend_id(src); + size_t src_id = hash_id(src); + const int src_backend_id = sched->hv_tensor_backend_ids[src_id]; assert(src_backend_id != -1); // all inputs should be assigned by now if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) { - size_t id = hash_id(src); - if (sched->tensor_copies[id][src_backend_id][0] == NULL) { + if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) { ggml_backend_t backend = sched->backends[src_backend_id]; for (int c = 0; c < sched->n_copies; c++) { struct ggml_tensor * tensor_copy; @@ -1589,7 +1583,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_set_input(tensor_copy); ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor } - sched->tensor_copies[id][src_backend_id][c] = tensor_copy; + tensor_id_copy(src_id, src_backend_id, c) = tensor_copy; SET_CAUSE(tensor_copy, "4.cpy"); } int n_graph_inputs = sched->n_graph_inputs++; @@ -1598,11 +1592,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); - if (src_backend_id != cur_backend_id && !supported) { + if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) { // create a copy of the input in the split's backend - const size_t id = hash_id(src); - if (sched->tensor_copies[id][cur_backend_id][0] == NULL) { + if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) { ggml_backend_t backend = sched->backends[cur_backend_id]; for (int c = 0; c < sched->n_copies; c++) { struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); @@ -1611,14 +1603,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_set_input(tensor_copy); ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor } - sched->tensor_copies[id][cur_backend_id][c] = tensor_copy; + tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy; SET_CAUSE(tensor_copy, "4.cpy"); } int n_inputs = split->n_inputs++; GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); split->inputs[n_inputs] = src; } - node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy]; + node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy); } } } @@ -1630,7 +1622,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_backend_sched_print_assignments(sched, graph); } - // swap node_backend_ids and leaf_backend_ids and prevs + // swap node_backend_ids and leaf _backend_ids with prevs { int * tmp = sched->node_backend_ids; sched->node_backend_ids = sched->prev_node_backend_ids; @@ -1641,9 +1633,19 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->prev_leaf_backend_ids = tmp; } - // create copies of the graph for each split - // TODO: avoid this copy - struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false); + int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2; + if (sched->graph.size < graph_size) { + sched->graph.size = graph_size; + sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *)); + sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *)); + GGML_ASSERT(sched->graph.nodes != NULL); + GGML_ASSERT(sched->graph.leafs != NULL); + } + sched->graph.n_nodes = 0; + sched->graph.n_leafs = 0; + + struct ggml_cgraph * graph_copy = &sched->graph; + for (int i = 0; i < sched->n_splits; i++) { struct ggml_backend_sched_split * split = &sched->splits[i]; split->graph = ggml_graph_view(graph, split->i_start, split->i_end); @@ -1654,12 +1656,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg struct ggml_tensor * input = split->inputs[j]; const size_t input_id = hash_id(input); - struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy]; + struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy); // add a dependency to the input source so that it is not freed before the copy is done struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input); input_dep->src[0] = input; - sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id]; + sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id]; graph_copy->nodes[graph_copy->n_nodes++] = input_dep; // add a dependency to the input copy so that it is allocated at the start of the split @@ -1681,7 +1683,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg size_t id = hash_id(input); int backend_id = tensor_backend_id(input); for (int c = 0; c < sched->n_copies; c++) { - struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c]; + struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c); sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id; graph_copy->leafs[graph_copy->n_leafs++] = input_cpy; } @@ -1694,7 +1696,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg struct ggml_tensor * input = split->inputs[j]; size_t id = hash_id(input); for (int c = 0; c < sched->n_copies; c++) { - struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c]; + struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c); sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id; graph_copy->leafs[graph_copy->n_leafs++] = input_cpy; } @@ -1708,13 +1710,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf); graph_copy->leafs[graph_copy->n_leafs++] = leaf; } - - sched->graph = graph_copy; } static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { bool backend_ids_changed = false; - for (int i = 0; i < sched->graph->n_nodes; i++) { + for (int i = 0; i < sched->graph.n_nodes; i++) { if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] && sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) { backend_ids_changed = true; @@ -1722,7 +1722,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { } } if (!backend_ids_changed) { - for (int i = 0; i < sched->graph->n_leafs; i++) { + for (int i = 0; i < sched->graph.n_leafs; i++) { if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] && sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) { backend_ids_changed = true; @@ -1732,14 +1732,14 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { } // allocate graph - if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { + if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) { // the re-allocation may cause the split inputs to be moved to a different address ggml_backend_sched_synchronize(sched); #ifndef NDEBUG - fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__); + fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed); #endif - ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids); - if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { + ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids); + if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) { fprintf(stderr, "%s: failed to allocate graph\n", __func__); return false; } @@ -1760,7 +1760,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s for (int j = 0; j < split->n_inputs; j++) { ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]); struct ggml_tensor * input = split->inputs[j]; - struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy]; + struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy); if (input->flags & GGML_TENSOR_FLAG_INPUT) { // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done @@ -1777,7 +1777,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } else { ggml_backend_synchronize(split_backend); } - ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy); + // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events + // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface + if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) { + ggml_backend_synchronize(input_backend); + if (sched->events[split_backend_id][sched->cur_copy] != NULL) { + ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]); + } else { + ggml_backend_synchronize(split_backend); + } + ggml_backend_tensor_copy(input, input_cpy); + } } } @@ -1846,21 +1856,23 @@ ggml_backend_sched_t ggml_backend_sched_new( struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched)); sched->debug = getenv("GGML_SCHED_DEBUG") != NULL; + sched->n_backends = n_backends; + sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1; // initialize hash table - sched->hash_set = ggml_hash_set_new(graph_size); - sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0])); - sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0])); + // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead) + sched->hash_set = ggml_hash_set_new(graph_size); + sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0])); + sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *)); const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2; - sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0])); - sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); + sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0])); + sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0])); sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0])); - sched->n_backends = n_backends; - - sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1; + sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false); + sched->context_buffer = malloc(sched->context_buffer_size); const int initial_splits_capacity = 16; sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0])); @@ -1895,37 +1907,37 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { } ggml_gallocr_free(sched->galloc); ggml_free(sched->ctx); + ggml_hash_set_free(&sched->hash_set); free(sched->splits); - free(sched->hash_set.keys); - free(sched->tensor_backend_id); - free(sched->tensor_copies); + free(sched->hv_tensor_backend_ids); + free(sched->hv_tensor_copies); free(sched->node_backend_ids); free(sched->leaf_backend_ids); free(sched->prev_node_backend_ids); free(sched->prev_leaf_backend_ids); + free(sched->context_buffer); + free(sched->graph.nodes); + free(sched->graph.leafs); free(sched); } void ggml_backend_sched_reset(ggml_backend_sched_t sched) { // reset state for the next run if (!sched->is_reset) { - size_t hash_size = sched->hash_set.size; - memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT - memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size); - memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size); - + ggml_hash_set_reset(&sched->hash_set); + memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0])); + memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *)); sched->is_reset = true; } sched->is_alloc = false; } bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { - GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes); + GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs); ggml_backend_sched_split_graph(sched, measure_graph); - // TODO: extract this to a separate function - if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) { + if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) { return false; } @@ -1936,10 +1948,11 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * } bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes); + GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs); ggml_backend_sched_split_graph(sched, graph); + if (!ggml_backend_sched_alloc_splits(sched)) { return false; } @@ -2009,6 +2022,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); tensor_backend_id(node) = backend_index; SET_CAUSE(node, "usr"); + sched->is_reset = false; } ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) { @@ -2051,9 +2065,9 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, GGML_ASSERT(src != NULL); GGML_ASSERT(src->data && "graph must be allocated"); - size_t id = ggml_hash_insert(hash_set, src); - if (id == GGML_HASHTABLE_ALREADY_EXISTS) { - return node_copies[ggml_hash_find(hash_set, src)]; + size_t id = ggml_hash_insert(&hash_set, src); + if (id == GGML_HASHSET_ALREADY_EXISTS) { + return node_copies[ggml_hash_find(&hash_set, src)]; } struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src); @@ -2078,7 +2092,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, return dst; } -static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) { +static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) { size_t id = ggml_hash_find(hash_set, src); if (node_init[id]) { return; @@ -2105,10 +2119,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te } struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) { - struct ggml_hash_set hash_set = { - /* .size = */ graph->visited_hash_table.size, - /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT - }; + struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size); struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT bool * node_init = calloc(hash_set.size, sizeof(node_init[0])); @@ -2123,7 +2134,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s if (ctx_allocated == NULL || ctx_unallocated == NULL) { fprintf(stderr, "failed to allocate context for graph copy\n"); - free(hash_set.keys); + ggml_hash_set_free(&hash_set); free(node_copies); free(node_init); ggml_free(ctx_allocated); @@ -2146,7 +2157,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend); if (buffer == NULL) { fprintf(stderr, "failed to allocate buffer for graph copy\n"); - free(hash_set.keys); + ggml_hash_set_free(&hash_set); free(node_copies); free(node_init); ggml_free(ctx_allocated); @@ -2164,19 +2175,19 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s // copy data and init views for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - graph_copy_init_tensor(hash_set, node_copies, node_init, node); + graph_copy_init_tensor(&hash_set, node_copies, node_init, node); } // build graph copy struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false); for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)]; + struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)]; graph_copy->nodes[i] = node_copy; } graph_copy->n_nodes = graph->n_nodes; - free(hash_set.keys); + ggml_hash_set_free(&hash_set); free(node_copies); free(node_init); diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp index a37aa407..71373173 100644 --- a/ggml/src/ggml-blas.cpp +++ b/ggml/src/ggml-blas.cpp @@ -275,8 +275,7 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t break; default: - fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node)); - GGML_ASSERT(false); + GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); } } diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index 9bf7e332..06930ba2 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -120,7 +120,7 @@ static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) { file, line); GGML_CANN_LOG_ERROR(" %s\n", stmt); // abort with GGML_ASSERT to get a stack trace - GGML_ASSERT(!"CANN error"); + GGML_ABORT("CANN error"); } /** @@ -342,7 +342,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool { // memory should always buffered. these memory may still needed by // tasks in stream. // TODO, fix me. - GGML_ASSERT(!"Cann buffer pool full, increase MAX_CANN_BUFFERS\n"); + GGML_ABORT("Cann buffer pool full, increase MAX_CANN_BUFFERS\n"); } }; @@ -627,7 +627,6 @@ GGML_CALL static void* ggml_backend_cann_buffer_get_base( GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor, const void* src, void* dst) { - GGML_ASSERT(tensor->op == GGML_OP_NONE); int64_t n_elems = ggml_nelements(tensor); int64_t groups = n_elems / QK4_0; @@ -679,7 +678,6 @@ GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor, */ GGML_CALL static void ggml_backend_cann_transform_back_q4_0( const ggml_tensor* tensor, void* src, void* dst) { - GGML_ASSERT(tensor->op == GGML_OP_NONE); int64_t n_elems = ggml_nelements(tensor); int64_t groups = n_elems / QK4_0; @@ -898,11 +896,10 @@ GGML_CALL static void ggml_backend_cann_buffer_init_tensor( * @param size Size of the data to be copied, in bytes. */ GGML_CALL static void ggml_backend_cann_buffer_set_tensor( - ggml_backend_buffer_t buffer, ggml_tensor* tensor, const void* data, + ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) { - // GGML_ASSERT(size == ggml_nbytes(tensor)); - ggml_backend_cann_buffer_context* ctx = - (ggml_backend_cann_buffer_context*)buffer->context; + ggml_backend_cann_buffer_context *ctx = + (ggml_backend_cann_buffer_context *)buffer->context; ggml_cann_set_device(ctx->device); // TODO: refer to cann(#6017), it use thread's default stream. @@ -910,22 +907,21 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor( // Why aclrtSynchronizeDevice? if (!need_transform(tensor->type)) { - ACL_CHECK(aclrtMemcpy(tensor->data, size, (const char*)data + offset, - size, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size, + ACL_MEMCPY_HOST_TO_DEVICE)); } else { - void* transform_buffer = malloc(size); - ggml_backend_cann_transform(tensor, (const char*)data + offset, - transform_buffer); + void *transform_buffer = malloc(size); + ggml_backend_cann_transform(tensor, data, transform_buffer); #ifndef NDEBUG - void* check_buffer = malloc(size); + void *check_buffer = malloc(size); ggml_backend_cann_transform_back(tensor, transform_buffer, check_buffer); - GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size) == - 0); + GGML_ASSERT(memcmp(data, check_buffer, size) == 0); free(check_buffer); #endif - ACL_CHECK(aclrtMemcpy(tensor->data, size, transform_buffer, size, + ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, + transform_buffer, size, ACL_MEMCPY_HOST_TO_DEVICE)); free(transform_buffer); } @@ -947,21 +943,20 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor( GGML_CALL static void ggml_backend_cann_buffer_get_tensor( ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data, size_t offset, size_t size) { - GGML_ASSERT(size == ggml_nbytes(tensor)); ggml_backend_cann_buffer_context* ctx = (ggml_backend_cann_buffer_context*)buffer->context; ggml_cann_set_device(ctx->device); if (!need_transform(tensor->type)) { - ACL_CHECK(aclrtMemcpy((char*)data + offset, size, tensor->data, size, + ACL_CHECK(aclrtMemcpy(data, size, (char*)tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST)); } else { void* transform_buffer = malloc(size); - ACL_CHECK(aclrtMemcpy(transform_buffer, size, tensor->data, size, + ACL_CHECK(aclrtMemcpy(transform_buffer, size, + (char*)tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST)); - ggml_backend_cann_transform_back(tensor, transform_buffer, - (char*)data + offset); + ggml_backend_cann_transform_back(tensor, transform_buffer, data); free(transform_buffer); } } @@ -1450,42 +1445,41 @@ ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) { * @param size Size of the data to copy in bytes. */ GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend, - ggml_tensor* tensor, - const void* data, + ggml_tensor *tensor, + const void *data, size_t offset, size_t size) { - ggml_backend_cann_context* cann_ctx = - (ggml_backend_cann_context*)backend->context; + ggml_backend_cann_context *cann_ctx = + (ggml_backend_cann_context *)backend->context; if (!need_transform(tensor->type)) { - ACL_CHECK(aclrtMemcpyAsync( - tensor->data, size, (const char*)data + offset, size, - ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream())); + ACL_CHECK(aclrtMemcpyAsync((char *)tensor->data + offset, size, data, + size, ACL_MEMCPY_HOST_TO_DEVICE, + cann_ctx->stream())); } else { - void* transform_buffer = malloc(size); - ggml_backend_cann_transform(tensor, (const char*)data + offset, - transform_buffer); + void *transform_buffer = malloc(size); + ggml_backend_cann_transform(tensor, data, transform_buffer); #ifndef NDEBUG - void* check_buffer = malloc(size); + void *check_buffer = malloc(size); ggml_backend_cann_transform_back(tensor, transform_buffer, check_buffer); - GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size)); + GGML_ASSERT(memcmp(data, check_buffer, size)); free(check_buffer); #endif - ACL_CHECK(aclrtMemcpyAsync(tensor->data, size, transform_buffer, size, - ACL_MEMCPY_HOST_TO_DEVICE, - cann_ctx->stream())); + ACL_CHECK(aclrtMemcpyAsync( + (char *)tensor->data + offset, size, transform_buffer, size, + ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream())); ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); free(transform_buffer); } } GGML_CALL static void ggml_backend_cann_get_tensor_async( - ggml_backend_t backend, const ggml_tensor* tensor, void* data, + ggml_backend_t backend, const ggml_tensor *tensor, void *data, size_t offset, size_t size) { - ggml_backend_cann_context* cann_ctx = - (ggml_backend_cann_context*)backend->context; + ggml_backend_cann_context *cann_ctx = + (ggml_backend_cann_context *)backend->context; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; @@ -1493,17 +1487,16 @@ GGML_CALL static void ggml_backend_cann_get_tensor_async( "unsupported buffer type"); if (!need_transform(tensor->type)) { - ACL_CHECK(aclrtMemcpyAsync((char*)data + offset, size, tensor->data, + ACL_CHECK(aclrtMemcpyAsync(data, size, (char *)tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST, cann_ctx->stream())); } else { - void* transform_buffer = malloc(size); - ACL_CHECK(aclrtMemcpyAsync(transform_buffer, size, tensor->data, size, - ACL_MEMCPY_DEVICE_TO_HOST, - cann_ctx->stream())); + void *transform_buffer = malloc(size); + ACL_CHECK(aclrtMemcpyAsync( + transform_buffer, size, (char *)tensor->data + offset, size, + ACL_MEMCPY_DEVICE_TO_HOST, cann_ctx->stream())); ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); - ggml_backend_cann_transform_back(tensor, transform_buffer, - (char*)data + offset); + ggml_backend_cann_transform_back(tensor, transform_buffer, data); free(transform_buffer); } } @@ -1559,23 +1552,18 @@ GGML_CALL static bool ggml_backend_cann_cpy_tensor_async( return false; } + // need open both directions for memcpyasync between devices. + ggml_cann_set_device(cann_ctx_dst->device); + ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0)); ggml_cann_set_device(cann_ctx_src->device); ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0)); + ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_MEMCPY_DEVICE_TO_DEVICE, - cann_ctx_dst->stream())); - - // record event on src stream - if (!cann_ctx_src->copy_event) { - ACL_CHECK(aclrtCreateEvent(&cann_ctx_src->copy_event)); - } - - ACL_CHECK( - aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream())); + cann_ctx_src->stream())); - // wait on dst stream for the copy to complete - ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), - cann_ctx_src->copy_event)); + //TODO: workaround for Event didn`t work here. + aclrtSynchronizeStream(cann_ctx_src->stream()); } else { // src and dst are on the same backend ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, @@ -1671,10 +1659,13 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend, } case GGML_OP_MUL_MAT: { switch (op->src[0]->type) { - // case GGML_TYPE_Q4_0: case GGML_TYPE_F16: case GGML_TYPE_F32: case GGML_TYPE_Q8_0: + // TODO: fix me + // Current groupsize should not be greater than k-1 in + // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(). + case GGML_TYPE_Q4_0: return true; default: return false; @@ -1699,6 +1690,7 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend, case GGML_TYPE_F32: case GGML_TYPE_F16: case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_0: return true; default: return false; @@ -1763,8 +1755,8 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) { * * This function determines whether the CANN backend supports the given backend * buffer type by comparing the device context of the backend and buffer type. - * It returns true if the device associated with the buffer type matches the - * device associated with the backend. + * It returns true if the devices are same between the backend context and + * buffer type context. * * @param backend Pointer to the CANN backend. * @param buft Pointer to the backend buffer type to check. @@ -1773,9 +1765,14 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) { */ GGML_CALL static bool ggml_backend_cann_supports_buft( ggml_backend_t backend, ggml_backend_buffer_type_t buft) { - return buft->iface.get_name == ggml_backend_cann_buffer_type_name; - - GGML_UNUSED(backend); + if (ggml_backend_buft_is_cann(buft)) { + ggml_backend_cann_context * cann_ctx = + (ggml_backend_cann_context *)backend->context; + ggml_backend_cann_buffer_type_context * buft_ctx = + (ggml_backend_cann_buffer_type_context *)buft->context; + return buft_ctx->device == cann_ctx->device; + } + return false; } /** @@ -1874,7 +1871,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend, ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), (aclrtEvent)event->context)); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-cann/acl_tensor.cpp b/ggml/src/ggml-cann/acl_tensor.cpp index 960ce9a0..d120ce6a 100644 --- a/ggml/src/ggml-cann/acl_tensor.cpp +++ b/ggml/src/ggml-cann/acl_tensor.cpp @@ -37,6 +37,10 @@ aclDataType ggml_cann_type_mapping(ggml_type type) { return ACL_INT16; case GGML_TYPE_I32: return ACL_INT32; + case GGML_TYPE_Q4_0: + return ACL_INT4; + case GGML_TYPE_Q8_0: + return ACL_INT8; default: return ACL_DT_UNDEFINED; } @@ -89,33 +93,6 @@ bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) { return false; } -aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype, - size_t type_size, int64_t* ne, size_t* nb, - int64_t dims, aclFormat format, - size_t offset) { - int64_t tmp_ne[GGML_MAX_DIMS * 2]; - int64_t tmp_stride[GGML_MAX_DIMS * 2]; - - memcpy(tmp_ne, ne, dims * sizeof(int64_t)); - for (int i = 0; i < dims; i++) { - tmp_stride[i] = nb[i] / type_size; - } - - std::reverse(tmp_ne, tmp_ne + dims); - std::reverse(tmp_stride, tmp_stride + dims); - - int64_t acl_storage_len = 0; - for (int i = 0; i < dims; i++) { - acl_storage_len += (ne[i] - 1) * nb[i]; - } - - aclTensor* acl_tensor = - aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size, - format, &acl_storage_len, 1, data_ptr); - - return acl_tensor; -} - int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1, int64_t* bcast_src0_ne, diff --git a/ggml/src/ggml-cann/acl_tensor.h b/ggml/src/ggml-cann/acl_tensor.h index 7d0bf04e..4734a9cb 100644 --- a/ggml/src/ggml-cann/acl_tensor.h +++ b/ggml/src/ggml-cann/acl_tensor.h @@ -23,6 +23,9 @@ #ifndef CANN_ACL_TENSOR_H #define CANN_ACL_TENSOR_H +#include <algorithm> +#include <cstring> + #include <aclnn/aclnn_base.h> #include "common.h" @@ -65,7 +68,8 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null size_t offset = 0); /** - * @brief Creates an ACL tensor from provided parameters. + * @brief Template for creating an ACL tensor from provided parameters. typename TYPE + * should be size_t or float. * * @details This function creates an ACL tensor using the provided data pointer, * data type, dimensions, strides, format, offset, and additional parameters. @@ -83,10 +87,34 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null * @param offset Offset in bytes for the ACL tensor data. Defaults to 0. * @return Pointer to the created ACL tensor. */ +template<typename TYPE> aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype, - size_t type_size, int64_t* ne, size_t* nb, - int64_t dims, aclFormat format = ACL_FORMAT_ND, - size_t offset = 0); + TYPE type_size, int64_t* ne, TYPE* nb, + int64_t dims, + aclFormat format = ACL_FORMAT_ND, + size_t offset = 0) { + int64_t tmp_ne[GGML_MAX_DIMS * 2]; + int64_t tmp_stride[GGML_MAX_DIMS * 2]; + + memcpy(tmp_ne, ne, dims * sizeof(int64_t)); + for (int i = 0; i < dims; i++) { + tmp_stride[i] = nb[i] / type_size; + } + + std::reverse(tmp_ne, tmp_ne + dims); + std::reverse(tmp_stride, tmp_stride + dims); + + int64_t acl_storage_len = 0; + for (int i = 0; i < dims; i++) { + acl_storage_len += (ne[i] - 1) * nb[i]; + } + + aclTensor* acl_tensor = + aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size, + format, &acl_storage_len, 1, data_ptr); + + return acl_tensor; +} /** * @brief Checks if tensors require broadcasting based on their shapes. diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index a02efc82..8c4132f5 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -464,9 +464,11 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclTensor* acl_src = ggml_cann_create_tensor(src); aclTensor* acl_dst = ggml_cann_create_tensor(dst); - const float eps = 1e-6f; // TODO: make this a parameter int n_groups = dst->op_params[0]; + float eps; + memcpy(&eps, dst->op_params + 1, sizeof(float)); + uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -844,7 +846,7 @@ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_max_pool2d(ctx, dst); break; case GGML_OP_POOL_COUNT: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -910,6 +912,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->ne); return; } + if (dst->type == GGML_TYPE_Q4_0) { + aclrtlaunch_ascendc_quantize_f16_to_q4_0( + 24, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne); + return; + } if (dst->type == GGML_TYPE_F16) { if (ggml_are_same_shape(src, dst)) { cann_copy(ctx, acl_src, acl_dst); @@ -931,9 +940,9 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); return; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (dst->type == GGML_TYPE_F32) { if (ggml_are_same_shape(src, dst)) { @@ -955,12 +964,12 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); return; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } // TODO - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else if (src->type == GGML_TYPE_F32) { // TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size // && nb0 == type_size) @@ -971,6 +980,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->ne); return; } + if (dst->type == GGML_TYPE_Q4_0) { + aclrtlaunch_ascendc_quantize_f32_to_q4_0( + 24, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne); + return; + } if (dst->type == GGML_TYPE_F32) { if (ggml_are_same_shape(src, dst)) { cann_copy(ctx, acl_src, acl_dst); @@ -991,10 +1007,10 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); return; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else { // TODO: dst not contiguous - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } if (dst->type == GGML_TYPE_F16) { @@ -1017,11 +1033,11 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); return; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } // TODO - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else { if (ggml_are_same_shape(src, dst)) { cann_copy(ctx, acl_src, acl_dst); @@ -1029,7 +1045,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclDestroyTensor(acl_dst)); return; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -1312,6 +1328,111 @@ aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize, #ifdef __cplusplus } #endif + +static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx, + ggml_tensor* dst, + ggml_tensor* src1, + aclTensor* tmp_cast_tensor, + aclTensor* tmp_im2col_tensor) { + // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW] + int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]}; + size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]}; + aclTensor* acl_dst = + ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1); + + int64_t permute_dim[] = {0, 2, 1}; + if (src1->type != dst->type) { + aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3); + } else { + aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3); + } + + // release + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +static void ggml_cann_im2col_1d_post_process( + ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_tensor* src1, + aclTensor* tmp_cast_tensor, aclTensor* tmp_im2col_tensor, + const std::vector<int64_t>& im2col_op_params) { + // get params + const int64_t KH = im2col_op_params[0]; + const int64_t KW = im2col_op_params[1]; + const int64_t IW = im2col_op_params[2]; + const int64_t IC = im2col_op_params[3]; + const int64_t N = im2col_op_params[4]; + const int64_t OH = im2col_op_params[5]; + const int64_t OW = im2col_op_params[6]; + const int64_t s0 = im2col_op_params[7]; + const int64_t p0 = im2col_op_params[8]; + const int64_t d0 = im2col_op_params[9]; + const int64_t n_bytes_factor = im2col_op_params[10]; + + // Permute: [N, IC * KH * KW, OW * OH] -> + // [N, OW * OH * n_bytes_factor, IC * KH * KW] + aclTensor* tmp_permute_tensor = nullptr; + ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool()); + tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor); + void* tmp_permute_buffer = tmp_permute_allocator.get(); + + int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N}; + size_t tmp_permute_nb[GGML_MAX_DIMS - 1]; + tmp_permute_nb[0] = ggml_type_size(dst->type); + for (int i = 1; i < GGML_MAX_DIMS - 1; i++) { + tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1]; + } + + tmp_permute_tensor = ggml_cann_create_tensor( + tmp_permute_buffer, ggml_cann_type_mapping(dst->type), + ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb, + GGML_MAX_DIMS - 1, ACL_FORMAT_ND); + + int64_t permute_dim[] = {0, 2, 1}; + if (src1->type != dst->type) { + aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3); + } else { + aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim, + 3); + } + + // number of times the kernel moves in W dimension + const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1; + size_t offset; + void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer; + + // memory copy with offset to restore 1D im2col from 2d + if (IC > 1) { + offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type); + size_t size_cpy = KH * KW * ggml_type_size(dst->type); + + for (int c = 0; c < IC; c++) { + cur_permute_buffer = (char*)tmp_permute_buffer + offset + + KH * KW * c * ggml_type_size(dst->type); + cur_dst_buffer = (char*)dst->data + + c * KH * KW * n_step_w * ggml_type_size(dst->type); + + for (int i = 0; i < n_step_w; i++) { + ACL_CHECK(aclrtMemcpyAsync( + cur_dst_buffer, size_cpy, cur_permute_buffer, size_cpy, + ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); + cur_dst_buffer = + (char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type); + cur_permute_buffer = (char*)cur_permute_buffer + + KH * KW * IC * ggml_type_size(dst->type); + } + } + } else { + offset = KH * KW * n_step_w * + ggml_type_size(dst->type); // equal to ggml_nbytes(dst) + ACL_CHECK(aclrtMemcpyAsync(dst->data, offset, + (char*)tmp_permute_buffer + offset, offset, + ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); + } + + // release + ACL_CHECK(aclDestroyTensor(tmp_permute_tensor)); +} + void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src0 = dst->src[0]; // kernel ggml_tensor* src1 = dst->src[1]; // input @@ -1320,21 +1441,23 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); + GGML_TENSOR_BINARY_OP_LOCALS; + + // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D + // im2col and do post-processing to restore it to 1D. + const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t s1 = is_2D ? ((const int32_t*)(dst->op_params))[1] : 1; const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; - const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t p1 = is_2D ? ((const int32_t*)(dst->op_params))[3] : 1; const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; - const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; - const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; + const int32_t d1 = is_2D ? ((const int32_t*)(dst->op_params))[5] : 1; - GGML_TENSOR_BINARY_OP_LOCALS; - - const int64_t N = is_2D ? ne13 : ne12; - const int64_t IC = is_2D ? ne12 : ne11; - - const int64_t KH = is_2D ? ne01 : 1; + const int64_t N = ne13; + const int64_t IC = ne12; + const int64_t KH = ne01; const int64_t KW = ne00; + const int64_t IW = ne10; const int64_t OH = is_2D ? ne2 : 1; const int64_t OW = ne1; @@ -1342,9 +1465,12 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); - // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH] + // memory allocated increased to 3x when is_2D == false + const int64_t n_bytes_factor = is_2D ? 1 : 3; + + // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor] aclTensor* acl_src1 = ggml_cann_create_tensor(src1); - int64_t tmp_im2col_ne[] = {OW * OH, IC * KH * KW, N}; + int64_t tmp_im2col_ne[] = {OW * OH * n_bytes_factor, IC * KH * KW, N}; size_t tmp_im2col_nb[GGML_MAX_DIMS - 1]; tmp_im2col_nb[0] = ggml_type_size(src1->type); @@ -1356,8 +1482,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { // If dst is f16, tmp_buffer is f32, we need alloc src.typesize * // dst.elemcount. ggml_cann_pool_alloc im2col_allocator( - ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1)); + ctx.pool(), + ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor); void* tmp_im2col_buffer = im2col_allocator.get(); + aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor( tmp_im2col_buffer, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb, @@ -1380,8 +1508,9 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { paddings, strides, tmp_im2col_tensor, &workspaceSize, &executor)); + ggml_cann_pool_alloc workspace_allocator(ctx.pool()); if (workspaceSize > 0) { - ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspace_allocator.alloc(workspaceSize); workspaceAddr = workspace_allocator.get(); } @@ -1391,9 +1520,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { // Cast if dst is f16. aclTensor* tmp_cast_tensor = nullptr; ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool()); + void* tmp_cast_buffer = nullptr; if (src1->type != dst->type) { - tmp_cast_allocator.alloc(ggml_nbytes(dst)); - void* tmp_cast_buffer = tmp_cast_allocator.get(); + tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor); + tmp_cast_buffer = tmp_cast_allocator.get(); size_t temp_cast_nb[GGML_MAX_DIMS - 1]; temp_cast_nb[0] = ggml_type_size(dst->type); for (int i = 1; i < GGML_MAX_DIMS - 1; i++) { @@ -1408,24 +1538,21 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_type_mapping(dst->type)); } - // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW] - int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]}; - size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]}; - aclTensor* acl_dst = - ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1); - - int64_t permute_dim[] = {0, 2, 1}; - if (src1->type != dst->type) { - aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3); + // post-processing + if (is_2D) { + ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor, + tmp_im2col_tensor); } else { - aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3); + std::vector<int64_t> im2col_op_params = { + KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor}; + ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor, + tmp_im2col_tensor, im2col_op_params); } // release ACL_CHECK(aclDestroyTensor(acl_src1)); ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor)); ACL_CHECK(aclDestroyTensor(tmp_cast_tensor)); - ACL_CHECK(aclDestroyTensor(acl_dst)); ACL_CHECK(aclDestroyIntArray(kernel_size)); ACL_CHECK(aclDestroyIntArray(dilations)); ACL_CHECK(aclDestroyIntArray(paddings)); @@ -2219,7 +2346,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -2352,21 +2479,33 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, * @param dst The destination tensor where the result of the matrix * multiplication will be stored. */ -static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, - ggml_tensor* dst) { +static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, + ggml_tensor* dst, + const enum ggml_type type) { ggml_tensor* src0 = dst->src[0]; // weight ggml_tensor* src1 = dst->src[1]; // input // The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC // is regarded as batch. weight need transpose. int64_t weight_ne[] = {src0->ne[1], src0->ne[0]}; - size_t weight_elem_size = sizeof(uint8_t); - size_t weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size}; + float weight_elem_size; + if (type == GGML_TYPE_Q4_0) { + weight_elem_size = float(sizeof(uint8_t)) / 2; + } + else if (type == GGML_TYPE_Q8_0) { + weight_elem_size = float(sizeof(uint8_t)); + } + else { + GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT"); + } + float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size}; + // size of one matrix is element_size * height * width. size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1]; size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3]; // scale stored at the end of weight. Also need transpose. + GGML_ASSERT(QK4_0 == QK8_0); int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0}; size_t scale_elem_size = sizeof(uint16_t); size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, @@ -2381,10 +2520,10 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]}; size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1]; + ggml_cann_pool_alloc input_alloctor(ctx.pool()); if (src1->type != GGML_TYPE_F16) { aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1); - ggml_cann_pool_alloc input_alloctor( - ctx.pool(), ggml_nelements(src1) * input_elem_size); + input_alloctor.alloc(ggml_nelements(src1) * input_elem_size); input_buffer = input_alloctor.get(); int64_t* input_cast_ne = src1->ne; @@ -2430,8 +2569,9 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16, input_elem_size, input_ne, input_nb, 2); aclTensor* acl_weight_tensor = ggml_cann_create_tensor( - (char*)src0->data + batch0 * weight_stride, ACL_INT8, - weight_elem_size, weight_ne, weight_nb, 2); + (char*)src0->data + batch0 * weight_stride, + ggml_cann_type_mapping(type), weight_elem_size, weight_ne, + weight_nb, 2); aclTensor* acl_scale_tensor = ggml_cann_create_tensor( scale_offset + batch0 * scale_stride, ACL_FLOAT16, scale_elem_size, scale_ne, scale_nb, 2); @@ -2485,14 +2625,12 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { case GGML_TYPE_F16: ggml_cann_mat_mul_fp(ctx, dst); break; - // case GGML_TYPE_Q4_0: - // ggml_cann_mul_mat_q4_0(ctx, dst); - // break; + case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: - ggml_cann_mul_mat_q8_0(ctx, dst); + ggml_cann_mul_mat_quant(ctx, dst, type); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } diff --git a/ggml/src/ggml-cann/kernels/CMakeLists.txt b/ggml/src/ggml-cann/kernels/CMakeLists.txt index f12a4d43..5b4fef91 100644 --- a/ggml/src/ggml-cann/kernels/CMakeLists.txt +++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt @@ -9,6 +9,7 @@ file(GLOB SRC_FILES get_row_q8_0.cpp quantize_f32_q8_0.cpp quantize_f16_q8_0.cpp + quantize_float_to_q4_0.cpp dup.cpp ) @@ -29,4 +30,4 @@ ascendc_library(ascendc_kernels STATIC ${SRC_FILES} ) -#ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) +# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) diff --git a/ggml/src/ggml-cann/kernels/ascendc_kernels.h b/ggml/src/ggml-cann/kernels/ascendc_kernels.h index bf891475..7e153208 100644 --- a/ggml/src/ggml-cann/kernels/ascendc_kernels.h +++ b/ggml/src/ggml-cann/kernels/ascendc_kernels.h @@ -8,6 +8,8 @@ #include "aclrtlaunch_ascendc_quantize_f32_q8_0.h" #include "aclrtlaunch_ascendc_quantize_f16_q8_0.h" +#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h" +#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h" #include "aclrtlaunch_ascendc_dup_by_rows_fp16.h" #include "aclrtlaunch_ascendc_dup_by_rows_fp32.h" diff --git a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp new file mode 100644 index 00000000..9c8c86b6 --- /dev/null +++ b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp @@ -0,0 +1,278 @@ +#include "kernel_operator.h" + +using namespace AscendC; + +#define BUFFER_NUM 2 +#define Group_Size 32 + +template <typename SRC_T> +class QUANTIZE_FLOAT_TO_Q4_0 { + public: + __aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {} + __aicore__ inline void init(GM_ADDR input, GM_ADDR output, + int64_t *input_ne_ub, size_t *input_nb_ub, + int64_t *output_ne_ub) { + // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4], + // permute=[0,0,0,0]): + // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL + int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + // input stride of data elements + for (int i = 0; i < 4; i++) { + input_ne[i] = input_ne_ub[i]; + input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; + output_ne[i] = output_ne_ub[i]; + } + + // output stride of data elements + output_stride[0] = 1; + for (int i = 1; i < 4; i++) { + output_stride[i] = output_stride[i - 1] * output_ne[i - 1]; + } + + // scale saved one by one after data:. [group1_scale, group2_scale, ...] + scale_ne = input_ne; + scale_stride[0] = 1; + scale_stride[1] = input_ne[0] / Group_Size; + for (int i = 2; i < 4; i++) { + scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; + } + + // split input tensor by rows. + uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3]; + dr = nr / op_block_num; + + uint64_t tails = nr % op_block_num; + if (op_block_idx < tails) { + dr += 1; + ir = dr * op_block_idx; + } else { + ir = dr * op_block_idx + tails; + } + + group_size_in_row = scale_stride[1]; + int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] * + output_ne[3] * sizeof(uint8_t) / 2; + + input_gm.SetGlobalBuffer((__gm__ SRC_T *)input); + output_gm.SetGlobalBuffer((__gm__ int8_t *)output); + scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir * + group_size_in_row * + sizeof(half))); + + pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T)); + pipe.InitBuffer(output_queue, BUFFER_NUM, + Group_Size * sizeof(int8_t) / 2); + pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float)); + pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float)); + pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float)); + pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float)); + pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half)); + pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t)); + pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half)); + } + + __aicore__ inline void copy_in(uint32_t offset) { + LocalTensor<SRC_T> input_local = input_queue.AllocTensor<SRC_T>(); + DataCopy(input_local, input_gm[offset], Group_Size); + input_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out(uint32_t offset) { + // reinterpretcast Group_Size(32) * int4b_t to Group_Size / 2 * int8_t, + // and using DataCopyPad to avoid 32 bits align. + LocalTensor<int4b_t> output_local = output_queue.DeQue<int4b_t>(); + LocalTensor<int8_t> output_int8_local = + output_local.ReinterpretCast<int8_t>(); + + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t); + DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams); + + output_queue.FreeTensor(output_local); + } + + __aicore__ inline void input_to_cast(LocalTensor<float> cast_local, + LocalTensor<float> input_local) { + DataCopy(cast_local, input_local, Group_Size); + } + + __aicore__ inline void input_to_cast(LocalTensor<float> cast_local, + LocalTensor<half> input_local) { + Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size); + } + + __aicore__ inline half calculate_group(int64_t row, int64_t group) { + const int64_t i3 = row / (input_ne[1] * input_ne[2]); + const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1]; + const int64_t i1 = + row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1]; + + const int64_t input_offset = i1 * input_stride[1] + + i2 * input_stride[2] + + i3 * input_stride[3] + Group_Size * group; + + // output_offset is stride for output_gm which datatype is int8_t and + // divided by 2 is needed for int4b_t. + const int64_t output_offset = (i1 * output_stride[1] + + i2 * output_stride[2] + + i3 * output_stride[3] + + Group_Size * group) / 2; + copy_in(input_offset); + + LocalTensor<SRC_T> input_local = input_queue.DeQue<SRC_T>(); + LocalTensor<int4b_t> output_local = output_queue.AllocTensor<int4b_t>(); + LocalTensor<float> cast_local = cast_queue.AllocTensor<float>(); + LocalTensor<float> work_local = work_queue.AllocTensor<float>(); + LocalTensor<float> max_local = max_queue.AllocTensor<float>(); + LocalTensor<float> min_local = min_queue.AllocTensor<float>(); + LocalTensor<int8_t> int8_local = int8_queue.AllocTensor<int8_t>(); + LocalTensor<half> half_local = half_queue.AllocTensor<half>(); + + input_to_cast(cast_local, input_local); + + ReduceMax(max_local, cast_local, work_local, Group_Size); + ReduceMin(min_local, cast_local, work_local, Group_Size); + const float max_value = max_local.GetValue(0); + const float min_value = min_local.GetValue(0); + float d = max_value; + if (min_value < 0 && (-1 * min_value) > max_value) { + d = min_value; + } + + d = d / (-8); + if (d != 0) { + Muls(cast_local, cast_local, 1.0f / d, Group_Size); + } + + // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7] + float scalar = 8.5f; + Adds(cast_local, cast_local, scalar, Group_Size); + Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size); + scalar = 15.0f; + Mins(cast_local, cast_local, scalar, Group_Size); + scalar = -8.0f; + Adds(cast_local, cast_local, scalar, Group_Size); + + // float->half->int4b + Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size); + Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size); + + output_queue.EnQue(output_local); + copy_out(output_offset); + + input_queue.FreeTensor(input_local); + work_queue.FreeTensor(work_local); + max_queue.FreeTensor(max_local); + min_queue.FreeTensor(min_local); + int8_queue.FreeTensor(int8_local); + half_queue.FreeTensor(half_local); + cast_queue.FreeTensor(cast_local); + return (half)d; + } + + __aicore__ inline void calculate() { + LocalTensor<half> scale_local = scale_queue.AllocTensor<half>(); + uint32_t scale_local_offset = 0; + uint32_t scale_global_offset = 0; + for (int64_t i = ir; i < ir + dr; i++) { + for (int64_t j = 0; j < group_size_in_row; j++) { + half scale = calculate_group(i, j); + scale_local.SetValue(scale_local_offset++, scale); + // Copy Group_Size/2 length data each time. + if (scale_local_offset == Group_Size / 2) { + scale_local_offset = 0; + // TODO: OPTIMIZE ME + pipe_barrier(PIPE_ALL); + DataCopy(scale_gm[scale_global_offset], scale_local, + Group_Size / 2); + pipe_barrier(PIPE_ALL); + scale_global_offset += Group_Size / 2; + } + } + } + + if (scale_local_offset != 0) { + pipe_barrier(PIPE_ALL); + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = scale_local_offset * sizeof(half); + DataCopyPad(scale_gm[scale_global_offset], scale_local, + dataCopyParams); + pipe_barrier(PIPE_ALL); + } + scale_queue.FreeTensor(scale_local); + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t *scale_ne; + size_t scale_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + int64_t group_size_in_row; + + int64_t ir; + int64_t dr; + + TPipe pipe; + GlobalTensor<SRC_T> input_gm; + GlobalTensor<half> scale_gm; + GlobalTensor<int8_t> output_gm; + TQue<QuePosition::VECIN, BUFFER_NUM> input_queue; + TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue; + TQue<QuePosition::VECIN, BUFFER_NUM> work_queue; + TQue<QuePosition::VECOUT, BUFFER_NUM> max_queue; + TQue<QuePosition::VECOUT, BUFFER_NUM> min_queue; + TQue<QuePosition::VECOUT, BUFFER_NUM> scale_queue; + TQue<QuePosition::VECOUT, BUFFER_NUM> cast_queue; + TQue<QuePosition::VECOUT, BUFFER_NUM> int8_queue; + TQue<QuePosition::VECOUT, BUFFER_NUM> half_queue; +}; + +template <typename T> +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0( + GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t output_ne_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + + QUANTIZE_FLOAT_TO_Q4_0<half> op; + op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); + op.calculate(); +} + +extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0( + GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t output_ne_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + + QUANTIZE_FLOAT_TO_Q4_0<float> op; + op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); + op.calculate(); +} diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index b58e3e4c..57fdeb82 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -26,7 +26,11 @@ typedef half2 ggml_half2; #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_CUDA) +#if defined(GGML_COMMON_DECL_MUSA) +#include <musa_fp16.h> +#else #include <cuda_fp16.h> +#endif #include <cstdint> typedef half ggml_half; @@ -527,7 +531,7 @@ static_assert(sizeof(block_iq6_k) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K #define GGML_TABLE_END() }; #define GGML_COMMON_IMPL -#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP) +#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP) || defined(GGML_COMMON_IMPL_MUSA) #include <cstdint> #define GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = { diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 7641d5b5..f594cd26 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -98,7 +98,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line); GGML_CUDA_LOG_ERROR(" %s\n", stmt); // abort with GGML_ASSERT to get a stack trace - GGML_ASSERT(!"CUDA error"); + GGML_ABORT("CUDA error"); } // this is faster on Windows @@ -130,7 +130,22 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) } return res; #else + +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) + cudaError_t err; + if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) + { + err = cudaMallocManaged(ptr, size); + } + else + { + err = cudaMalloc(ptr, size); + } + return err; +#else return cudaMalloc(ptr, size); +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) + #endif } @@ -167,7 +182,7 @@ static ggml_cuda_device_info ggml_cuda_init() { for (int id = 0; id < info.device_count; ++id) { int device_vmm = 0; -#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA) CUdevice device; CU_CHECK(cuDeviceGet(&device, id)); CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device)); @@ -179,7 +194,7 @@ static ggml_cuda_device_info ggml_cuda_init() { alloc_prop.location.id = id; CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); } -#endif // !defined(GGML_USE_HIPBLAS) +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA) info.devices[id].vmm = !!device_vmm; cudaDeviceProp prop; @@ -315,7 +330,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { }; // pool with virtual memory -#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA) struct ggml_cuda_pool_vmm : public ggml_cuda_pool { static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB @@ -409,14 +424,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool { GGML_ASSERT(ptr == (void *) (pool_addr + pool_used)); } }; -#endif // !defined(GGML_USE_HIPBLAS) +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA) std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) { -#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA) if (ggml_cuda_info().devices[device].vmm) { return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device)); } -#endif +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA) return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device)); } @@ -1341,7 +1356,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) { static cudaError_t ggml_cuda_Memcpy2DPeerAsync( void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) { -#if !defined(GGML_USE_HIPBLAS) +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices cudaMemcpy3DPeerParms p = {}; p.dstDevice = dstDevice; @@ -1355,7 +1370,7 @@ static cudaError_t ggml_cuda_Memcpy2DPeerAsync( GGML_UNUSED(dstDevice); GGML_UNUSED(srcDevice); return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream); -#endif // !defined(GGML_USE_HIPBLAS) +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) } static void ggml_cuda_op_mul_mat( @@ -1486,7 +1501,7 @@ static void ggml_cuda_op_mul_mat( } // If src0 is on a temporary compute buffers (partial offloading) there may be some padding that needs to be cleared: - if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) { + if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) { const int64_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00); const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING); CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream)); @@ -1596,7 +1611,7 @@ static void ggml_cuda_op_mul_mat( CUDA_CHECK(ggml_cuda_cpy_tensor_2d( src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (quantize_src1 && !src1_is_contiguous) { @@ -1828,6 +1843,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co } } #else +#ifdef GGML_USE_MUSA + GGML_ASSERT(false); +#else // !GGML_USE_MUSA if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) { // there is no broadcast and src0, src1 are contiguous across dims 2, 3 // use cublasGemmStridedBatchedEx @@ -1870,6 +1888,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } +#endif // GGML_USE_MUSA #endif if (dst->op_params[0] == GGML_PREC_DEFAULT) { @@ -1881,10 +1900,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer); - bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) + bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 - && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2 - && src1->ne[1] == 1; + && src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1; bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE; @@ -2340,33 +2358,35 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, } GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) { - GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst)); - ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer; ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer; - if (!ggml_backend_buffer_is_cuda(src->buffer)) { + if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) { return false; } - if (!ggml_backend_buffer_is_cuda(dst->buffer)) { + if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) { return false; } - // device -> device + // device -> device copy ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context; ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context; - if (backend_src != backend_dst) { - ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context; - ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context; + ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context; + ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context; - GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device); - GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device); + if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) { +#ifndef NDEBUG + GGML_CUDA_LOG_WARN("%s: backend and buffer devices do not match\n", __func__); +#endif + return false; + } + if (backend_src != backend_dst) { // copy on src stream if (cuda_ctx_src->device == cuda_ctx_dst->device) { - CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream())); + CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream())); } else { #ifdef GGML_CUDA_NO_PEER_COPY return false; @@ -2375,7 +2395,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ #endif } - // record event on src stream + // record event on src stream after the copy if (!cuda_ctx_src->copy_event) { ggml_cuda_set_device(cuda_ctx_src->device); CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming)); @@ -2387,7 +2407,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0)); } else { // src and dst are on the same backend - CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream())); + CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream())); } return true; } @@ -2724,11 +2744,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_OP_MUL_MAT_ID: { struct ggml_tensor * a = op->src[0]; - if (op->op == GGML_OP_MUL_MAT) { - struct ggml_tensor * b = op->src[1]; - if (a->ne[3] != b->ne[3]) { - return false; - } + struct ggml_tensor * b = op->src[1]; + if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) { + return false; + } + if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) { + return false; } switch (a->type) { case GGML_TYPE_F32: @@ -2867,7 +2888,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons return true; case GGML_OP_FLASH_ATTN_EXT: #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128; + return (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) || op->src[0]->ne[0] == 128; #else if (op->src[0]->ne[0] == 128) { return true; @@ -2953,7 +2974,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event)); #endif - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -3035,7 +3056,7 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size return false; } -#if CUDART_VERSION >= 11100 +#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA) cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly); if (err != cudaSuccess) { // clear the error diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu index 15757ca1..607ded85 100644 --- a/ggml/src/ggml-cuda/argsort.cu +++ b/ggml/src/ggml-cuda/argsort.cu @@ -81,7 +81,7 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co } else if (order == GGML_SORT_ORDER_DESC) { k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu index 76cc01b2..62d115f1 100644 --- a/ggml/src/ggml-cuda/binbcast.cu +++ b/ggml/src/ggml-cuda/binbcast.cu @@ -259,7 +259,7 @@ static void ggml_cuda_op_bin_bcast( } else { fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 07a53bcd..9aff6c13 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -12,6 +12,10 @@ #else #define GGML_COMMON_DECL_CUDA #define GGML_COMMON_IMPL_CUDA +#if defined(GGML_USE_MUSA) +#define GGML_COMMON_DECL_MUSA +#define GGML_COMMON_IMPL_MUSA +#endif #endif #include "ggml-common.h" @@ -23,111 +27,11 @@ #include <vector> #if defined(GGML_USE_HIPBLAS) -#include <hip/hip_runtime.h> -#include <hipblas/hipblas.h> -#include <hip/hip_fp16.h> -#ifdef __HIP_PLATFORM_AMD__ -// for rocblas_initialize() -#include "rocblas/rocblas.h" -#endif // __HIP_PLATFORM_AMD__ -#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F -#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F -#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT -#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_OP_T HIPBLAS_OP_T -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUBLAS_TF32_TENSOR_OP_MATH 0 -#define CUDA_R_16F HIPBLAS_R_16F -#define CUDA_R_32F HIPBLAS_R_32F -#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) -#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 -#define cublasCreate hipblasCreate -#define cublasDestroy hipblasDestroy -#define cublasGemmEx hipblasGemmEx -#define cublasGemmBatchedEx hipblasGemmBatchedEx -#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx -#define cublasHandle_t hipblasHandle_t -#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS -#define cublasSetStream hipblasSetStream -#define cublasSgemm hipblasSgemm -#define cublasStatus_t hipblasStatus_t -#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6 -#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer -#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess -#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess -#define cudaDeviceProp hipDeviceProp_t -#define cudaDeviceSynchronize hipDeviceSynchronize -#define cudaError_t hipError_t -#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled -#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled -#define cudaEventCreateWithFlags hipEventCreateWithFlags -#define cudaEventDisableTiming hipEventDisableTiming -#define cudaEventRecord hipEventRecord -#define cudaEventSynchronize hipEventSynchronize -#define cudaEvent_t hipEvent_t -#define cudaEventDestroy hipEventDestroy -#define cudaFree hipFree -#define cudaFreeHost hipHostFree -#define cudaGetDevice hipGetDevice -#define cudaGetDeviceCount hipGetDeviceCount -#define cudaGetDeviceProperties hipGetDeviceProperties -#define cudaGetErrorString hipGetErrorString -#define cudaGetLastError hipGetLastError -#define cudaHostRegister hipHostRegister -#define cudaHostRegisterPortable hipHostRegisterPortable -#define cudaHostRegisterReadOnly hipHostRegisterReadOnly -#define cudaHostUnregister hipHostUnregister -#define cudaLaunchHostFunc hipLaunchHostFunc -#define cudaMalloc hipMalloc -#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) -#define cudaMemcpy hipMemcpy -#define cudaMemcpyAsync hipMemcpyAsync -#define cudaMemcpyPeerAsync hipMemcpyPeerAsync -#define cudaMemcpy2DAsync hipMemcpy2DAsync -#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice -#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost -#define cudaMemcpyHostToDevice hipMemcpyHostToDevice -#define cudaMemcpyKind hipMemcpyKind -#define cudaMemset hipMemset -#define cudaMemsetAsync hipMemsetAsync -#define cudaMemGetInfo hipMemGetInfo -#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize -#define cudaSetDevice hipSetDevice -#define cudaStreamCreateWithFlags hipStreamCreateWithFlags -#define cudaStreamDestroy hipStreamDestroy -#define cudaStreamFireAndForget hipStreamFireAndForget -#define cudaStreamNonBlocking hipStreamNonBlocking -#define cudaStreamPerThread hipStreamPerThread -#define cudaStreamSynchronize hipStreamSynchronize -#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) -#define cudaStream_t hipStream_t -#define cudaSuccess hipSuccess -#define __trap() do { abort(); __builtin_unreachable(); } while(0) -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED -#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED -#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE -#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH -#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR -#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED -#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR -#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#include "vendors/hip.h" +#elif defined(GGML_USE_MUSA) +#include "vendors/musa.h" #else -#include <cuda_runtime.h> -#include <cuda.h> -#include <cublas_v2.h> -#include <cuda_fp16.h> - -#if CUDART_VERSION < 11020 -#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED -#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH -#define CUBLAS_COMPUTE_16F CUDA_R_16F -#define CUBLAS_COMPUTE_32F CUDA_R_32F -#define cublasComputeType_t cudaDataType_t -#endif // CUDART_VERSION < 11020 - +#include "vendors/cuda.h" #endif // defined(GGML_USE_HIPBLAS) #define STRINGIZE_IMPL(...) #__VA_ARGS__ @@ -168,7 +72,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in #define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString) -#if CUDART_VERSION >= 12000 +#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA) static const char * cublas_get_error_str(const cublasStatus_t err) { return cublasGetStatusString(err); } @@ -200,7 +104,7 @@ static const char * cu_get_error_str(CUresult err) { #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str) #endif -#if CUDART_VERSION >= 11100 +#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA) #define GGML_CUDA_ASSUME(x) __builtin_assume(x) #else #define GGML_CUDA_ASSUME(x) @@ -212,93 +116,7 @@ typedef half2 dfloat2; #else typedef float dfloat; // dequantize float typedef float2 dfloat2; -#endif //GGML_CUDA_F16 - -#if defined(GGML_USE_HIPBLAS) -#define __CUDA_ARCH__ 1300 - -#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \ - defined(__gfx1150__) || defined(__gfx1151__) -#define RDNA3 -#endif - -#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \ - defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__) -#define RDNA2 -#endif - -#if defined(__gfx1010__) || defined(__gfx1012__) -#define RDNA1 -#endif - -#ifndef __has_builtin - #define __has_builtin(x) 0 -#endif - -typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); -typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); -static __device__ __forceinline__ int __vsubss4(const int a, const int b) { - const int8x4_t va = reinterpret_cast<const int8x4_t&>(a); - const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b); -#if __has_builtin(__builtin_elementwise_sub_sat) - const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); - return reinterpret_cast<const int &>(c); -#else - int8x4_t c; - int16_t tmp; -#pragma unroll - for (int i = 0; i < 4; i++) { - tmp = va[i] - vb[i]; - if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max(); - if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min(); - c[i] = tmp; - } - return reinterpret_cast<int &>(c); -#endif // __has_builtin(__builtin_elementwise_sub_sat) -} - -static __device__ __forceinline__ int __vsub4(const int a, const int b) { - return __vsubss4(a, b); -} - -static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) { - const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a); - const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b); - unsigned int c; - uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c); -#pragma unroll - for (int i = 0; i < 4; ++i) { - vc[i] = va[i] == vb[i] ? 0xff : 0x00; - } - return c; -} - -static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) { - const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a); - const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b); - unsigned int c; - uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c); -#pragma unroll - for (int i = 0; i < 4; ++i) { - vc[i] = va[i] == vb[i] ? 0x00 : 0xff; - } - return c; -} - -#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000 -// __shfl_xor() for half2 was added in ROCm 5.6 -static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) { - typedef union half2_b32 { - half2 val; - int b32; - } half2_b32_t; - half2_b32_t tmp; - tmp.val = var; - tmp.b32 = __shfl_xor(tmp.b32, laneMask, width); - return tmp.val; -} -#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000 -#endif // defined(GGML_USE_HIPBLAS) +#endif // GGML_CUDA_F16 #if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL #define FP16_AVAILABLE @@ -348,7 +166,7 @@ static __device__ void no_device_code( #ifdef __CUDA_ARCH__ #define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__)) #else -#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.") +#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.") #endif // __CUDA_ARCH__ static __device__ __forceinline__ float warp_reduce_sum(float x) { @@ -455,7 +273,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b))); return mask_low | mask_high; } -#endif // CUDART_VERSION < 12000 +#endif // CUDART_VERSION < CUDART_HMASK static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index f76c80dc..70305404 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -162,7 +162,6 @@ static __global__ void dequantize_block_iq2_tn(const void * __restrict__ vx, dst const int64_t tid = threadIdx.x; const int64_t n = tid/32; const int64_t l = tid - 32*n; - const int64_t is = 8*n + l/16; const uint8_t q = x[i].qs[32*n + l]; dst_t * y = yy + i*QK_K + 128*n; diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index 3db57034..aad34bfe 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -451,7 +451,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg } else { fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -484,6 +484,6 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) { } else { fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu index 174489e0..96a5adef 100644 --- a/ggml/src/ggml-cuda/dmmv.cu +++ b/ggml/src/ggml-cuda/dmmv.cu @@ -500,7 +500,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons } static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead const dim3 block_nums(block_num_y, 1, 1); @@ -510,7 +510,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, } static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -519,7 +519,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, } static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -528,7 +528,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, } static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -537,7 +537,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, } static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -588,7 +588,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f } static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -662,7 +662,7 @@ void ggml_cuda_op_dequantize_mul_mat_vec( convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } @@ -672,3 +672,12 @@ void ggml_cuda_op_dequantize_mul_mat_vec( GGML_UNUSED(src1_ncols); GGML_UNUSED(src1_padded_row_size); } + +bool ggml_cuda_dmmv_type_supported(ggml_type src0_type) { + return src0_type == GGML_TYPE_Q4_0 || src0_type == GGML_TYPE_Q4_1 || + src0_type == GGML_TYPE_Q5_0 || src0_type == GGML_TYPE_Q5_1 || + src0_type == GGML_TYPE_Q8_0 || src0_type == GGML_TYPE_Q2_K || + src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q4_K || + src0_type == GGML_TYPE_Q5_K || src0_type == GGML_TYPE_Q6_K || + src0_type == GGML_TYPE_F16; +} diff --git a/ggml/src/ggml-cuda/dmmv.cuh b/ggml/src/ggml-cuda/dmmv.cuh index 4c5ebd47..e727eb97 100644 --- a/ggml/src/ggml-cuda/dmmv.cuh +++ b/ggml/src/ggml-cuda/dmmv.cuh @@ -16,3 +16,5 @@ void ggml_cuda_op_dequantize_mul_mat_vec( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, cudaStream_t stream); + +bool ggml_cuda_dmmv_type_supported(ggml_type src0_type); diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index f24312dd..950fd93d 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -564,7 +564,7 @@ static void on_no_fattn_vec_case(const int D) { fprintf(stderr, "Unsupported KV type combination for head_size 64.\n"); fprintf(stderr, "By default only f16 KV cache is supported.\n"); fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for V cache quantization support.\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else if (D == 128) { fprintf(stderr, "Unsupported KV type combination for head_size 128.\n"); fprintf(stderr, "Supported combinations:\n"); @@ -572,11 +572,11 @@ static void on_no_fattn_vec_case(const int D) { fprintf(stderr, " - K == q8_0, V == q8_0, 8.50 BPV\n"); fprintf(stderr, " - K == f16, V == f16, 16.00 BPV\n"); fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else { fprintf(stderr, "Unsupported KV type combination for head_size 256.\n"); fprintf(stderr, "Only f16 is supported.\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu index c6c35134..1b2fd500 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f16.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu @@ -287,7 +287,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); } break; default: { - GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128."); + GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128."); } break; } } diff --git a/ggml/src/ggml-cuda/fattn-tile-f32.cu b/ggml/src/ggml-cuda/fattn-tile-f32.cu index 15e22f49..f3e68dbf 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f32.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu @@ -284,7 +284,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); } break; default: { - GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128."); + GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128."); } break; } } diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 38d30b21..29f608b0 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -38,7 +38,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } else { @@ -63,7 +63,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g // ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst); // break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -86,7 +86,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } return; @@ -114,7 +114,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } return; @@ -141,7 +141,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu index 55af195f..4c370323 100644 --- a/ggml/src/ggml-cuda/getrows.cu +++ b/ggml/src/ggml-cuda/getrows.cu @@ -171,8 +171,7 @@ void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { break; default: // TODO: k-quants - fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); - GGML_ASSERT(false); + GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); break; } } diff --git a/ggml/src/ggml-cuda/iqk_mmvq.cu b/ggml/src/ggml-cuda/iqk_mmvq.cu index 29721cdd..c567ad1a 100644 --- a/ggml/src/ggml-cuda/iqk_mmvq.cu +++ b/ggml/src/ggml-cuda/iqk_mmvq.cu @@ -466,7 +466,6 @@ __device__ __forceinline__ float vec_dot_iq3_k_q8_1( const int * q8; int sumi[4] = {0, 0, 0, 0}; - uint16_t v1, v2; int v; for (int i = 0; i < 2; ++i) { uint32_t vl = ql[2*i+0] | (ql[2*i+1] << 16); diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index 84f6387e..78d70cd7 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -84,7 +84,7 @@ void ggml_cuda_op_mul_mat_q( mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index f08a4758..e8a95744 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -75,7 +75,7 @@ static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) { case GGML_TYPE_IQ4_NL: return MMQ_Q8_1_DS_LAYOUT_D4; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -2898,7 +2898,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda break; default: fprintf(stderr, "mmq_x_best=%d\n", mmq_x_best); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 9eb3fa4f..2586ab7e 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -165,7 +165,7 @@ static void mul_mat_vec_q_cuda( rows_per_cuda_block = 2; break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -199,7 +199,7 @@ static void mul_mat_vec_q_cuda( mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -454,7 +454,7 @@ void ggml_cuda_op_mul_mat_vec_q( mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index 30866d51..133e219f 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -142,8 +142,7 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i } } -static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) { - static const float eps = 1e-6f; +static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const float eps, const int group_size, const int ne_elements, cudaStream_t stream) { if (group_size < 1024) { const dim3 block_dims(WARP_SIZE, 1, 1); group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps); @@ -196,8 +195,12 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) GGML_ASSERT( dst->type == GGML_TYPE_F32); int num_groups = dst->op_params[0]; + + float eps; + memcpy(&eps, dst->op_params + 1, sizeof(float)); + int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); - group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], group_size, ggml_nelements(src0), stream); + group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], eps, group_size, ggml_nelements(src0), stream); } void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index aa7f1eff..45408ce8 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -163,7 +163,7 @@ void quantize_mmq_q8_1_cuda( <<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index 596fb7c1..99ec1dd9 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -251,7 +251,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { attn_factor, corr_dims, freq_factors, stream ); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } else { if (src0->type == GGML_TYPE_F32) { @@ -265,7 +265,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { attn_factor, corr_dims, freq_factors, stream ); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } } diff --git a/ggml/src/ggml-cuda/vendors/cuda.h b/ggml/src/ggml-cuda/vendors/cuda.h new file mode 100644 index 00000000..db9f6a16 --- /dev/null +++ b/ggml/src/ggml-cuda/vendors/cuda.h @@ -0,0 +1,14 @@ +#pragma once + +#include <cuda_runtime.h> +#include <cuda.h> +#include <cublas_v2.h> +#include <cuda_fp16.h> + +#if CUDART_VERSION < 11020 +#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED +#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH +#define CUBLAS_COMPUTE_16F CUDA_R_16F +#define CUBLAS_COMPUTE_32F CUDA_R_32F +#define cublasComputeType_t cudaDataType_t +#endif // CUDART_VERSION < 11020 diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h new file mode 100644 index 00000000..d0c37725 --- /dev/null +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -0,0 +1,177 @@ +#pragma once + +#include <hip/hip_runtime.h> +#include <hipblas/hipblas.h> +#include <hip/hip_fp16.h> +#ifdef __HIP_PLATFORM_AMD__ +// for rocblas_initialize() +#include "rocblas/rocblas.h" +#endif // __HIP_PLATFORM_AMD__ +#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH 0 +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F +#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) +#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 +#define cublasCreate hipblasCreate +#define cublasDestroy hipblasDestroy +#define cublasGemmEx hipblasGemmEx +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx +#define cublasHandle_t hipblasHandle_t +#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS +#define cublasSetStream hipblasSetStream +#define cublasSgemm hipblasSgemm +#define cublasStatus_t hipblasStatus_t +#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6 +#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer +#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess +#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaError_t hipError_t +#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled +#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize +#define cudaEvent_t hipEvent_t +#define cudaEventDestroy hipEventDestroy +#define cudaFree hipFree +#define cudaFreeHost hipHostFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaHostRegister hipHostRegister +#define cudaHostRegisterPortable hipHostRegisterPortable +#define cudaHostRegisterReadOnly hipHostRegisterReadOnly +#define cudaHostUnregister hipHostUnregister +#define cudaLaunchHostFunc hipLaunchHostFunc +#define cudaMalloc hipMalloc +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) +#define cudaMemcpy hipMemcpy +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyPeerAsync hipMemcpyPeerAsync +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyKind hipMemcpyKind +#define cudaMemset hipMemset +#define cudaMemsetAsync hipMemsetAsync +#define cudaMemGetInfo hipMemGetInfo +#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize +#define cudaSetDevice hipSetDevice +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamDestroy hipStreamDestroy +#define cudaStreamFireAndForget hipStreamFireAndForget +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamPerThread hipStreamPerThread +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#define __trap() do { abort(); __builtin_unreachable(); } while(0) +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED +#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED +#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE +#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH +#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR +#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED +#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED + +#define __CUDA_ARCH__ 1300 + +#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \ + defined(__gfx1150__) || defined(__gfx1151__) +#define RDNA3 +#endif + +#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \ + defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__) +#define RDNA2 +#endif + +#if defined(__gfx1010__) || defined(__gfx1012__) +#define RDNA1 +#endif + +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); +typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); +static __device__ __forceinline__ int __vsubss4(const int a, const int b) { + const int8x4_t va = reinterpret_cast<const int8x4_t&>(a); + const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b); +#if __has_builtin(__builtin_elementwise_sub_sat) + const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); + return reinterpret_cast<const int &>(c); +#else + int8x4_t c; + int16_t tmp; +#pragma unroll + for (int i = 0; i < 4; i++) { + tmp = va[i] - vb[i]; + if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max(); + if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min(); + c[i] = tmp; + } + return reinterpret_cast<int &>(c); +#endif // __has_builtin(__builtin_elementwise_sub_sat) +} + +static __device__ __forceinline__ int __vsub4(const int a, const int b) { + return __vsubss4(a, b); +} + +static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) { + const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a); + const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b); + unsigned int c; + uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c); +#pragma unroll + for (int i = 0; i < 4; ++i) { + vc[i] = va[i] == vb[i] ? 0xff : 0x00; + } + return c; +} + +static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) { + const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a); + const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b); + unsigned int c; + uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c); +#pragma unroll + for (int i = 0; i < 4; ++i) { + vc[i] = va[i] == vb[i] ? 0x00 : 0xff; + } + return c; +} + +#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000 +// __shfl_xor() for half2 was added in ROCm 5.6 +static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) { + typedef union half2_b32 { + half2 val; + int b32; + } half2_b32_t; + half2_b32_t tmp; + tmp.val = var; + tmp.b32 = __shfl_xor(tmp.b32, laneMask, width); + return tmp.val; +} +#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000 diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h new file mode 100644 index 00000000..e50a103a --- /dev/null +++ b/ggml/src/ggml-cuda/vendors/musa.h @@ -0,0 +1,171 @@ +#pragma once + +#include <musa_runtime.h> +#include <musa.h> +#include <mublas.h> +#include <musa_fp16.h> +#define CUBLAS_COMPUTE_16F CUDA_R_16F +#define CUBLAS_COMPUTE_32F CUDA_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F +#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N MUBLAS_OP_N +#define CUBLAS_OP_T MUBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT +#define CUDA_R_16F MUSA_R_16F +#define CUDA_R_32F MUSA_R_32F +#define cublasComputeType_t cudaDataType_t +#define cublasCreate mublasCreate +#define cublasDestroy mublasDestroy +#define cublasGemmEx mublasGemmEx +#define cublasGemmBatchedEx mublasGemmBatchedEx +#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx +#define cublasHandle_t mublasHandle_t +#define cublasSetMathMode mublasSetMathMode +#define cublasSetStream mublasSetStream +#define cublasSgemm mublasSgemm +#define cublasStatus_t mublasStatus_t +#define cublasGetStatusString mublasStatus_to_string +#define cudaDataType_t musaDataType_t +#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer +#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess +#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess +#define cudaDeviceProp musaDeviceProp +#define cudaDeviceSynchronize musaDeviceSynchronize +#define cudaError_t musaError_t +#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled +#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled +#define cudaEventCreateWithFlags musaEventCreateWithFlags +#define cudaEventDisableTiming musaEventDisableTiming +#define cudaEventRecord musaEventRecord +#define cudaEventSynchronize musaEventSynchronize +#define cudaEvent_t musaEvent_t +#define cudaEventDestroy musaEventDestroy +#define cudaFree musaFree +#define cudaFreeHost musaFreeHost +#define cudaGetDevice musaGetDevice +#define cudaGetDeviceCount musaGetDeviceCount +#define cudaGetDeviceProperties musaGetDeviceProperties +#define cudaGetErrorString musaGetErrorString +#define cudaGetLastError musaGetLastError +#define cudaHostRegister musaHostRegister +#define cudaHostRegisterPortable musaHostRegisterPortable +#define cudaHostRegisterReadOnly musaHostRegisterReadOnly +#define cudaHostUnregister musaHostUnregister +#define cudaLaunchHostFunc musaLaunchHostFunc +#define cudaMalloc musaMalloc +#define cudaMallocHost musaMallocHost +#define cudaMemcpy musaMemcpy +#define cudaMemcpyAsync musaMemcpyAsync +#define cudaMemcpyPeerAsync musaMemcpyPeerAsync +#define cudaMemcpy2DAsync musaMemcpy2DAsync +#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost +#define cudaMemcpyHostToDevice musaMemcpyHostToDevice +#define cudaMemcpyKind musaMemcpyKind +#define cudaMemset musaMemset +#define cudaMemsetAsync musaMemsetAsync +#define cudaMemGetInfo musaMemGetInfo +#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize +#define cudaSetDevice musaSetDevice +#define cudaStreamCreateWithFlags musaStreamCreateWithFlags +#define cudaStreamDestroy musaStreamDestroy +#define cudaStreamFireAndForget musaStreamFireAndForget +#define cudaStreamNonBlocking musaStreamNonBlocking +#define cudaStreamPerThread musaStreamPerThread +#define cudaStreamSynchronize musaStreamSynchronize +#define cudaStreamWaitEvent musaStreamWaitEvent +#define cudaStream_t musaStream_t +#define cudaSuccess musaSuccess + +// Additional mappings for MUSA virtual memory pool +#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED +#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE +#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED +#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED +#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE +#define CUdevice MUdevice +#define CUdeviceptr MUdeviceptr +#define CUmemAccessDesc MUmemAccessDesc +#define CUmemAllocationProp MUmemAllocationProp +#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle +#define cuDeviceGet muDeviceGet +#define cuDeviceGetAttribute muDeviceGetAttribute +#define cuMemAddressFree muMemAddressFree +#define cuMemAddressReserve muMemAddressReserve +#define cuMemCreate muMemCreate +#define cuMemGetAllocationGranularity muMemGetAllocationGranularity +#define cuMemMap muMemMap +#define cuMemRelease muMemRelease +#define cuMemSetAccess muMemSetAccess +#define cuMemUnmap muMemUnmap +#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize +#define cudaFuncSetAttribute musaFuncSetAttribute +#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms +#define make_cudaExtent make_musaExtent +#define make_cudaPitchedPtr make_musaPitchedPtr + +// Additional mappings for MUSA graphs +#define CUDA_SUCCESS MUSA_SUCCESS +#define CUresult MUresult +#define cuGetErrorString muGetErrorString +#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure +#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction +#define cudaGraphDestroy musaGraphDestroy +#define cudaGraphExecDestroy musaGraphExecDestroy +#define cudaGraphExec_t musaGraphExec_t +#define cudaGraphExecUpdate musaGraphExecUpdate +#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult +#define cudaGraphGetNodes musaGraphGetNodes +#define cudaGraphInstantiate musaGraphInstantiate +#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams +#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams +#define cudaGraphLaunch musaGraphLaunch +#define cudaGraphNodeGetType musaGraphNodeGetType +#define cudaGraphNode_t musaGraphNode_t +#define cudaGraphNodeType musaGraphNodeType +#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel +#define cudaGraph_t musaGraph_t +#define cudaKernelNodeParams musaKernelNodeParams +#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed +#define cudaStreamEndCapture musaStreamEndCapture + +// XXX: Clang builtins mapping +#define __vsub4 __vsub4_musa +#define __vcmpeq4 __vcmpeq4_musa +#define __vcmpne4 __vcmpne4_musa + +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); + +static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) { + return __vsubss4(a, b); +} + +static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) { + const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a); + const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b); + unsigned int c; + uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c); +#pragma unroll + for (int i = 0; i < 4; ++i) { + vc[i] = va[i] == vb[i] ? 0xff : 0x00; + } + return c; +} + +static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) { + const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a); + const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b); + unsigned int c; + uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c); +#pragma unroll + for (int i = 0; i < 4; ++i) { + vc[i] = va[i] == vb[i] ? 0x00 : 0xff; + } + return c; +} diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index a2c8dbec..190af081 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -80,8 +80,9 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { /** * Converts float32 to brain16. * - * This function is binary identical to AMD Zen4 VCVTNEPS2BF16. - * Subnormals shall be flushed to zero, and NANs will be quiet. + * This is binary identical with Google Brain float conversion. + * Floats shall round to nearest even, and NANs shall be quiet. + * Subnormals aren't flushed to zero, except perhaps when used. * This code should vectorize nicely if using modern compilers. */ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { @@ -95,10 +96,6 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { h.bits = (u.i >> 16) | 64; /* force to quiet */ return h; } - if (!(u.i & 0x7f800000)) { /* subnormal */ - h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */ - return h; - } h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; return h; } @@ -146,6 +143,7 @@ extern "C" { #if defined(__ARM_FEATURE_SVE) #include <arm_sve.h> +#include <sys/prctl.h> #endif // 16-bit float @@ -634,21 +632,121 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) #endif -#define GGML_HASHTABLE_FULL ((size_t)-1) -#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2) +// bitset + +static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated"); +#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8) +#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1) + +static size_t ggml_bitset_size(size_t n) { + return (n + BITSET_MASK) >> BITSET_SHR; +} + +static inline bool ggml_bitset_get(const ggml_bitset_t * bitset, size_t i) { + return !!(bitset[i >> BITSET_SHR] & (1u << (i & BITSET_MASK))); +} + +static inline void ggml_bitset_set(ggml_bitset_t * bitset, size_t i) { + bitset[i >> BITSET_SHR] |= (1u << (i & BITSET_MASK)); +} + +static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) { + bitset[i >> BITSET_SHR] &= ~(1u << (i & BITSET_MASK)); +} + +// hash set + +#define GGML_HASHSET_FULL ((size_t)-1) +#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2) struct ggml_hash_set ggml_hash_set_new(size_t size); +void ggml_hash_set_free(struct ggml_hash_set * hash_set); -bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key); +// returns the minimum size for a hash set that can hold min_sz elements +size_t ggml_hash_size(size_t min_sz); -// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted -size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key); +// remove all elements from the hash set +void ggml_hash_set_reset(struct ggml_hash_set * hash_set); -// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full -size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key); +// returns true if key is in the hash set +static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key); + +// returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted +static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key); + +// returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full +static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key); // return index, asserts if table is full -size_t ggml_hash_find_or_insert( struct ggml_hash_set hash_set, struct ggml_tensor * key); +static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key); + +// hash function for ggml_tensor +static inline size_t ggml_hash(const struct ggml_tensor * p) { + // the last 4 bits are always zero due to alignment + return (size_t)(uintptr_t)p >> 4; +} + +static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) { + size_t h = ggml_hash(key) % hash_set->size; + + // linear probing + size_t i = h; + while (ggml_bitset_get(hash_set->used, i) && hash_set->keys[i] != key) { + i = (i + 1) % hash_set->size; + if (i == h) { + // visited all hash table entries -> not found + return GGML_HASHSET_FULL; + } + } + return i; +} + +static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) { + size_t i = ggml_hash_find(hash_set, key); + return i != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, i); +} + +static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) { + size_t h = ggml_hash(key) % hash_set->size; + + // linear probing + size_t i = h; + do { + if (!ggml_bitset_get(hash_set->used, i)) { + ggml_bitset_set(hash_set->used, i); + hash_set->keys[i] = key; + return i; + } + if (hash_set->keys[i] == key) { + return GGML_HASHSET_ALREADY_EXISTS; + } + i = (i + 1) % hash_set->size; + } while (i != h); + + // visited all hash table entries -> not found + GGML_ABORT("fatal error"); +} + +static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) { + size_t h = ggml_hash(key) % hash_set->size; + + // linear probing + size_t i = h; + do { + if (!ggml_bitset_get(hash_set->used, i)) { + ggml_bitset_set(hash_set->used, i); + hash_set->keys[i] = key; + return i; + } + if (hash_set->keys[i] == key) { + return i; + } + i = (i + 1) % hash_set->size; + } while (i != h); + + // visited all hash table entries -> not found + GGML_ABORT("fatal error"); +} #ifdef __cplusplus } diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp index ed5f2e34..41ac63fa 100644 --- a/ggml/src/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute.cpp @@ -566,7 +566,7 @@ uint32_t safe_divide(uint32_t a, uint32_t b) { } if ((a % b) != 0) { fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b); - GGML_ASSERT(!"safe_divide result would've had remainder"); + GGML_ABORT("safe_divide result would've had remainder"); } return a / b; } @@ -1460,7 +1460,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml if (!ggml_vk_supports_op(dst)) { fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); - GGML_ASSERT(!"unsupported op"); + GGML_ABORT("unsupported op"); } const int32_t ne00 = src0 ? src0->ne[0] : 0; @@ -1562,7 +1562,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml default: { fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } } break; @@ -1745,7 +1745,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml continue; not_implemented: {} fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - //GGML_ASSERT(false); + //GGML_ABORT("fatal error"); } // Evaluate sequence diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 7d592c22..292f9ac7 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -260,7 +260,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_COUNT }; -struct ggml_metal_context { +struct ggml_backend_metal_context { int n_cb; id<MTLDevice> device; @@ -274,6 +274,10 @@ struct ggml_metal_context { bool support_simdgroup_mm; bool should_capture_next_compute; + + // abort ggml_metal_graph_compute if callback returns true + ggml_abort_callback abort_callback; + void * abort_callback_data; }; // MSL code @@ -339,7 +343,7 @@ static void * ggml_metal_host_malloc(size_t n) { return data; } -static struct ggml_metal_context * ggml_metal_init(int n_cb) { +static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_LOG_INFO("%s: allocating\n", __func__); #if TARGET_OS_OSX && !GGML_METAL_NDEBUG @@ -356,7 +360,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); // Configure context - struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context)); ctx->device = device; ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); ctx->queue = [ctx->device newCommandQueue]; @@ -761,7 +765,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { return ctx; } -static void ggml_metal_free(struct ggml_metal_context * ctx) { +static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { GGML_METAL_LOG_INFO("%s: deallocating\n", __func__); for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) { @@ -827,7 +831,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs return nil; } -static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) { +static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx, const struct ggml_tensor * op) { for (size_t i = 0, n = 3; i < n; ++i) { if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) { return false; @@ -938,7 +942,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const } static enum ggml_status ggml_metal_graph_compute( - struct ggml_metal_context * ctx, + struct ggml_backend_metal_context * ctx, struct ggml_cgraph * gf) { @autoreleasepool { @@ -962,7 +966,7 @@ static enum ggml_status ggml_metal_graph_compute( NSError * error = nil; if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) { GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]); - GGML_ASSERT(!"capture failed"); + GGML_ABORT("capture failed"); } } @@ -971,8 +975,11 @@ static enum ggml_status ggml_metal_graph_compute( id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; command_buffer_builder[cb_idx] = command_buffer; - // enqueue the command buffers in order to specify their execution order - [command_buffer enqueue]; + // always enqueue the first two command buffers + // enqueue all of the command buffers if we don't need to abort + if (cb_idx < 2 || ctx->abort_callback == NULL) { + [command_buffer enqueue]; + } } const id<MTLCommandBuffer> *command_buffers = command_buffer_builder; @@ -1024,7 +1031,7 @@ static enum ggml_status ggml_metal_graph_compute( if (!ggml_metal_supports_op(ctx, dst)) { GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); - GGML_ASSERT(!"unsupported op"); + GGML_ABORT("unsupported op"); } if (should_capture) { @@ -1207,7 +1214,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break; case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break; case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); } bcast_row = true; @@ -1216,7 +1223,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break; case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break; case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); } } @@ -1270,7 +1277,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break; case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break; case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); } [encoder setComputePipelineState:pipeline]; @@ -1526,7 +1533,7 @@ static enum ggml_status ggml_metal_graph_compute( default: { GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_OP_SQR: @@ -1756,7 +1763,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_IQ4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_K_F32 ].pipeline; break; case GGML_TYPE_IQ5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ5_K_F32 ].pipeline; break; case GGML_TYPE_IQ6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ6_K_F32 ].pipeline; break; - default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); + default: GGML_ABORT("MUL MAT-MAT not implemented"); } [encoder setComputePipelineState:pipeline]; @@ -1977,7 +1984,7 @@ static enum ggml_status ggml_metal_graph_compute( default: { GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); - GGML_ASSERT(false && "not implemented"); + GGML_ABORT("not implemented"); } }; @@ -2117,7 +2124,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_IQ4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_K_F32 ].pipeline; break; case GGML_TYPE_IQ5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ5_K_F32 ].pipeline; break; case GGML_TYPE_IQ6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ6_K_F32 ].pipeline; break; - default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); + default: GGML_ABORT("MUL_MAT_ID not implemented"); } [encoder setComputePipelineState:pipeline]; @@ -2332,7 +2339,7 @@ static enum ggml_status ggml_metal_graph_compute( default: { GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t); - GGML_ASSERT(false && "not implemented"); + GGML_ABORT("not implemented"); } }; @@ -2443,7 +2450,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_IQ5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ5_K ].pipeline; break; case GGML_TYPE_IQ6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ6_K ].pipeline; break; case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; - default: GGML_ASSERT(false && "not implemented"); + default: GGML_ABORT("not implemented"); } [encoder setComputePipelineState:pipeline]; @@ -2494,10 +2501,8 @@ static enum ggml_status ggml_metal_graph_compute( GGML_ASSERT(ne00 % 4 == 0); GGML_ASSERT(ggml_is_contiguous(src0)); - //float eps; - //memcpy(&eps, dst->op_params, sizeof(float)); - - const float eps = 1e-6f; // TODO: temporarily hardcoded + float eps; + memcpy(&eps, dst->op_params + 1, sizeof(float)); const int32_t n_groups = ((int32_t *) dst->op_params)[0]; @@ -2581,13 +2586,13 @@ static enum ggml_status ggml_metal_graph_compute( switch (src0->type) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); }; } else { switch (src0->type) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); }; } @@ -2664,7 +2669,7 @@ static enum ggml_status ggml_metal_graph_compute( switch (dst->type) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); }; [encoder setComputePipelineState:pipeline]; @@ -2821,7 +2826,7 @@ static enum ggml_status ggml_metal_graph_compute( switch (order) { case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break; case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); }; [encoder setComputePipelineState:pipeline]; @@ -2910,7 +2915,7 @@ static enum ggml_status ggml_metal_graph_compute( { GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); GGML_METAL_LOG_ERROR("add template specialization for this size\n"); - GGML_ASSERT(false && "add template specialization for this size"); + GGML_ABORT("add template specialization for this size"); } } } else { @@ -2923,7 +2928,7 @@ static enum ggml_status ggml_metal_graph_compute( { GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); GGML_METAL_LOG_ERROR("add template specialization for this size\n"); - GGML_ASSERT(false && "add template specialization for this size"); + GGML_ABORT("add template specialization for this size"); } } } @@ -3044,7 +3049,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break; case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL].pipeline; break; - default: GGML_ASSERT(false && "not implemented"); + default: GGML_ABORT("not implemented"); }; } break; case GGML_TYPE_F16: @@ -3052,10 +3057,10 @@ static enum ggml_status ggml_metal_graph_compute( switch (dstt) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break; - default: GGML_ASSERT(false && "not implemented"); + default: GGML_ABORT("not implemented"); }; } break; - default: GGML_ASSERT(false && "not implemented"); + default: GGML_ABORT("not implemented"); } [encoder setComputePipelineState:pipeline]; @@ -3083,7 +3088,7 @@ static enum ggml_status ggml_metal_graph_compute( default: { GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -3094,7 +3099,9 @@ static enum ggml_status ggml_metal_graph_compute( [encoder endEncoding]; - [command_buffer commit]; + if (cb_idx < 2 || ctx->abort_callback == NULL) { + [command_buffer commit]; + } }); // Wait for completion and check status of each command buffer @@ -3114,6 +3121,23 @@ static enum ggml_status ggml_metal_graph_compute( return GGML_STATUS_FAILED; } + + id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? command_buffers[i + 1] : nil); + if (!next_buffer) { + continue; + } + + bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued); + if (next_queued) { + continue; + } + + if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) { + GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i); + return GGML_STATUS_ABORTED; + } + + [next_buffer commit]; } if (should_capture) { @@ -3417,7 +3441,7 @@ GGML_CALL static const char * ggml_backend_metal_name(ggml_backend_t backend) { } GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) { - struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; ggml_metal_free(ctx); free(backend); } @@ -3429,13 +3453,13 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe } GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; + struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context; return ggml_metal_graph_compute(metal_ctx, cgraph); } GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { - struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; + struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context; return ggml_metal_supports_op(metal_ctx, op); } @@ -3480,9 +3504,9 @@ static ggml_guid_t ggml_backend_metal_guid(void) { } ggml_backend_t ggml_backend_metal_init(void) { - struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS); - + struct ggml_backend_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS); if (ctx == NULL) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__); return NULL; } @@ -3504,15 +3528,24 @@ bool ggml_backend_is_metal(ggml_backend_t backend) { void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { GGML_ASSERT(ggml_backend_is_metal(backend)); - struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); } +void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data) { + GGML_ASSERT(ggml_backend_is_metal(backend)); + + struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; + + ctx->abort_callback = abort_callback; + ctx->abort_callback_data = user_data; +} + bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) { GGML_ASSERT(ggml_backend_is_metal(backend)); - struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)]; } @@ -3520,7 +3553,7 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) { void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) { GGML_ASSERT(ggml_backend_is_metal(backend)); - struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; ctx->should_capture_next_compute = true; } diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 99bd682f..41362dee 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3952,7 +3952,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r float sumf = 0; #if defined(__ARM_FEATURE_SVE) - if (svcntb() == QK8_0) { + if (ggml_sve_cnt_b == QK8_0) { const svbool_t ptrueh = svptrue_pat_b8(SV_VL16); const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh); @@ -4324,15 +4324,18 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); #endif for (; ib < nb; ++ib) { - int sumi = 0; + int sumi0 = 0; + int sumi1 = 0; for (int j = 0; j < qk/2; ++j) { const int v0 = (x[ib].qs[j] & 0x0F) - 8; const int v1 = (x[ib].qs[j] >> 4) - 8; - sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]); + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); } + int sumi = sumi0 + sumi1; sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); } @@ -4613,15 +4616,18 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r sumf = hsum_float_8(acc) + summs; #endif for (; ib < nb; ++ib) { - int sumi = 0; + int sumi0 = 0; + int sumi1 = 0; for (int j = 0; j < qk/2; ++j) { const int v0 = (x[ib].qs[j] & 0x0F); const int v1 = (x[ib].qs[j] >> 4); - sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]); + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); } + int sumi = sumi0 + sumi1; sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); } @@ -4967,18 +4973,21 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); - int sumi = 0; + int sumi0 = 0; + int sumi1 = 0; for (int j = 0; j < qk/2; ++j) { const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - const int32_t x0 = ((x[ib].qs[j] & 0x0F) | xh_0) - 16; - const int32_t x1 = ((x[ib].qs[j] >> 4) | xh_1) - 16; + const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); + const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); - sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]); + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); } + int sumi = sumi0 + sumi1; sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; } @@ -5343,7 +5352,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); - int sumi = 0; + int sumi0 = 0; + int sumi1 = 0; for (int j = 0; j < qk/2; ++j) { const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; @@ -5352,9 +5362,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; - sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]); + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); } + int sumi = sumi0 + sumi1; sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); } @@ -5445,7 +5457,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r float sumf = 0; #if defined(__ARM_FEATURE_SVE) - if (svcntb() == QK8_0) { + if (ggml_sve_cnt_b == QK8_0) { svfloat32_t sumv0 = svdup_n_f32(0.0f); svfloat32_t sumv1 = svdup_n_f32(0.0f); @@ -6591,22 +6603,22 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r // compute mask for subtraction vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl); - vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl); + vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl); m <<= 1; vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl); - vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl); + vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl); m <<= 1; vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl); - vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl); + vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl); m <<= 1; vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl); - vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl); + vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl); m <<= 1; // load Q8 and take product with Q3 @@ -7862,13 +7874,13 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl)); vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl); - vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl); + vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl); m <<= 1; vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl)); vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl); vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl); - vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl); + vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl); m <<= 1; vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl); @@ -12851,7 +12863,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict printf("Oops: found point %u not on grid:", u); for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]); printf("\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } q2[2*ib+0] |= ((uint32_t) grid_index << 8*k); q2[2*ib+1] |= (block_signs[k] << 7*k); @@ -13030,7 +13042,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v printf("Oops: found point %u not on grid:", u); for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]); printf("\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } q2[2*ib+k] = grid_index | (block_signs[k] << 9); } @@ -13473,7 +13485,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v printf("Oops: found point %u not on grid:", u); for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]); printf("\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (grid_size == 256) { q3[8*ib+k] = grid_index; @@ -13686,7 +13698,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo printf("Oops: found point %u not on grid:", u); for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]); printf("\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } qs[k] = grid_index & 255; qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8)); @@ -14662,7 +14674,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy printf("Oops: found point %u not on grid:", u); for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]); printf("\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int i8 = 2*ib + k; y[ibl].qs[i8] = grid_index & 255; @@ -14782,7 +14794,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte } if (nbytes % ggml_type_size(type) != 0) { - fprintf(stderr, "%s: invalid size %zu for type %d\n", __func__, nbytes, type); + fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type)); return false; } diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 91063633..775aa875 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -146,6 +146,10 @@ void iq2xs_free_impl(enum ggml_type type); void iq3xs_init_impl(int grid_size); void iq3xs_free_impl(int grid_size); +#if defined(__ARM_FEATURE_SVE) +extern int ggml_sve_cnt_b; +#endif + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp index b01ad267..7757615f 100644 --- a/ggml/src/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc.cpp @@ -197,6 +197,10 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por fprintf(stderr, "Failed to set SO_REUSEADDR\n"); return nullptr; } + if (inet_addr(host) == INADDR_NONE) { + fprintf(stderr, "Invalid host address: %s\n", host); + return nullptr; + } struct sockaddr_in serv_addr; serv_addr.sin_family = AF_INET; serv_addr.sin_addr.s_addr = inet_addr(host); @@ -879,6 +883,14 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp if (result->buffer && buffers.find(result->buffer) == buffers.end()) { return nullptr; } + + // require that the tensor data does not go beyond the buffer end + uint64_t tensor_size = (uint64_t) ggml_nbytes(result); + uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer); + uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer); + GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow + GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size); + result->op = (ggml_op) tensor->op; for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { result->op_params[i] = tensor->op_params[i]; @@ -898,7 +910,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) { const rpc_tensor * in_tensor = (const rpc_tensor *)input.data(); uint64_t offset; memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset)); - size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset); + const size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset); struct ggml_init_params params { /*.mem_size =*/ ggml_tensor_overhead(), @@ -913,6 +925,17 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) { return false; } GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size); + + // sanitize tensor->data + { + const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer); + const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer); + + if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) { + GGML_ABORT("[%s] tensor->data out of bounds\n", __func__); + } + } + const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset); ggml_backend_tensor_set(tensor, data, offset, size); ggml_free(ctx); @@ -943,6 +966,17 @@ bool rpc_server::get_tensor(const std::vector<uint8_t> & input, std::vector<uint return false; } GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size); + + // sanitize tensor->data + { + const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer); + const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer); + + if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) { + GGML_ABORT("[%s] tensor->data out of bounds\n", __func__); + } + } + // output serialization format: | data (size bytes) | output.resize(size, 0); ggml_backend_tensor_get(tensor, output.data(), offset, size); diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 36518ff9..d8eb86c2 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -1723,7 +1723,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, }); }); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -2075,8 +2075,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst, // GGML_SYCL_DEBUG("current device index %d\n", id); src_ptr = (char *) extra->data_device[id]; } else { - // GGML_SYCL_DEBUG("GGML_ASSERT(false)\n"); - GGML_ASSERT(false); + // GGML_SYCL_DEBUG("GGML_ABORT("fatal error")\n"); + GGML_ABORT("fatal error"); } char * dst_ptr = (char *) dst; @@ -2163,7 +2163,7 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_te default: // TODO: k-quants fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -2192,7 +2192,7 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t } else { fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -2476,7 +2476,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC case GGML_TYPE_Q6_K: return 64; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -3101,7 +3101,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten SYCL_CHECK(ggml_sycl_cpy_tensor_2d( src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (convert_src1_to_q8_1 && !src1_is_contiguous) { @@ -3896,7 +3896,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr } else { fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } (void) dst; @@ -3981,6 +3981,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens ggml_sycl_func_t func; switch (tensor->op) { + case GGML_OP_CONV_TRANSPOSE_1D: + func = ggml_sycl_op_conv_transpose_1d; + break; case GGML_OP_REPEAT: func = ggml_sycl_repeat; break; @@ -4105,6 +4108,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens case GGML_OP_ARGSORT: func = ggml_sycl_argsort; break; + case GGML_OP_TIMESTEP_EMBEDDING: + func = ggml_sycl_op_timestep_embedding; + break; default: return false; } @@ -5090,6 +5096,15 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) { switch (op->op) { + case GGML_OP_CONV_TRANSPOSE_1D: + { + ggml_type src0_type = op->src[0]->type; + ggml_type src1_type = op->src[1]->type; + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { + return true; + } + return false; + } break; case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { case GGML_UNARY_OP_GELU: @@ -5213,6 +5228,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons case GGML_OP_UPSCALE: case GGML_OP_PAD: case GGML_OP_LEAKY_RELU: + case GGML_OP_TIMESTEP_EMBEDDING: return true; default: return false; diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index 067181de..58dd9c9a 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -15,6 +15,7 @@ #include "concat.hpp" #include "common.hpp" +#include "conv.hpp" #include "convert.hpp" #include "dequantize.hpp" #include "dmmv.hpp" @@ -23,5 +24,6 @@ #include "rope.hpp" #include "norm.hpp" #include "softmax.hpp" +#include "tsembd.hpp" #endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 397bd98d..86d8b40e 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -100,7 +100,7 @@ static void crash() { const char* msg) { fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg); fprintf(stderr, " in function %s at %s:%d\n", func, file, line); - GGML_ASSERT(!"SYCL error"); + GGML_ABORT("SYCL error"); } #define SYCL_CHECK(err) \ diff --git a/ggml/src/ggml-sycl/conv.cpp b/ggml/src/ggml-sycl/conv.cpp new file mode 100644 index 00000000..bc4ab1dd --- /dev/null +++ b/ggml/src/ggml-sycl/conv.cpp @@ -0,0 +1,99 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#include "conv.hpp" + +static void conv_transpose_1d_kernel( + const int s0, const int output_size, + const int src0_ne0, const int src0_ne1, const int src0_ne2, + const int src1_ne0, const int dst_ne0, + const float * src0, const float * src1, float * dst, + const sycl::nd_item<3> &item_ct1) { + int global_index = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (global_index >= output_size) { + return; + } + + int out_index = global_index / dst_ne0; + + float accumulator = 0; + + for (int c = 0; c < src0_ne2; c++) { + int idx = global_index % dst_ne0; + + int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0); + int input_offset = src1_ne0 * c; + + for (int i = 0; i < src1_ne0; i++) { + if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) { + continue; + } + int weight_idx = idx - i*s0; + + float kernel_weight = src0[kernel_offset + weight_idx]; + float input_value = src1[input_offset+i]; + + accumulator += kernel_weight * input_value; + } + } + dst[global_index] = accumulator; +} + +static void conv_transpose_1d_f32_f32_sycl( + const int s0, const int output_size, + const int src0_ne0, const int src0_ne1, const int src0_ne2, + const int src1_ne0, const int dst_ne0, + const float *src0, const float *src1, float *dst, + const queue_ptr& stream) { + + const int num_blocks = (output_size + SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE; + const sycl::range<3> block_dims(1, 1, SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE); + const sycl::range<3> block_nums(1, 1, num_blocks); + stream->parallel_for( + sycl::nd_range<3>( + block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + conv_transpose_1d_kernel( + s0, output_size, + src0_ne0, src0_ne1, src0_ne2, + src1_ne0, dst_ne0, + src0, src1, dst, item_ct1); + }); +} + +void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst) { + const float * src0_d = (const float *)src0->data; + const float * src1_d = (const float *)src1->data; + + float * dst_d = (float *)dst->data; + dpct::queue_ptr stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + + const int32_t * opts = (const int32_t *)dst->op_params; + + const int s0 = opts[0]; + + const int64_t output_size = ggml_nelements(dst); + + conv_transpose_1d_f32_f32_sycl(s0, output_size, + src0->ne[0], src0->ne[1], src0->ne[2], + src1->ne[0], dst->ne[0], + src0_d, src1_d, dst_d, stream); +} + diff --git a/ggml/src/ggml-sycl/conv.hpp b/ggml/src/ggml-sycl/conv.hpp new file mode 100644 index 00000000..eb20730f --- /dev/null +++ b/ggml/src/ggml-sycl/conv.hpp @@ -0,0 +1,21 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_CONV_HPP +#define GGML_SYCL_CONV_HPP + +#include "common.hpp" + +void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst); + +#endif // GGML_SYCL_CONV_HPP diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp index 70a94fc1..ae45630e 100644 --- a/ggml/src/ggml-sycl/dmmv.cpp +++ b/ggml/src/ggml-sycl/dmmv.cpp @@ -1011,7 +1011,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec( break; default: printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index 4aaa76bf..fe4a8f74 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -874,7 +874,7 @@ namespace dpct inline std::string get_preferred_gpu_platform_name() { std::string result; - std::string filter = "level-zero"; + std::string filter = ""; char* env = getenv("ONEAPI_DEVICE_SELECTOR"); if (env) { if (std::strstr(env, "level_zero")) { @@ -892,11 +892,24 @@ namespace dpct else { throw std::runtime_error("invalid device filter: " + std::string(env)); } + } else { + auto default_device = sycl::device(sycl::default_selector_v); + auto default_platform_name = default_device.get_platform().get_info<sycl::info::platform::name>(); + + if (std::strstr(default_platform_name.c_str(), "Level-Zero") || default_device.is_cpu()) { + filter = "level-zero"; + } + else if (std::strstr(default_platform_name.c_str(), "CUDA")) { + filter = "cuda"; + } + else if (std::strstr(default_platform_name.c_str(), "HIP")) { + filter = "hip"; + } } - auto plaform_list = sycl::platform::get_platforms(); + auto platform_list = sycl::platform::get_platforms(); - for (const auto& platform : plaform_list) { + for (const auto& platform : platform_list) { auto devices = platform.get_devices(); auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) { return d.is_gpu(); @@ -975,7 +988,7 @@ namespace dpct if (backend == "opencl:cpu") return 4; if (backend == "opencl:acc") return 5; printf("convert_backend_index: can't handle backend=%s\n", backend.c_str()); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } static bool compare_backend(std::string &backend1, std::string &backend2) { return convert_backend_index(backend1) < convert_backend_index(backend2); diff --git a/ggml/src/ggml-sycl/mmq.cpp b/ggml/src/ggml-sycl/mmq.cpp index 3107ba91..e952533d 100644 --- a/ggml/src/ggml-sycl/mmq.cpp +++ b/ggml/src/ggml-sycl/mmq.cpp @@ -1799,7 +1799,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q4_0_PASCAL; nwarps = NWARPS_Q4_0_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -1914,7 +1914,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q4_1_PASCAL; nwarps = NWARPS_Q4_1_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2029,7 +2029,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q5_0_PASCAL; nwarps = NWARPS_Q5_0_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2144,7 +2144,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q5_1_PASCAL; nwarps = NWARPS_Q5_1_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2259,7 +2259,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q8_0_PASCAL; nwarps = NWARPS_Q8_0_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2374,7 +2374,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q2_K_PASCAL; nwarps = NWARPS_Q2_K_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2497,7 +2497,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q3_K_PASCAL; nwarps = NWARPS_Q3_K_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2625,7 +2625,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q4_K_PASCAL; nwarps = NWARPS_Q4_K_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2746,7 +2746,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q5_K_PASCAL; nwarps = NWARPS_Q5_K_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2867,7 +2867,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q6_K_PASCAL; nwarps = NWARPS_Q6_K_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -3016,7 +3016,7 @@ void ggml_sycl_op_mul_mat_q( ggml_mul_mat_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index 3fbc4dd6..1b96925e 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -902,7 +902,7 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 1>( + mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>( vx, vy, dst, ncols, nrows, item_ct1); }); }); @@ -1017,7 +1017,7 @@ void ggml_sycl_op_mul_mat_vec_q( mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index cccf87d0..b3159b9d 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -225,9 +225,8 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols, } static void group_norm_f32_sycl(const float* x, float* dst, - const int num_groups, const int group_size, + const int num_groups, const float eps, const int group_size, const int ne_elements, queue_ptr stream, int device) { - static const float eps = 1e-6f; if (group_size < 1024) { const sycl::range<3> block_dims(1, 1, WARP_SIZE); stream->submit([&](sycl::handler& cgh) { @@ -343,8 +342,12 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* GGML_ASSERT(dst->type == GGML_TYPE_F32); int num_groups = dst->op_params[0]; + + float eps; + memcpy(&eps, dst->op_params + 1, sizeof(float)); + int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); - group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream, ctx.device); + group_norm_f32_sycl(src0_dd, dst_dd, num_groups, eps, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream, ctx.device); (void)src1; (void)dst; diff --git a/ggml/src/ggml-sycl/presets.hpp b/ggml/src/ggml-sycl/presets.hpp index 15ddcac1..340ab8e9 100644 --- a/ggml/src/ggml-sycl/presets.hpp +++ b/ggml/src/ggml-sycl/presets.hpp @@ -41,6 +41,8 @@ #define SYCL_ACC_BLOCK_SIZE 256 #define SYCL_IM2COL_BLOCK_SIZE 256 #define SYCL_POOL2D_BLOCK_SIZE 256 +#define SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE 256 +#define SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE 256 // dmmv = dequantize_mul_mat_vec #ifndef GGML_SYCL_DMMV_X diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp index 6f507941..c7545bcc 100644 --- a/ggml/src/ggml-sycl/rope.cpp +++ b/ggml/src/ggml-sycl/rope.cpp @@ -251,7 +251,7 @@ void ggml_sycl_op_rope( attn_factor, corr_dims, freq_factors, main_stream ); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } else { if (src0->type == GGML_TYPE_F32) { @@ -265,7 +265,7 @@ void ggml_sycl_op_rope( attn_factor, corr_dims, freq_factors, main_stream ); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-sycl/tsembd.cpp b/ggml/src/ggml-sycl/tsembd.cpp new file mode 100644 index 00000000..d5c227cd --- /dev/null +++ b/ggml/src/ggml-sycl/tsembd.cpp @@ -0,0 +1,71 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#include "tsembd.hpp" + +static void timestep_embedding_f32( + const float * timesteps, float * dst, const int nb1, + const int dim, const int max_period, const sycl::nd_item<3> &item_ct1) { + // item_ct1.get_group(1)(blockIDx.y): idx of timesteps->ne[0] + // item_ct1.get_group(2) (blockIDx.x): idx of ((dim + 1) / 2) / BLOCK_SIZE + int i = item_ct1.get_group(1); + int j = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2); + float * embed_data = (float *)((char *)dst + i*nb1); + + if (dim % 2 != 0 && j == ((dim + 1) / 2)) { + embed_data[dim] = 0.f; + } + + int half = dim / 2; + if (j >= half) { + return; + } + + float timestep = timesteps[i]; + float freq = (float)sycl::native::exp(-(sycl::log((float)max_period)) * j / half); + float arg = timestep * freq; + embed_data[j] = sycl::cos(arg); + embed_data[j + half] = sycl::sin(arg); +} + +static void timestep_embedding_f32_sycl( + const float * x, float * dst, const int ne00, const int nb1, + const int dim, const int max_period, const queue_ptr& stream) { + // As the kernel returns when thread.idx is larger than dim/2, the half_ceil does not need to pad + int half_ceil = dim / 2; + int num_blocks = (half_ceil + SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE; + sycl::range<3> block_dims(1, 1, SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE); + sycl::range<3> gridDim(1, ne00, num_blocks); + stream->parallel_for( + sycl::nd_range<3>( + gridDim * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + timestep_embedding_f32( + x, dst, nb1, dim, max_period, item_ct1 + ); + }); +} + +void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor * dst) { + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + dpct::queue_ptr stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const int dim = dst->op_params[0]; + const int max_period = dst->op_params[1]; + + timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream); +} diff --git a/ggml/src/ggml-sycl/tsembd.hpp b/ggml/src/ggml-sycl/tsembd.hpp new file mode 100644 index 00000000..ff854c33 --- /dev/null +++ b/ggml/src/ggml-sycl/tsembd.hpp @@ -0,0 +1,21 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_TSEMBD_HPP +#define GGML_SYCL_TSEMBD_HPP + +#include "common.hpp" + +void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor * dst); + +#endif // GGML_SYCL_TSEMBD_HPP diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 6bcd81a7..86732837 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -177,24 +177,33 @@ struct vk_device_struct { vk_pipeline pipeline_mul_mat_vec_nc_f16_f32; vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT]; vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT]; + vk_pipeline pipeline_add_f32, pipeline_add_f16_f32_f16; vk_pipeline pipeline_mul_f32; vk_pipeline pipeline_div_f32; - vk_pipeline pipeline_add_f32; + vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32; + vk_pipeline pipeline_upscale_f32; vk_pipeline pipeline_scale_f32; vk_pipeline pipeline_sqr_f32; vk_pipeline pipeline_clamp_f32; + vk_pipeline pipeline_pad_f32; vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16; vk_pipeline pipeline_norm_f32; + vk_pipeline pipeline_group_norm_f32; vk_pipeline pipeline_rms_norm_f32; vk_pipeline pipeline_gelu_f32; + vk_pipeline pipeline_gelu_quick_f32; vk_pipeline pipeline_silu_f32; vk_pipeline pipeline_relu_f32; + vk_pipeline pipeline_leaky_relu_f32; + vk_pipeline pipeline_tanh_f32; vk_pipeline pipeline_diag_mask_inf_f32; vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16; vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16; vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16; vk_pipeline pipeline_argsort_f32; vk_pipeline pipeline_sum_rows_f32; + vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16; + vk_pipeline pipeline_timestep_embedding_f32; std::vector<vk_pipeline_ref> pipelines; @@ -236,8 +245,8 @@ struct vk_device_struct { }; struct vk_buffer_struct { - vk::Buffer buffer; - vk::DeviceMemory device_memory; + vk::Buffer buffer = VK_NULL_HANDLE; + vk::DeviceMemory device_memory = VK_NULL_HANDLE; vk::MemoryPropertyFlags memory_property_flags; void * ptr; size_t size = 0; @@ -259,6 +268,10 @@ struct vk_subbuffer { vk_buffer buffer; uint64_t offset; uint64_t size; + + operator vk::DescriptorBufferInfo() const { + return { buffer->buffer, offset, size }; + } }; struct vk_semaphore { @@ -320,7 +333,7 @@ struct vk_op_binary_push_constants { uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23; uint32_t d_offset; - float param1; float param2; + float param1; float param2; int32_t param3; }; struct vk_op_diag_mask_push_constants { @@ -358,6 +371,25 @@ struct vk_op_argsort_push_constants { int32_t order; }; +struct vk_op_im2col_push_constants { + uint32_t batch_offset; uint32_t offset_delta; + uint32_t IC; + uint32_t IW; uint32_t IH; + uint32_t OW; uint32_t OH; + uint32_t KW; uint32_t KH; + uint32_t pelements; + uint32_t CHW; + int32_t s0; int32_t s1; + int32_t p0; int32_t p1; + int32_t d0; int32_t d1; +}; + +struct vk_op_timestep_embedding_push_constants { + uint32_t nb1; + uint32_t dim; + uint32_t max_period; +}; + // Allow pre-recording command buffers struct vk_staging_memcpy { vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {} @@ -367,28 +399,32 @@ struct vk_staging_memcpy { size_t n; }; -struct vk_context { - size_t idx; +struct vk_op_upscale_push_constants { + uint32_t ne; uint32_t d_offset; + uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; + uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; + float sf0; float sf1; float sf2; float sf3; +}; +struct vk_context_struct { vk_submission * s; std::vector<vk_sequence> seqs; - ggml_tensor * exit_tensor; + int exit_tensor_idx; std::vector<vk_staging_memcpy> in_memcpys; std::vector<vk_staging_memcpy> out_memcpys; vk_queue * q; }; +typedef std::shared_ptr<vk_context_struct> vk_context; +typedef std::weak_ptr<vk_context_struct> vk_context_ref; struct ggml_tensor_extra_gpu { - size_t ctx_idx; - vk_buffer_ref buffer_gpu; uint64_t offset; void reset() { - ctx_idx = 0; buffer_gpu.reset(); offset = 0; } @@ -459,8 +495,10 @@ struct ggml_backend_vk_context { vk_buffer buffer_pool[MAX_VK_BUFFERS]; - vk_context * compute_ctx; - vk_context * transfer_ctx; + vk_context_ref compute_ctx; + vk_context_ref transfer_ctx; + + std::vector<vk_context_ref> tensor_ctxs; }; #ifdef GGML_VULKAN_MEMORY_DEBUG @@ -510,12 +548,12 @@ static vk_instance_t vk_instance; static size_t vk_skip_checks; static size_t vk_output_tensor; -static void ggml_vk_print_tensor(ggml_backend * ctx, const ggml_tensor * tensor, const char * name); -static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * tensor); -static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * tensor); +static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name); +static void ggml_vk_check_results_0(ggml_tensor * tensor); +static void ggml_vk_check_results_1(ggml_tensor * tensor); #endif -typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend); @@ -708,11 +746,11 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s return s; } -static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) { - VK_LOG_DEBUG("ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")"); +static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) { if (ctx->seqs.empty()) { return; } + VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")"); std::vector<std::vector<uint64_t>> tl_wait_vals; std::vector<std::vector<uint64_t>> tl_signal_vals; @@ -844,21 +882,17 @@ static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_ q.stage_flags = stage_flags; } -static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) { - VK_LOG_DEBUG("ggml_vk_create_context()"); - ctx->gc.contexts.emplace_back(); - vk_context * result = &ctx->gc.contexts[ctx->gc.contexts.size() - 1]; - memset((void *) result, 0, sizeof(vk_context)); - result->idx = ctx->gc.contexts.size() - 1; +static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) { + vk_context result = std::make_shared<vk_context_struct>(); + VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")"); + ctx->gc.contexts.emplace_back(result); result->q = &q; return result; } -static vk_context * ggml_vk_create_temporary_context(vk_queue& q) { - VK_LOG_DEBUG("ggml_vk_create_temporary_context()"); - vk_context * result = new vk_context; - memset((void *) result, 0, sizeof(vk_context)); - result->idx = 0; +static vk_context ggml_vk_create_temporary_context(vk_queue& q) { + vk_context result = std::make_shared<vk_context_struct>(); + VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")"); result->q = &q; return result; } @@ -915,6 +949,10 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) { VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")"); + if (size > device->max_memory_allocation_size) { + throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit"); + } + std::lock_guard<std::mutex> guard(device->mutex); vk_buffer buf = std::make_shared<vk_buffer_struct>(); @@ -1027,21 +1065,22 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) { return { buf, 0, VK_WHOLE_SIZE }; } -static void ggml_vk_sync_buffers(vk_context * ctx) { +static void ggml_vk_sync_buffers(vk_context& ctx) { VK_LOG_DEBUG("ggml_vk_sync_buffers()"); - const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } }; - ctx->s->buffer.pipelineBarrier( ctx->q->stage_flags, ctx->q->stage_flags, {}, - mem_barriers, + { { + {vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite}, + {vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite} + } }, {}, {} ); } -static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& events) { +static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events) { VK_LOG_DEBUG("ggml_vk_wait_events()"); if (events.empty()) { return; @@ -1598,6 +1637,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); @@ -1605,20 +1645,31 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_div_f32, "div_f32", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_concat_f32, "concat_f32", concat_f32_len, concat_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_upscale_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_gelu_quick_f32, "gelu_quick_f32", gelu_quick_f32_len, gelu_quick_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_relu_f32, "relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_tanh_f32, "tanh_f32", tanh_f32_len, tanh_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1); @@ -1634,6 +1685,11 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + + ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1); } static vk_device ggml_vk_get_device(size_t idx) { @@ -1961,7 +2017,7 @@ void ggml_vk_instance_init() { // Make sure at least one device exists if (devices.empty()) { std::cerr << "ggml_vulkan: Error: No devices found." << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } // Default to using all dedicated GPUs @@ -2057,9 +2113,9 @@ void ggml_vk_instance_init() { } static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { - GGML_ASSERT(idx < vk_instance.device_indices.size()); VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << idx << ")"); ggml_vk_instance_init(); + GGML_ASSERT(idx < vk_instance.device_indices.size()); ctx->name = GGML_VK_NAME + std::to_string(idx); @@ -2077,9 +2133,6 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { ctx->staging_size = 0; ctx->staging_offset = 0; - ctx->compute_ctx = nullptr; - ctx->transfer_ctx = nullptr; - #ifdef GGML_VULKAN_CHECK_RESULTS const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS"); vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks)); @@ -2112,7 +2165,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type } static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) { - VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline()"); + VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline(" << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")"); if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { return ctx->device->pipeline_matmul_f32; } @@ -2126,7 +2179,9 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte return ctx->device->pipeline_matmul_f16; } - GGML_ASSERT(src1_type == GGML_TYPE_F32); + if (src1_type != GGML_TYPE_F32) { + return nullptr; + } switch (src0_type) { case GGML_TYPE_Q4_0: @@ -2370,28 +2425,23 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo return s; } -static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, std::vector<vk_subbuffer>&& buffers, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) { + + +static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) { const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]); const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]); const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]); VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {"; - for (auto& buffer : buffers) { - std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), "; + for (auto& buffer : descriptor_buffer_infos) { + std::cerr << "(" << buffer << ", " << buffer.offset << ", " << buffer.size << "), "; } std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))"); - std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos; - std::vector<vk::WriteDescriptorSet> write_descriptor_sets; GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size()); - GGML_ASSERT(buffers.size() == pipeline->parameter_count); - vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++]; - for (uint32_t i = 0; i < pipeline->parameter_count; i++) { - descriptor_buffer_infos.push_back({buffers[i].buffer->buffer, buffers[i].offset, buffers[i].size}); - } - for (uint32_t i = 0; i < pipeline->parameter_count; i++) { - write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]}); - } + GGML_ASSERT(descriptor_buffer_infos.size() == pipeline->parameter_count); - ctx->device->device.updateDescriptorSets(write_descriptor_sets, {}); + vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++]; + vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() }; + ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {}); subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants); subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline); @@ -2410,7 +2460,7 @@ static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> w s.signal_semaphores = std::move(signal_semaphores); } -static void ggml_vk_ctx_end(vk_context * ctx) { +static void ggml_vk_ctx_end(vk_context& ctx) { VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")"); if (ctx->s == nullptr) { return; @@ -2420,7 +2470,7 @@ static void ggml_vk_ctx_end(vk_context * ctx) { ctx->s = nullptr; } -static void ggml_vk_ctx_begin(vk_device& device, vk_context * subctx) { +static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) { VK_LOG_DEBUG("ggml_vk_ctx_begin(" << device->name << ")"); if (subctx->s != nullptr) { ggml_vk_ctx_end(subctx); @@ -2453,13 +2503,13 @@ static void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size) { } } -static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) { +static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context& subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) { VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")"); GGML_ASSERT(!ggml_is_contiguous(tensor)); // Buffer is already mapped if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } // Check if src is pinned memory vk_buffer buf; @@ -2527,7 +2577,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont staging = ctx->device->sync_staging; staging_offset = 0; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -2558,12 +2608,12 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont } } -static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) { +static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) { VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")"); // Buffer is already mapped if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } // Check if src is pinned memory vk_buffer buf = nullptr; @@ -2602,7 +2652,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s staging_buffer = dst->device->sync_staging; staging_offset = 0; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -2623,7 +2673,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s } } -static void ggml_vk_buffer_write_async(vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) { +static void ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) { VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")"); return ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, size, size, 1, staging_buffer, staging_offset, sync_staging); } @@ -2638,7 +2688,7 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width); } } else { - vk_context * subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue); + vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue); ggml_vk_ctx_begin(dst->device, subctx); ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, nullptr, 0, true); ggml_vk_ctx_end(subctx); @@ -2650,8 +2700,6 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * ggml_vk_submit(subctx, dst->device->fence); VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences"); dst->device->device.resetFences({ dst->device->fence }); - - delete subctx; } } @@ -2660,12 +2708,14 @@ static void ggml_vk_buffer_write(vk_buffer& dst, size_t offset, const void * src ggml_vk_buffer_write_2d(dst, offset, src, 0, size, 1); } -static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) { +static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) { VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")"); GGML_ASSERT(width > 0); GGML_ASSERT(height > 0); GGML_ASSERT(src != nullptr); + // TODO: staging_offset is not used + // Check if dst is pinned memory vk_buffer buf = nullptr; size_t buf_offset; @@ -2704,7 +2754,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, si staging_buffer = src->device->sync_staging; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -2714,18 +2764,18 @@ static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, si deferred_memcpy(dst, staging_buffer->ptr, copy_size, &subctx->out_memcpys); } -static void ggml_vk_buffer_read_async(vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t size, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) { +static void ggml_vk_buffer_read_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t size, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) { return ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, size, size, size, 1, staging_buffer, staging_offset, sync_staging); } static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size) { - VK_LOG_DEBUG("ggml_vk_buffer_read(" << offset << ", " << size << ")"); + VK_LOG_DEBUG("ggml_vk_buffer_read(" << src->buffer << ", " << offset << ", " << size << ")"); if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); memcpy(dst, (uint8_t *) src->ptr + offset, size); } else { - vk_context * subctx = ggml_vk_create_temporary_context(src->device->transfer_queue); + vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue); ggml_vk_ctx_begin(src->device, subctx); ggml_vk_buffer_read_async(subctx, src, offset, dst, size, nullptr, 0, true); ggml_vk_ctx_end(subctx); @@ -2737,12 +2787,10 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_ for (auto& cpy : subctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - - delete subctx; } } -static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { +static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")"); // Make sure both buffers are on same device GGML_ASSERT(src->device == dst->device); @@ -2756,15 +2804,13 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr if (src->device == dst->device) { VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")"); // Copy within the device - vk_context * subctx = ggml_vk_create_temporary_context(src->device->transfer_queue); + vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue); ggml_vk_ctx_begin(src->device, subctx); ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size); ggml_vk_ctx_end(subctx); ggml_vk_submit(subctx, src->device->fence); VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences"); src->device->device.resetFences({ src->device->fence }); - - delete subctx; } else { VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")"); // Copy device to device @@ -2783,7 +2829,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) { VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")"); - vk_context * subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue); + vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue); ggml_vk_ctx_begin(dst->device, subctx); subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c); ggml_vk_ctx_end(subctx); @@ -2791,8 +2837,6 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz ggml_vk_submit(subctx, dst->device->fence); VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences"); dst->device->device.resetFences({ dst->device->fence }); - - delete subctx; } static uint32_t ggml_vk_guess_split_k(int m, int n, int k) { @@ -2855,7 +2899,7 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct } static void ggml_vk_matmul( - ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, + ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d, @@ -2879,7 +2923,7 @@ static void ggml_vk_matmul( } static void ggml_vk_matmul_id( - ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, + ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& ids, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d, @@ -2913,10 +2957,10 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_ } std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } -static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) { +static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) { VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")"); const int tensor_type_size = ggml_type_size(tensor->type); @@ -2934,7 +2978,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 }); } -static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); @@ -3079,7 +3123,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su } else if (qx_needs_dequant) { const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); + ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); } if (y_non_contig) { ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); @@ -3107,7 +3151,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su ); // NOLINT } -static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); @@ -3268,11 +3312,11 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context }; ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, - { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} }, + { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} }, sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z }); } -static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32(" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); @@ -3340,10 +3384,10 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c // compute const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); } -static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); @@ -3415,10 +3459,11 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con // compute const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, + { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); } -static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")"); if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) { ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst); @@ -3431,7 +3476,7 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, } } -static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { +static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; @@ -3499,7 +3544,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig; if (mmp == nullptr) { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } // Not implemented @@ -3590,7 +3635,8 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * } else if (qx_needs_dequant) { const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); + ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, + { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); } if (y_non_contig) { ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); @@ -3618,7 +3664,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * ); // NOLINT } -static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { +static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; @@ -3790,11 +3836,12 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte }; ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, - { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23}, { d_ids, ids_buf_offset, ids_sz } }, + { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, + vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } }, sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z }); } -static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { +static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")"); if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) { ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst); @@ -3803,8 +3850,8 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subct } } -static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - // guaranteed to be an integer due to the check in ggml_can_repeat +static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + VK_LOG_DEBUG("ggml_vk_op_repeat(" << src0 << ", " << src1 << ", " << dst << ")"); const uint64_t ne0 = dst->ne[0]; const uint64_t ne1 = dst->ne[1]; const uint64_t ne2 = dst->ne[2]; @@ -3825,6 +3872,7 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx const uint64_t nb02 = src0->nb[2]; const uint64_t nb03 = src0->nb[3]; + // guaranteed to be an integer due to the check in ggml_can_repeat const uint64_t nr0 = ne0/ne00; const uint64_t nr1 = ne1/ne01; const uint64_t nr2 = ne2/ne02; @@ -3852,8 +3900,8 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx for (uint64_t k1 = 0; k1 < ne01; k1++) { for (uint64_t i0 = 0; i0 < nr0; i0++) { copies.push_back({ - src_offset + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0, - dst_offset + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01, + src_offset + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01, + dst_offset + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0, ne00*nb0, }); } @@ -3874,11 +3922,6 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) { switch (op) { - case GGML_OP_ADD: - if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_add_f32; - } - return nullptr; case GGML_OP_GET_ROWS: GGML_ASSERT(src1->type == GGML_TYPE_I32); if (dst->type == GGML_TYPE_F16) { @@ -3888,6 +3931,14 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_get_rows_f32[src0->type]; } return nullptr; + case GGML_OP_ADD: + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_add_f32; + } + if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { + return ctx->device->pipeline_add_f16_f32_f16; + } + return nullptr; case GGML_OP_MUL: if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_mul_f32; @@ -3898,6 +3949,22 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_div_f32; } return nullptr; + case GGML_OP_CONCAT: + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_concat_f32; + } + if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { + return ctx->device->pipeline_concat_f16; + } + if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) { + return ctx->device->pipeline_concat_i32; + } + return nullptr; + case GGML_OP_UPSCALE: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_upscale_f32; + } + return nullptr; case GGML_OP_SCALE: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_scale_f32; @@ -3913,6 +3980,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_clamp_f32; } return nullptr; + case GGML_OP_PAD: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_pad_f32; + } + return nullptr; case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: @@ -3922,6 +3994,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_norm_f32; } return nullptr; + case GGML_OP_GROUP_NORM: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_group_norm_f32; + } + return nullptr; case GGML_OP_RMS_NORM: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_rms_norm_f32; @@ -3939,11 +4016,21 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_gelu_f32; } break; + case GGML_UNARY_OP_GELU_QUICK: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_gelu_quick_f32; + } + break; case GGML_UNARY_OP_RELU: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_relu_f32; } break; + case GGML_UNARY_OP_TANH: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_tanh_f32; + } + break; default: break; } @@ -3995,6 +4082,24 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_sum_rows_f32; } return nullptr; + case GGML_OP_IM2COL: + if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_im2col_f32; + } + if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { + return ctx->device->pipeline_im2col_f32_f16; + } + return nullptr; + case GGML_OP_TIMESTEP_EMBEDDING: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_timestep_embedding_f32; + } + return nullptr; + case GGML_OP_LEAKY_RELU: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_leaky_relu_f32; + } + return nullptr; default: return nullptr; } @@ -4018,9 +4123,12 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { case GGML_OP_ADD: case GGML_OP_MUL: case GGML_OP_DIV: + case GGML_OP_CONCAT: + case GGML_OP_UPSCALE: case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_CLAMP: + case GGML_OP_PAD: return true; default: return false; @@ -4028,7 +4136,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { } template<typename PC> -static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) { +static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) { VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; if (src1 != nullptr) { std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; @@ -4078,7 +4186,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c std::cerr << " and " << ggml_type_name(src1->type); } std::cerr << " to " << ggml_type_name(dst->type) << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } op_func(ctx, subctx, src0, src1, dst); @@ -4124,7 +4232,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c vk_buffer d_D = extra->buffer_gpu.lock(); // Workaround for tiny tensor inputs on ROPE - if (use_src1 && y_sz > d_D->size) { + if (op == GGML_OP_ROPE && use_src1 && y_sz > d_D->size) { y_sz = VK_WHOLE_SIZE; } @@ -4173,13 +4281,26 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c if (op_supports_incontiguous || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) { ggml_pipeline_allocate_descriptor_sets(ctx->device, pipeline, 1); - switch (dst->op) { + switch (op) { case GGML_OP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_SOFT_MAX: case GGML_OP_SUM_ROWS: - elements = { (uint32_t)ggml_nrows(src0), 1, 1 }; - break; + { + const uint32_t nr = ggml_nrows(src0); + if (nr > 262144) { + elements = { 512, 512, CEIL_DIV(nr, 262144) }; + } else if (nr > 512) { + elements = { 512, CEIL_DIV(nr, 512), 1 }; + } else { + elements = { nr, 1, 1 }; + } + } break; + case GGML_OP_GROUP_NORM: + { + const uint32_t num_groups = dst->op_params[0]; + elements = { num_groups * (uint32_t)src0->ne[3], 1, 1 }; + } break; case GGML_OP_DIAG_MASK_INF: case GGML_OP_ROPE: elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 }; @@ -4190,6 +4311,49 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c case GGML_OP_ARGSORT: elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 }; break; + case GGML_OP_IM2COL: + { + const bool is_2D = dst->op_params[6] == 1; + + const uint32_t IC = src1->ne[is_2D ? 2 : 1]; + + const uint32_t KH = is_2D ? src0->ne[1] : 1; + const uint32_t KW = src0->ne[0]; + + const uint32_t OH = is_2D ? dst->ne[2] : 1; + const uint32_t OW = dst->ne[1]; + + const uint32_t batch = src1->ne[3]; + + elements = { OW * KW * KH, OH, batch * IC }; + } break; + case GGML_OP_TIMESTEP_EMBEDDING: + { + const uint32_t dim = dst->op_params[0]; + uint32_t half_ceil = (dim + 1) / 2; + elements = { half_ceil, (uint32_t)src0->ne[0], 1 }; + } break; + case GGML_OP_ADD: + case GGML_OP_DIV: + case GGML_OP_MUL: + case GGML_OP_SCALE: + case GGML_OP_SQR: + case GGML_OP_CLAMP: + case GGML_OP_PAD: + case GGML_OP_CPY: + case GGML_OP_CONCAT: + case GGML_OP_UPSCALE: + case GGML_OP_UNARY: + { + const uint32_t ne = ggml_nelements(dst); + if (ne > 262144) { + elements = { 512, 512, CEIL_DIV(ne, 262144) }; + } else if (ne > 512) { + elements = { 512, CEIL_DIV(ne, 512), 1 }; + } else { + elements = { ne, 1, 1 }; + } + } break; default: elements = { (uint32_t)ggml_nelements(src0), 1, 1 }; break; @@ -4216,31 +4380,35 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c if (use_src1) { subbuf_y = { d_Y, y_buf_offset, y_sz }; } else { - subbuf_y = { d_X, 0, d_X->size }; + subbuf_y = { d_X, 0, x_sz }; } ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else if (op == GGML_OP_ROPE) { // Empty src2 is possible in rope, but the shader needs a buffer vk_subbuffer subbuf_z; if (use_src2) { subbuf_z = { d_Z, z_buf_offset, z_sz }; } else { - subbuf_z = { d_X, 0, d_X->size }; + subbuf_z = { d_X, 0, x_sz }; } ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + } else if (op == GGML_OP_IM2COL) { + // im2col uses only src1 and dst buffers + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else if (use_src2) { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else if (use_src1) { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } } else { GGML_ASSERT(op != GGML_OP_SOFT_MAX); @@ -4249,8 +4417,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c ggml_pipeline_allocate_descriptor_sets(ctx->device, pipeline, ne02 * ne03); - switch (dst->op) { + switch (op) { case GGML_OP_NORM: + case GGML_OP_GROUP_NORM: case GGML_OP_RMS_NORM: elements = { (uint32_t)ne01, 1, 1 }; break; @@ -4276,21 +4445,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c if (use_src1) { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_Y, y_buf_offset + y_offset, y_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset + x_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset + y_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); } else { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset + x_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); } } } } } -static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); +static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, {}); } -static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4301,11 +4470,11 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, - 0.0f, 0.0f, + 0.0f, 0.0f, 0, }); } -static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4316,11 +4485,11 @@ static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, cons (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, - 0.0f, 0.0f, + 0.0f, 0.0f, 0, }); } -static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4331,11 +4500,11 @@ static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, cons (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, - 0.0f, 0.0f, + 0.0f, 0.0f, 0, }); } -static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4346,11 +4515,44 @@ static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context * subctx, cons (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, - 0.0f, 0.0f, + 0.0f, 0.0f, 0, }); } -static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + int * op_params = (int *)dst->op_params; + + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = ggml_type_size(src1->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONCAT, { + (uint32_t)ggml_nelements(dst), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, op_params[0], + }); +} + +static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + const uint32_t src0_type_size = ggml_type_size(src0->type); + + const float sf0 = (float)dst->ne[0] / src0->ne[0]; + const float sf1 = (float)dst->ne[1] / src0->ne[1]; + const float sf2 = (float)dst->ne[2] / src0->ne[2]; + const float sf3 = (float)dst->ne[3] / src0->ne[3]; + + ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, { + (uint32_t)ggml_nelements(dst), 0, + (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3], + sf0, sf1, sf2, sf3, + }); +} + +static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4364,7 +4566,7 @@ static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, co }); } -static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4377,7 +4579,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, cons }); } -static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4391,7 +4593,20 @@ static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, co }); } -static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, { + (uint32_t)ggml_nelements(dst), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, + }); +} + +static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4406,27 +4621,37 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons }); } -static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }); } -static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + int * op_params = (int *)dst->op_params; + + uint32_t num_groups = op_params[0]; + uint32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); + static const float eps = 1e-6f; + + ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f }); +} + +static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }); } -static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }); } -static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { int32_t * op_params = (int32_t *)dst->op_params; ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }); } -static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; float scale = op_params[0]; @@ -4451,7 +4676,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, }); } -static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { +static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { const int n_dims = ((int32_t *) dst->op_params)[1]; // const int mode = ((int32_t *) dst->op_params)[2]; // const int n_ctx = ((int32_t *) dst->op_params)[3]; @@ -4475,7 +4700,7 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con }); } -static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { int32_t * op_params = (int32_t *)dst->op_params; uint32_t ncols = src0->ne[0]; @@ -4494,10 +4719,59 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, }); } -static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, { (uint32_t)src0->ne[0], 0, 0.0f, 0.0f }); } +static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const int32_t s0 = dst->op_params[0]; + const int32_t s1 = dst->op_params[1]; + const int32_t p0 = dst->op_params[2]; + const int32_t p1 = dst->op_params[3]; + const int32_t d0 = dst->op_params[4]; + const int32_t d1 = dst->op_params[5]; + + const bool is_2D = dst->op_params[6] == 1; + + const uint32_t IC = src1->ne[is_2D ? 2 : 1]; + const uint32_t IH = is_2D ? src1->ne[1] : 1; + const uint32_t IW = src1->ne[0]; + + const uint32_t KH = is_2D ? src0->ne[1] : 1; + const uint32_t KW = src0->ne[0]; + + const uint32_t OH = is_2D ? dst->ne[2] : 1; + const uint32_t OW = dst->ne[1]; + + const uint32_t offset_delta = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32 + const uint32_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32 + + const uint32_t pelements = OW * KW * KH; + + ggml_vk_op_f32<vk_op_im2col_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_IM2COL, { + batch_offset, offset_delta, + IC, IW, IH, OW, OH, KW, KH, + pelements, + IC * KH * KW, + s0, s1, p0, p1, d0, d1, + }); +} + +static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + const uint32_t dim = dst->op_params[0]; + const uint32_t max_period = dst->op_params[1]; + const uint32_t nb1 = dst->nb[1] / ggml_type_size(dst->type); + + ggml_vk_op_f32<vk_op_timestep_embedding_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_TIMESTEP_EMBEDDING, { + nb1, dim, max_period, + }); +} + +static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + const float * op_params = (const float *)dst->op_params; + ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }); +} + #ifdef GGML_VULKAN_RUN_TESTS static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) { if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) { @@ -4521,7 +4795,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0 } else if (type == GGML_TYPE_F16) { val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0)); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } fprintf(stderr, "% 7.2f ", val); } else { @@ -4555,7 +4829,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t p = ctx->device->pipeline_matmul_f16->a_s; shname = "F16_ALIGNED_S"; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } else if (shader_size == 1) { if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) { @@ -4571,7 +4845,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t p = ctx->device->pipeline_matmul_f16->a_m; shname = "F16_ALIGNED_M"; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } else if (shader_size == 2) { if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) { @@ -4587,7 +4861,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t p = ctx->device->pipeline_matmul_f16->a_l; shname = "F16_ALIGNED_L"; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } else { GGML_ASSERT(0); @@ -4668,7 +4942,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t } else if (std::is_same<ggml_fp16_t, X_TYPE>()) { x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } for (size_t i = 0; i < y_ne; i++) { @@ -4679,14 +4953,14 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t // y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch); ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch); - vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); for (size_t i = 0; i < num_it; i++) { ggml_vk_ctx_begin(ctx->device, subctx); ggml_vk_matmul( @@ -4727,14 +5001,14 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t } else if (std::is_same<ggml_fp16_t, X_TYPE>()) { src0_type = GGML_TYPE_F16; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (std::is_same<float, Y_TYPE>()) { src1_type = GGML_TYPE_F32; } else if (std::is_same<ggml_fp16_t, Y_TYPE>()) { src1_type = GGML_TYPE_F16; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, src0_type, k, m, batch); @@ -4841,7 +5115,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1 } else if (tensor->type == GGML_TYPE_F16) { val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0])); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } fprintf(stderr, "% 7.2f ", val); } else { @@ -4894,7 +5168,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_ ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); - vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); ggml_vk_ctx_begin(ctx->device, subctx); const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne }; ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1}); @@ -5027,7 +5301,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); ggml_vk_buffer_write(y_buf, 0, y, y_sz); - vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); for (size_t i = 0; i < num_it; i++) { ggml_vk_ctx_begin(ctx->device, subctx); ggml_vk_matmul( @@ -5175,7 +5449,7 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm const bool y_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32 && !y_non_contig; - bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false; + bool mmp = (use_src0 && use_src1 && (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID)) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false; const bool qx_needs_dequant = use_src0 && (!mmp || x_non_contig); const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig); @@ -5211,24 +5485,33 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_CLAMP: + case GGML_OP_PAD: case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: case GGML_OP_MUL: case GGML_OP_DIV: + case GGML_OP_CONCAT: + case GGML_OP_UPSCALE: case GGML_OP_NORM: + case GGML_OP_GROUP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_ROPE: case GGML_OP_ARGSORT: case GGML_OP_SUM_ROWS: + case GGML_OP_IM2COL: + case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_LEAKY_RELU: break; case GGML_OP_UNARY: switch (ggml_get_unary_op(node)) { case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_TANH: break; default: return; @@ -5236,6 +5519,13 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm break; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: + if ( + x_sz > ctx->device->max_memory_allocation_size || + y_sz > ctx->device->max_memory_allocation_size || + d_sz > ctx->device->max_memory_allocation_size || + split_k_size > ctx->device->max_memory_allocation_size) { + GGML_ABORT("Requested preallocation size is too large"); + } if (ctx->prealloc_size_x < x_sz) { ctx->prealloc_size_x = x_sz; } @@ -5391,7 +5681,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { std::cerr << std::endl; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); #endif if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) { @@ -5430,7 +5720,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { } } -static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){ +static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node){ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra; if (ggml_is_empty(node) || extra == nullptr) { @@ -5457,7 +5747,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod switch (ggml_get_unary_op(node)) { case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_TANH: break; default: return; @@ -5468,13 +5760,17 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_ADD: case GGML_OP_MUL: case GGML_OP_DIV: + case GGML_OP_CONCAT: + case GGML_OP_UPSCALE: case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_CLAMP: + case GGML_OP_PAD: case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: case GGML_OP_NORM: + case GGML_OP_GROUP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: @@ -5483,109 +5779,147 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_MUL_MAT_ID: case GGML_OP_ARGSORT: case GGML_OP_SUM_ROWS: + case GGML_OP_IM2COL: + case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_LEAKY_RELU: break; default: std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); return; } - if (ctx->compute_ctx == nullptr) { - ctx->compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); - ggml_vk_ctx_begin(ctx->device, ctx->compute_ctx); + vk_context compute_ctx; + + if (ctx->compute_ctx.expired()) { + compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + ctx->compute_ctx = compute_ctx; + ggml_vk_ctx_begin(ctx->device, compute_ctx); + } else { + compute_ctx = ctx->compute_ctx.lock(); } switch (node->op) { case GGML_OP_REPEAT: - ggml_vk_repeat(ctx, ctx->compute_ctx, src0, src1, node); + ggml_vk_repeat(ctx, compute_ctx, src0, node); break; case GGML_OP_GET_ROWS: - ggml_vk_get_rows(ctx, ctx->compute_ctx, src0, src1, node); + ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_ADD: - ggml_vk_add(ctx, ctx->compute_ctx, src0, src1, node); + ggml_vk_add(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_MUL: - ggml_vk_mul(ctx, ctx->compute_ctx, src0, src1, node); + ggml_vk_mul(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_DIV: - ggml_vk_div(ctx, ctx->compute_ctx, src0, src1, node); + ggml_vk_div(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_CONCAT: + ggml_vk_concat(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_UPSCALE: + ggml_vk_upscale(ctx, compute_ctx, src0, node); break; case GGML_OP_SCALE: - ggml_vk_scale(ctx, ctx->compute_ctx, src0, node); + ggml_vk_scale(ctx, compute_ctx, src0, node); break; case GGML_OP_SQR: - ggml_vk_sqr(ctx, ctx->compute_ctx, src0, node); + ggml_vk_sqr(ctx, compute_ctx, src0, node); break; case GGML_OP_CLAMP: - ggml_vk_clamp(ctx, ctx->compute_ctx, src0, node); + ggml_vk_clamp(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_PAD: + ggml_vk_pad(ctx, compute_ctx, src0, node); break; case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: - ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node); + ggml_vk_cpy(ctx, compute_ctx, src0, node); break; case GGML_OP_NORM: - ggml_vk_norm(ctx, ctx->compute_ctx, src0, node); + ggml_vk_norm(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_GROUP_NORM: + ggml_vk_group_norm(ctx, compute_ctx, src0, node); break; case GGML_OP_RMS_NORM: - ggml_vk_rms_norm(ctx, ctx->compute_ctx, src0, node); + ggml_vk_rms_norm(ctx, compute_ctx, src0, node); break; case GGML_OP_UNARY: switch (ggml_get_unary_op(node)) { case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_RELU: - ggml_vk_unary(ctx, ctx->compute_ctx, src0, node); + case GGML_UNARY_OP_TANH: + ggml_vk_unary(ctx, compute_ctx, src0, node); break; default: return; } break; case GGML_OP_DIAG_MASK_INF: - ggml_vk_diag_mask_inf(ctx, ctx->compute_ctx, src0, node); + ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node); break; case GGML_OP_SOFT_MAX: - ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, node); + ggml_vk_soft_max(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_ROPE: - ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, src2, node); + ggml_vk_rope(ctx, compute_ctx, src0, src1, src2, node); break; case GGML_OP_ARGSORT: - ggml_vk_argsort(ctx, ctx->compute_ctx, src0, node); + ggml_vk_argsort(ctx, compute_ctx, src0, node); break; case GGML_OP_SUM_ROWS: - ggml_vk_sum_rows(ctx, ctx->compute_ctx, src0, node); + ggml_vk_sum_rows(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_IM2COL: + ggml_vk_im2col(ctx, compute_ctx, src0, src1, node); + + break; + case GGML_OP_TIMESTEP_EMBEDDING: + ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node); + + break; + case GGML_OP_LEAKY_RELU: + ggml_vk_leaky_relu(ctx, compute_ctx, src0, node); break; case GGML_OP_MUL_MAT: - ggml_vk_mul_mat(ctx, ctx->compute_ctx, src0, src1, node); + ggml_vk_mul_mat(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_MUL_MAT_ID: - ggml_vk_mul_mat_id(ctx, ctx->compute_ctx, src0, src1, src2, node); + ggml_vk_mul_mat_id(ctx, compute_ctx, src0, src1, src2, node); break; default: return; } - extra->ctx_idx = ctx->compute_ctx->idx; + ctx->tensor_ctxs[node_idx] = compute_ctx; #ifdef GGML_VULKAN_CHECK_RESULTS // Force context reset on each node so that each tensor ends up in its own context @@ -5594,13 +5928,13 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod #endif if (last_node) { - ggml_vk_ctx_end(ctx->compute_ctx); - ctx->compute_ctx->exit_tensor = node; - ctx->compute_ctx = nullptr; + ggml_vk_ctx_end(compute_ctx); + compute_ctx->exit_tensor_idx = node_idx; + ctx->compute_ctx.reset(); } } -static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor){ +static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){ ggml_tensor_extra_gpu * extra = nullptr; switch (tensor->op) { @@ -5608,13 +5942,17 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * case GGML_OP_GET_ROWS: case GGML_OP_MUL: case GGML_OP_DIV: + case GGML_OP_CONCAT: + case GGML_OP_UPSCALE: case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_CLAMP: + case GGML_OP_PAD: case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: case GGML_OP_NORM: + case GGML_OP_GROUP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: @@ -5626,6 +5964,10 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * case GGML_OP_NONE: case GGML_OP_ARGSORT: case GGML_OP_SUM_ROWS: + case GGML_OP_IM2COL: + case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_LEAKY_RELU: + case GGML_OP_REPEAT: extra = (ggml_tensor_extra_gpu *) tensor->extra; break; @@ -5633,7 +5975,9 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * switch (ggml_get_unary_op(tensor)) { case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_TANH: extra = (ggml_tensor_extra_gpu *) tensor->extra; break; default: @@ -5656,31 +6000,31 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")"); #ifdef GGML_VULKAN_CHECK_RESULTS - ggml_vk_check_results_0(ctx, tensor); + ggml_vk_check_results_0(tensor); #endif - vk_context& subctx = ctx->gc.contexts[extra->ctx_idx]; + vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock(); // Only run if ctx hasn't been submitted yet - if (!subctx.seqs.empty()) { + if (!subctx->seqs.empty()) { // Do staging buffer copies - for (auto& cpy : subctx.in_memcpys) { + for (auto& cpy : subctx->in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - ggml_vk_submit(&subctx, ctx->fence); + ggml_vk_submit(subctx, ctx->fence); } - if (tensor == subctx.exit_tensor) { + if (tensor_idx == subctx->exit_tensor_idx) { VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); ctx->device->device.resetFences({ ctx->fence }); // Do staging buffer copies - for (auto& cpy : subctx.out_memcpys) { + for (auto& cpy : subctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - subctx.in_memcpys.clear(); - subctx.out_memcpys.clear(); + subctx->in_memcpys.clear(); + subctx->out_memcpys.clear(); } return true; @@ -5725,8 +6069,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) { ctx->staging_offset = 0; - ctx->compute_ctx = nullptr; - ctx->transfer_ctx = nullptr; + ctx->tensor_ctxs.clear(); ctx->gc.contexts.clear(); } @@ -6063,15 +6406,20 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - if (ctx->transfer_ctx == nullptr) { + vk_context transfer_ctx; + + if (ctx->transfer_ctx.expired()) { // Initialize new transfer context - ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); - ggml_vk_ctx_begin(ctx->device, ctx->transfer_ctx); + transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); + ctx->transfer_ctx = transfer_ctx; + ggml_vk_ctx_begin(ctx->device, transfer_ctx); + } else { + transfer_ctx = ctx->transfer_ctx.lock(); } vk_buffer buf = extra->buffer_gpu.lock(); - ggml_vk_buffer_write_async(ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size, ctx->staging, ctx->staging_offset); + ggml_vk_buffer_write_async(transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size, ctx->staging, ctx->staging_offset); } GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { @@ -6081,15 +6429,20 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - if (ctx->transfer_ctx == nullptr) { + vk_context transfer_ctx; + + if (ctx->transfer_ctx.expired()) { // Initialize new transfer context - ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); - ggml_vk_ctx_begin(ctx->device, ctx->transfer_ctx); + transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); + ctx->transfer_ctx = transfer_ctx; + ggml_vk_ctx_begin(ctx->device, transfer_ctx); + } else { + transfer_ctx = ctx->transfer_ctx.lock(); } vk_buffer buf = extra->buffer_gpu.lock(); - ggml_vk_buffer_read_async(ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size, ctx->staging, ctx->staging_offset); + ggml_vk_buffer_read_async(transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size, ctx->staging, ctx->staging_offset); } GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { @@ -6099,16 +6452,21 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - if (ctx->transfer_ctx == nullptr) { + vk_context transfer_ctx; + + if (ctx->transfer_ctx.expired()) { // Initialize new transfer context - ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); - ggml_vk_ctx_begin(ctx->device, ctx->transfer_ctx); + transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); + ctx->transfer_ctx = transfer_ctx; + ggml_vk_ctx_begin(ctx->device, transfer_ctx); + } else { + transfer_ctx = ctx->transfer_ctx.lock(); } vk_buffer src_buf = src_extra->buffer_gpu.lock(); vk_buffer dst_buf = dst_extra->buffer_gpu.lock(); - ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src)); + ggml_vk_buffer_copy_async(transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src)); return true; } @@ -6118,25 +6476,27 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) { VK_LOG_DEBUG("ggml_backend_vk_synchronize()"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - if(ctx->transfer_ctx == nullptr) { + if(ctx->transfer_ctx.expired()) { return; } - ggml_vk_ctx_end(ctx->transfer_ctx); + vk_context transfer_ctx = ctx->transfer_ctx.lock(); + + ggml_vk_ctx_end(transfer_ctx); - for (auto& cpy : ctx->transfer_ctx->in_memcpys) { + for (auto& cpy : transfer_ctx->in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - ggml_vk_submit(ctx->transfer_ctx, ctx->fence); + ggml_vk_submit(transfer_ctx, ctx->fence); VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences"); ctx->device->device.resetFences({ ctx->fence }); - for (auto& cpy : ctx->transfer_ctx->out_memcpys) { + for (auto& cpy : transfer_ctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - ctx->transfer_ctx = nullptr; + ctx->transfer_ctx.reset(); } static bool ggml_vk_is_empty(ggml_tensor * node) { @@ -6159,8 +6519,11 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen last_node -= 1; } + // Reserve tensor context space for all nodes + ctx->tensor_ctxs.resize(cgraph->n_nodes); + for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_build_graph(ctx,cgraph->nodes[i], i == last_node); + ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node); } for (int i = 0; i < cgraph->n_nodes; i++) { @@ -6170,13 +6533,17 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen continue; } - bool ok = ggml_vk_compute_forward(ctx, node); + bool ok = ggml_vk_compute_forward(ctx, node, i); if (!ok) { - fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + if (node->op == GGML_OP_UNARY) { + std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl; + } else { + std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl; + } } #ifdef GGML_VULKAN_CHECK_RESULTS else { - ggml_vk_check_results_1(ctx, node); + ggml_vk_check_results_1(node); } #endif GGML_ASSERT(ok); @@ -6196,8 +6563,10 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_TANH: return ggml_is_contiguous(op->src[0]); default: return false; @@ -6270,11 +6639,11 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const } return false; } break; - // case GGML_OP_REPEAT: - // { - // ggml_type src0_type = op->src[0]->type; - // return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; - // } break; + case GGML_OP_REPEAT: + { + ggml_type src0_type = op->src[0]->type; + return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; + } break; case GGML_OP_ROPE: return ggml_is_contiguous(op->src[0]); case GGML_OP_NONE: @@ -6283,18 +6652,25 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: case GGML_OP_NORM: + case GGML_OP_GROUP_NORM: + case GGML_OP_RMS_NORM: case GGML_OP_ADD: case GGML_OP_MUL: case GGML_OP_DIV: - case GGML_OP_RMS_NORM: + case GGML_OP_CONCAT: + case GGML_OP_UPSCALE: case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_CLAMP: + case GGML_OP_PAD: case GGML_OP_CONT: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_ARGSORT: case GGML_OP_SUM_ROWS: + case GGML_OP_IM2COL: + case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_LEAKY_RELU: return true; default: return false; @@ -6498,7 +6874,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d } else if (tensor->type == GGML_TYPE_I32) { val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } fprintf(stderr, "% 7.2f ", val); } else { @@ -6509,10 +6885,12 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d } } -static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) { +static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) { void * tensor_data = tensor->data; - if (ggml_backend_buffer_is_vk(tensor->buffer)) { + const bool is_gpu = tensor->buffer != nullptr && ggml_backend_buffer_is_vk(tensor->buffer); + + if (is_gpu) { const size_t tensor_size = ggml_nbytes(tensor); tensor_data = malloc(tensor_size); @@ -6533,13 +6911,10 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso std::cerr << std::endl << "Result:" << std::endl; ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0); std::cerr << std::endl; - std::cerr << std::endl << "Result:" << std::endl; - ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0); - std::cerr << std::endl; std::vector<const ggml_tensor *> done; ggml_vk_print_graph_origin(tensor, done); - if (ggml_backend_buffer_is_vk(tensor->buffer)) { + if (is_gpu) { free(tensor_data); } } @@ -6548,8 +6923,8 @@ void * comp_result; size_t comp_size; size_t comp_nb[GGML_MAX_DIMS]; size_t check_counter = 0; -static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * tensor) { - if (tensor->op == GGML_OP_TRANSPOSE) { +static void ggml_vk_check_results_0(ggml_tensor * tensor) { + if (tensor->op == GGML_OP_TRANSPOSE) { return; } @@ -6565,7 +6940,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * ggml_tensor * src2 = tensor->src[2]; struct ggml_init_params iparams = { - /*.mem_size =*/ 1024*1024*1024, + /*.mem_size =*/ 2ul*1024ul*1024ul*1024ul, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, }; @@ -6620,11 +6995,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS); } } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { - ggml_vk_print_tensor(ctx, src0, "src0"); + ggml_vk_print_tensor(src0, "src0"); } } if (src1 != nullptr) { @@ -6662,27 +7037,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS); } } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { - ggml_vk_print_tensor(ctx, src1, "src1"); - std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl; - std::cerr << "src1_clone=" << tensor << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl; - if (src1->src[0] != nullptr) { - std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl; - } - if (src1->src[1] != nullptr) { - std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl; - } - std::cerr << std::endl << "Result:" << std::endl; - ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0); - std::cerr << std::endl; - std::cerr << std::endl << "Result:" << std::endl; - ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 1, 0); - std::cerr << std::endl; - std::vector<const ggml_tensor *> done; - ggml_vk_print_graph_origin(src1_clone, done); + ggml_vk_print_tensor(src1, "src1"); } } if (src2 != nullptr) { @@ -6720,27 +7079,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS); } } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { - ggml_vk_print_tensor(ctx, src2, "src2"); - std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl; - std::cerr << "src2_clone=" << tensor << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl; - if (src2->src[0] != nullptr) { - std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl; - } - if (src2->src[1] != nullptr) { - std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl; - } - std::cerr << std::endl << "Result:" << std::endl; - ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0); - std::cerr << std::endl; - std::cerr << std::endl << "Result:" << std::endl; - ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 1, 0); - std::cerr << std::endl; - std::vector<const ggml_tensor *> done; - ggml_vk_print_graph_origin(src2_clone, done); + ggml_vk_print_tensor(src2, "src2"); } } @@ -6752,16 +7095,24 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * tensor_clone = ggml_mul(ggml_ctx, src0_clone, src1_clone); } else if (tensor->op == GGML_OP_DIV) { tensor_clone = ggml_div(ggml_ctx, src0_clone, src1_clone); + } else if (tensor->op == GGML_OP_CONCAT) { + tensor_clone = ggml_concat(ggml_ctx, src0_clone, src1_clone, *(int *)tensor->op_params); + } else if (tensor->op == GGML_OP_UPSCALE) { + tensor_clone = ggml_upscale_ext(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); } else if (tensor->op == GGML_OP_SCALE) { tensor_clone = ggml_scale(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0]); } else if (tensor->op == GGML_OP_SQR) { tensor_clone = ggml_sqr(ggml_ctx, src0_clone); } else if (tensor->op == GGML_OP_CLAMP) { tensor_clone = ggml_clamp(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]); + } else if (tensor->op == GGML_OP_PAD) { + tensor_clone = ggml_pad(ggml_ctx, src0_clone, tensor->ne[0] - src0_clone->ne[0], tensor->ne[1] - src0_clone->ne[1], tensor->ne[2] - src0_clone->ne[2], tensor->ne[3] - src0_clone->ne[3]); } else if (tensor->op == GGML_OP_ADD) { tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone); } else if (tensor->op == GGML_OP_NORM) { tensor_clone = ggml_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params); + } else if (tensor->op == GGML_OP_GROUP_NORM) { + tensor_clone = ggml_group_norm(ggml_ctx, src0_clone, *(int *)tensor->op_params); } else if (tensor->op == GGML_OP_RMS_NORM) { tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params); } else if (tensor->op == GGML_OP_SOFT_MAX) { @@ -6777,12 +7128,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * const int mode = ((int32_t *) tensor->op_params)[2]; //const int n_ctx_ggml = ((int32_t *) tensor->op_params)[3]; const int n_ctx_orig_ggml = ((int32_t *) tensor->op_params)[4]; - float freq_base = ((float *) tensor->op_params)[5]; - float freq_scale = ((float *) tensor->op_params)[6]; - float ext_factor = ((float *) tensor->op_params)[7]; - float attn_factor = ((float *) tensor->op_params)[8]; - float beta_fast = ((float *) tensor->op_params)[9]; - float beta_slow = ((float *) tensor->op_params)[10]; + const float freq_base = ((float *) tensor->op_params)[5]; + const float freq_scale = ((float *) tensor->op_params)[6]; + const float ext_factor = ((float *) tensor->op_params)[7]; + const float attn_factor = ((float *) tensor->op_params)[8]; + const float beta_fast = ((float *) tensor->op_params)[9]; + const float beta_slow = ((float *) tensor->op_params)[10]; tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); } else if (tensor->op == GGML_OP_UNARY) { switch (ggml_get_unary_op(tensor)) { @@ -6792,12 +7143,18 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * case GGML_UNARY_OP_GELU: tensor_clone = ggml_gelu(ggml_ctx, src0_clone); break; + case GGML_UNARY_OP_GELU_QUICK: + tensor_clone = ggml_gelu_quick(ggml_ctx, src0_clone); + break; case GGML_UNARY_OP_RELU: tensor_clone = ggml_relu(ggml_ctx, src0_clone); break; + case GGML_UNARY_OP_TANH: + tensor_clone = ggml_tanh(ggml_ctx, src0_clone); + break; default: std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) { if (src1 == nullptr) { @@ -6823,9 +7180,26 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * tensor_clone = ggml_argsort(ggml_ctx, src0_clone, (ggml_sort_order) *(int *)tensor->op_params); } else if (tensor->op == GGML_OP_SUM_ROWS) { tensor_clone = ggml_sum_rows(ggml_ctx, src0_clone); + } else if (tensor->op == GGML_OP_IM2COL) { + const int32_t s0 = tensor->op_params[0]; + const int32_t s1 = tensor->op_params[1]; + const int32_t p0 = tensor->op_params[2]; + const int32_t p1 = tensor->op_params[3]; + const int32_t d0 = tensor->op_params[4]; + const int32_t d1 = tensor->op_params[5]; + + const bool is_2D = tensor->op_params[6] == 1; + tensor_clone = ggml_im2col(ggml_ctx, src0_clone, src1_clone, s0, s1, p0, p1, d0, d1, is_2D, tensor->type); + } else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) { + const int32_t dim = tensor->op_params[0]; + const int32_t max_period = tensor->op_params[1]; + tensor_clone = ggml_timestep_embedding(ggml_ctx, src0_clone, dim, max_period); + } else if (tensor->op == GGML_OP_LEAKY_RELU) { + const float * op_params = (const float *)tensor->op_params; + tensor_clone = ggml_leaky_relu(ggml_ctx, src0_clone, op_params[0], false); } else { std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx); @@ -6834,7 +7208,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8); if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { - ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone"); + ggml_vk_print_tensor(tensor_clone, "tensor_clone"); } comp_size = ggml_nbytes(tensor_clone); @@ -6851,9 +7225,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * } ggml_free(ggml_ctx); + + VK_LOG_DEBUG("END ggml_vk_check_results_0(" << tensor->name << ")"); } -static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * tensor) { +static void ggml_vk_check_results_1(ggml_tensor * tensor) { if (tensor->op == GGML_OP_TRANSPOSE) { return; } @@ -6912,7 +7288,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * } } else { std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) { @@ -6935,7 +7311,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * std::cerr << std::endl; std::vector<const ggml_tensor *> done; ggml_vk_print_graph_origin(tensor, done); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (first_error[0] == -1 && std::fabs(correct - result) > 0.1f) { first_error[0] = i0; @@ -6977,11 +7353,6 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * std::cerr << std::endl << "Correct:" << std::endl; ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 0, 0); std::cerr << std::endl; - std::cerr << std::endl << "Result:" << std::endl; - ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0); - std::cerr << std::endl << "Correct:" << std::endl; - ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 1, 0); - std::cerr << std::endl; std::vector<const ggml_tensor *> done; ggml_vk_print_graph_origin(tensor, done); } @@ -7006,7 +7377,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * std::cerr << std::endl; std::vector<const ggml_tensor *> done; ggml_vk_print_graph_origin(tensor, done); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else { std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl; } @@ -7018,5 +7389,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * if (ggml_backend_buffer_is_vk(tensor->buffer)) { free(tensor_data); } + + VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")"); } #endif diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index b5fdb96d..73054bfe 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -47,6 +47,9 @@ #include <unistd.h> #endif +#if defined(__ARM_FEATURE_SVE) +int ggml_sve_cnt_b = 0; +#endif #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) #undef GGML_USE_LLAMAFILE #endif @@ -63,6 +66,9 @@ // disable POSIX deprecation warnings // these functions are never going away, anyway #pragma warning(disable: 4996) + +// unreachable code because of multiple instances of code after GGML_ABORT +#pragma warning(disable: 4702) #endif #if defined(_WIN32) @@ -151,23 +157,69 @@ typedef pthread_t ggml_thread_t; #include <sys/wait.h> -void ggml_print_backtrace(void) { - /* - #include <execinfo.h> - #include <dlfcn.h> +#if defined(__ANDROID__) +#include <unwind.h> +#include <dlfcn.h> +#include <stdio.h> - void * trace[100]; +struct backtrace_state { + void ** current; + void ** end; +}; - int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0])); +static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) { + struct backtrace_state * state = (struct backtrace_state *)arg; + uintptr_t pc = _Unwind_GetIP(context); + if (pc) { + if (state->current == state->end) { + return _URC_END_OF_STACK; + } else { + *state->current++ = (void*)pc; + } + } + return _URC_NO_REASON; +} + +static void ggml_print_backtrace_symbols(void) { + const int max = 100; + void* buffer[max]; + struct backtrace_state state = {buffer, buffer + max}; + _Unwind_Backtrace(unwind_callback, &state); + + int count = state.current - buffer; + + for (int idx = 0; idx < count; ++idx) { + const void * addr = buffer[idx]; + const char * symbol = ""; + + Dl_info info; + if (dladdr(addr, &info) && info.dli_sname) { + symbol = info.dli_sname; + } + + fprintf(stderr, "%d: %p %s\n", idx, addr, symbol); + } +} +#elif defined(__linux__) && defined(__GLIBC__) +#include <execinfo.h> +static void ggml_print_backtrace_symbols(void) { + void * trace[100]; + int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0])); backtrace_symbols_fd(trace, nptrs, STDERR_FILENO); - */ +} +#else +static void ggml_print_backtrace_symbols(void) { + // platform not supported +} +#endif - // backtrack_symbols does not show line numbers, use gdb instead +static void ggml_print_backtrace(void) { char attach[32]; snprintf(attach, sizeof(attach), "attach %d", getpid()); int pid = fork(); if (pid == 0) { + // try gdb execlp("gdb", "gdb", "--batch", "-ex", "set style enabled on", "-ex", attach, @@ -175,16 +227,46 @@ void ggml_print_backtrace(void) { "-ex", "detach", "-ex", "quit", (char *) NULL); + // try lldb + execlp("lldb", "lldb", "--batch", + "-o", "bt", + "-o", "quit", + "-p", attach, + (char *) NULL); + exit(EXIT_FAILURE); } else { - waitpid(pid, NULL, 0); + int wstatus; + waitpid(pid, &wstatus, 0); + if (WIFEXITED(wstatus)) { + if (WEXITSTATUS(wstatus) == EXIT_FAILURE) { + // gdb failed, fallback to backtrace_symbols + ggml_print_backtrace_symbols(); + } + } } } #else -void ggml_print_backtrace(void) { +static void ggml_print_backtrace(void) { // platform not supported } #endif +void ggml_abort(const char * file, int line, const char * fmt, ...) { + fflush(stdout); + + fprintf(stderr, "%s:%d: ", file, line); + + va_list args; + va_start(args, fmt); + vfprintf(stderr, fmt, args); + va_end(args); + + fprintf(stderr, "\n"); + + ggml_print_backtrace(); + abort(); +} + #define GGML_DEBUG 0 #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 @@ -256,7 +338,7 @@ inline static void * ggml_aligned_malloc(size_t size) { break; } GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); return NULL; } return aligned_memory; @@ -277,7 +359,7 @@ inline static void * ggml_malloc(size_t size) { void * result = malloc(size); if (result == NULL) { GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } return result; } @@ -291,7 +373,7 @@ inline static void * ggml_calloc(size_t num, size_t size) { void * result = calloc(num, size); if (result == NULL) { GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } return result; } @@ -414,9 +496,16 @@ void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) { } } +void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) { + for (int i = 0; i < n; i++) { + y[i] = ggml_compute_fp32_to_bf16(x[i]); + } +} + void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) { int i = 0; #if defined(__AVX512BF16__) + // subnormals are flushed to zero on this platform for (; i + 32 <= n; i += 32) { _mm512_storeu_si512( (__m512i *)(y + i), @@ -939,7 +1028,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .is_quantized = false, .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row, .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row, - .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row, + .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16, .vec_dot_type = GGML_TYPE_BF16, .nrows = 1, @@ -2339,7 +2428,7 @@ inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } //inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } -inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } +inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); } inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); } inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); } @@ -3544,7 +3633,7 @@ static inline int ggml_up(int n, int m) { } // assert that pointer is aligned to GGML_MEM_ALIGN -#define ggml_assert_aligned(ptr) \ +#define GGML_ASSERT_ALIGNED(ptr) \ GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0) //////////////////////////////////////////////////////////////////////////////// @@ -3645,7 +3734,13 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_ASSERT(ctx->mem_buffer != NULL); - ggml_assert_aligned(ctx->mem_buffer); + GGML_ASSERT_ALIGNED(ctx->mem_buffer); + +#if defined(__ARM_FEATURE_SVE) + if (!ggml_sve_cnt_b) { + ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL); + } +#endif GGML_PRINT_DEBUG("%s: context initialized\n", __func__); @@ -3777,7 +3872,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml .type = type, }; - ggml_assert_aligned(mem_buffer + obj_new->offs); + GGML_ASSERT_ALIGNED(mem_buffer + obj_new->offs); if (obj_cur != NULL) { obj_cur->next = obj_new; @@ -3801,7 +3896,8 @@ static struct ggml_tensor * ggml_new_tensor_impl( struct ggml_tensor * view_src, size_t view_offs) { - assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS); + GGML_ASSERT(type >= 0 && type < GGML_TYPE_COUNT); + GGML_ASSERT(n_dims >= 1 && n_dims <= GGML_MAX_DIMS); // find the base tensor and absolute offset if (view_src != NULL && view_src->view_src != NULL) { @@ -3878,7 +3974,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( #endif // TODO: this should not be needed as long as we don't rely on aligned SIMD loads - //ggml_assert_aligned(result->data); + //GGML_ASSERT_ALIGNED(result->data); for (int i = 0; i < n_dims; i++) { result->ne[i] = ne[i]; @@ -4051,8 +4147,8 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } return tensor; @@ -4110,8 +4206,8 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } return tensor; @@ -4180,11 +4276,9 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { } default: { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } - - return 0.0f; } void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { @@ -4227,8 +4321,8 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -4248,10 +4342,8 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i case GGML_TYPE_F32: return ((float *) data)[0]; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } - - return 0.0f; } void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) { @@ -4283,8 +4375,8 @@ void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -4321,11 +4413,9 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { } default: { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } - - return 0.0f; } void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { @@ -4362,8 +4452,8 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -4383,10 +4473,8 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, case GGML_TYPE_F32: return ((float *) data)[0]; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } - - return 0.0f; } void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) { @@ -4418,8 +4506,8 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -4442,8 +4530,11 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) { } struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) { - strncpy(tensor->name, name, sizeof(tensor->name) - 1); - tensor->name[sizeof(tensor->name) - 1] = '\0'; + size_t i; + for (i = 0; i < sizeof(tensor->name) - 1 && name[i] != '\0'; i++) { + tensor->name[i] = name[i]; + } + tensor->name[i] = '\0'; return tensor; } @@ -5014,7 +5105,7 @@ struct ggml_tensor * ggml_mean( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement is_node = true; } @@ -5037,7 +5128,7 @@ struct ggml_tensor * ggml_argmax( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); is_node = true; } @@ -5360,7 +5451,7 @@ static struct ggml_tensor * ggml_norm_impl( bool is_node = false; if (!inplace && (a->grad)) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -5459,17 +5550,19 @@ static struct ggml_tensor * ggml_group_norm_impl( struct ggml_context * ctx, struct ggml_tensor * a, int n_groups, + float eps, bool inplace) { bool is_node = false; if (!inplace && (a->grad)) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op_params[0] = n_groups; + ggml_set_op_params_i32(result, 0, n_groups); + ggml_set_op_params_f32(result, 1, eps); result->op = GGML_OP_GROUP_NORM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -5481,15 +5574,17 @@ static struct ggml_tensor * ggml_group_norm_impl( struct ggml_tensor * ggml_group_norm( struct ggml_context * ctx, struct ggml_tensor * a, - int n_groups) { - return ggml_group_norm_impl(ctx, a, n_groups, false); + int n_groups, + float eps) { + return ggml_group_norm_impl(ctx, a, n_groups, eps, false); } struct ggml_tensor * ggml_group_norm_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - int n_groups) { - return ggml_group_norm_impl(ctx, a, n_groups, true); + int n_groups, + float eps) { + return ggml_group_norm_impl(ctx, a, n_groups, eps, true); } // ggml_mul_mat @@ -5877,7 +5972,7 @@ struct ggml_tensor * ggml_reshape( if (b->grad) { // gradient propagation is not supported - //GGML_ASSERT(false); + //GGML_ABORT("fatal error"); } struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0); @@ -6660,7 +6755,7 @@ struct ggml_tensor * ggml_clamp( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6736,7 +6831,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d( bool is_node = false; if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6808,7 +6903,7 @@ struct ggml_tensor * ggml_im2col( bool is_node = false; if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6894,7 +6989,7 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0( bool is_node = false; if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6935,7 +7030,7 @@ struct ggml_tensor * ggml_pool_1d( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6973,7 +7068,7 @@ struct ggml_tensor * ggml_pool_2d( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -7006,7 +7101,7 @@ static struct ggml_tensor * ggml_upscale_impl( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -7056,7 +7151,7 @@ struct ggml_tensor * ggml_pad( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -7105,7 +7200,7 @@ struct ggml_tensor * ggml_timestep_embedding( bool is_node = false; if (timesteps->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -7231,7 +7326,7 @@ struct ggml_tensor * ggml_flash_attn_back( struct ggml_tensor * v, struct ggml_tensor * d, bool masked) { - GGML_ASSERT(false && "TODO: adapt to ggml_flash_attn_ext() changes"); + GGML_ABORT("TODO: adapt to ggml_flash_attn_ext() changes"); GGML_ASSERT(ggml_can_mul_mat(k, q)); // TODO: check if vT can be multiplied by (k*qT) @@ -7330,7 +7425,7 @@ struct ggml_tensor * ggml_ssm_conv( bool is_node = false; if (s->grad || x->grad || c->grad || sq->grad) { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement is_node = true; } @@ -7384,7 +7479,7 @@ struct ggml_tensor * ggml_ssm_scan( bool is_node = false; if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad || sq->grad) { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement is_node = true; } @@ -7416,7 +7511,7 @@ struct ggml_tensor * ggml_win_part( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -7454,7 +7549,7 @@ struct ggml_tensor * ggml_win_unpart( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -7484,7 +7579,7 @@ struct ggml_tensor * ggml_get_rel_pos( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -8174,7 +8269,7 @@ static void ggml_compute_forward_dup_f16( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } else { //printf("%s: this is not optimal - fix me\n", __func__); @@ -8216,7 +8311,7 @@ static void ggml_compute_forward_dup_f16( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } return; @@ -8333,7 +8428,7 @@ static void ggml_compute_forward_dup_f16( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } @@ -8460,7 +8555,7 @@ static void ggml_compute_forward_dup_bf16( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } else { //printf("%s: this is not optimal - fix me\n", __func__); @@ -8520,7 +8615,7 @@ static void ggml_compute_forward_dup_bf16( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } return; @@ -8689,7 +8784,7 @@ static void ggml_compute_forward_dup_bf16( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } @@ -8775,7 +8870,7 @@ static void ggml_compute_forward_dup_f32( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } else { //printf("%s: this is not optimal - fix me\n", __func__); @@ -8835,7 +8930,7 @@ static void ggml_compute_forward_dup_f32( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } @@ -9006,7 +9101,7 @@ static void ggml_compute_forward_dup_f32( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } @@ -9184,8 +9279,8 @@ static void ggml_compute_forward_dup( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -9337,7 +9432,7 @@ static void ggml_compute_forward_add_f16_f32( } else { // src1 is not contiguous - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -9412,7 +9507,7 @@ static void ggml_compute_forward_add_bf16_f32( } else { // src1 is not contiguous - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -9464,7 +9559,7 @@ static void ggml_compute_forward_add_f16_f16( } else { // src1 is not contiguous - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -9516,7 +9611,7 @@ static void ggml_compute_forward_add_bf16_bf16( } else { // src1 is not contiguous - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -9610,7 +9705,7 @@ static void ggml_compute_forward_add( ggml_compute_forward_add_f32(params, dst); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_TYPE_F16: @@ -9622,7 +9717,7 @@ static void ggml_compute_forward_add( ggml_compute_forward_add_f16_f32(params, dst); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_TYPE_BF16: @@ -9634,7 +9729,7 @@ static void ggml_compute_forward_add( ggml_compute_forward_add_bf16_f32(params, dst); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_TYPE_Q4_0: @@ -9672,8 +9767,8 @@ static void ggml_compute_forward_add( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10007,7 +10102,7 @@ static void ggml_compute_forward_add1( ggml_compute_forward_add1_f16_f32(params, dst); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_TYPE_BF16: @@ -10019,7 +10114,7 @@ static void ggml_compute_forward_add1( ggml_compute_forward_add1_bf16_f32(params, dst); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_TYPE_Q4_0: @@ -10058,8 +10153,8 @@ static void ggml_compute_forward_add1( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10191,8 +10286,8 @@ static void ggml_compute_forward_acc( case GGML_TYPE_Q4_0_8_8: default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10272,8 +10367,8 @@ static void ggml_compute_forward_sub( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10383,8 +10478,8 @@ static void ggml_compute_forward_mul( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10474,8 +10569,8 @@ static void ggml_compute_forward_div( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10519,8 +10614,8 @@ static void ggml_compute_forward_sqr( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10564,8 +10659,8 @@ static void ggml_compute_forward_sqrt( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10609,8 +10704,8 @@ static void ggml_compute_forward_log( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10738,8 +10833,8 @@ static void ggml_compute_forward_sum( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10791,8 +10886,8 @@ static void ggml_compute_forward_sum_rows( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10848,8 +10943,8 @@ static void ggml_compute_forward_mean( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10896,8 +10991,8 @@ static void ggml_compute_forward_argmax( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11014,8 +11109,8 @@ static void ggml_compute_forward_repeat( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11092,8 +11187,8 @@ static void ggml_compute_forward_repeat_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11161,8 +11256,8 @@ static void ggml_compute_forward_concat( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11205,8 +11300,8 @@ static void ggml_compute_forward_abs( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11249,8 +11344,8 @@ static void ggml_compute_forward_sgn( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11293,8 +11388,8 @@ static void ggml_compute_forward_neg( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11337,8 +11432,8 @@ static void ggml_compute_forward_step( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11380,8 +11475,8 @@ static void ggml_compute_forward_tanh( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11424,8 +11519,8 @@ static void ggml_compute_forward_elu( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11468,8 +11563,8 @@ static void ggml_compute_forward_relu( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11512,8 +11607,8 @@ static void ggml_compute_forward_sigmoid( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11571,8 +11666,8 @@ static void ggml_compute_forward_gelu( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11630,8 +11725,8 @@ static void ggml_compute_forward_gelu_quick( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11689,8 +11784,8 @@ static void ggml_compute_forward_silu( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } // ggml_compute_forward_leaky_relu @@ -11738,8 +11833,8 @@ static void ggml_compute_forward_leaky_relu( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11801,8 +11896,8 @@ static void ggml_compute_forward_silu_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11843,8 +11938,8 @@ static void ggml_compute_forward_hardswish( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11885,8 +11980,8 @@ static void ggml_compute_forward_hardsigmoid( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11957,8 +12052,8 @@ static void ggml_compute_forward_norm( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -12025,8 +12120,8 @@ static void ggml_compute_forward_rms_norm( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -12198,8 +12293,8 @@ static void ggml_compute_forward_rms_norm_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -12220,10 +12315,11 @@ static void ggml_compute_forward_group_norm_f32( GGML_TENSOR_UNARY_OP_LOCALS - const float eps = 1e-6f; // TODO: make this a parameter - // TODO: optimize + float eps; + memcpy(&eps, dst->op_params + 1, sizeof(float)); + int n_channels = src0->ne[2]; int n_groups = dst->op_params[0]; int n_channels_per_group = (n_channels + n_groups - 1) / n_groups; @@ -12292,8 +12388,8 @@ static void ggml_compute_forward_group_norm( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -12544,6 +12640,8 @@ UseGgmlGemm1:; IQK_MulMat_Not_Available2:; #endif + ggml_barrier(params->shared); + #if GGML_USE_LLAMAFILE if (src1->type != vec_dot_type) { const size_t row_size = ggml_row_size(vec_dot_type, ne10); @@ -12802,6 +12900,34 @@ IQK_MulMat_Not_Available:; continue; } + if (((ggml_n_dims(src0) - 1) == 2) && gemv) { + int64_t src0_cur_start = (ith * ne01) / nth; + int64_t src0_cur_end = ((ith + 1) * ne01) / nth; + src0_cur_start = (src0_cur_start % matmul_num_cols) ? src0_cur_start + matmul_num_cols - (src0_cur_start % matmul_num_cols): src0_cur_start; + src0_cur_end = (src0_cur_end % matmul_num_cols) ? src0_cur_end + matmul_num_cols - (src0_cur_end % matmul_num_cols): src0_cur_end; + if (src0_cur_start >= src0_cur_end) return; + + for (int ir1 = 0; ir1 < nr1; ir1++) { + struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1); + const int id = row_mapping.i1; // selected expert index + + const int64_t i11 = id % ne11; + const int64_t i12 = row_mapping.i2; // row index in src1 + + const int64_t i1 = id; // selected expert index + const int64_t i2 = i12; // row + + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12 * ne11) * row_size + : (i11 * nb11 + i12 * nb12)); + + gemv(ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01, + (const char *) src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start); + } + continue; + } + // distribute the thread work across the inner or outer loop based on which one is larger const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows @@ -13119,17 +13245,17 @@ static void ggml_compute_forward_out_prod( } break; case GGML_TYPE_F16: { - GGML_ASSERT(false); // todo + GGML_ABORT("fatal error"); // todo // ggml_compute_forward_out_prod_f16_f32(params, dst); - } break; + } case GGML_TYPE_F32: { ggml_compute_forward_out_prod_f32(params, dst); } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -13188,8 +13314,8 @@ static void ggml_compute_forward_scale( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -13312,8 +13438,8 @@ static void ggml_compute_forward_set( case GGML_TYPE_Q4_0_8_8: default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -13598,8 +13724,8 @@ static void ggml_compute_forward_get_rows( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } //static bool first = true; @@ -13706,8 +13832,8 @@ static void ggml_compute_forward_get_rows_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } //static bool first = true; @@ -13784,8 +13910,8 @@ static void ggml_compute_forward_diag( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -13854,8 +13980,8 @@ static void ggml_compute_forward_diag_mask_inf( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -13872,8 +13998,8 @@ static void ggml_compute_forward_diag_mask_zero( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -13990,8 +14116,8 @@ static void ggml_compute_forward_soft_max( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -14086,8 +14212,8 @@ static void ggml_compute_forward_soft_max_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -14186,8 +14312,8 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_F64: case GGML_TYPE_COUNT: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -14516,8 +14642,8 @@ static void ggml_compute_forward_rope( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -14540,8 +14666,8 @@ static void ggml_compute_forward_rope_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -14740,8 +14866,8 @@ static void ggml_compute_forward_conv_transpose_1d( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -14912,8 +15038,8 @@ static void ggml_compute_forward_im2col( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15024,7 +15150,7 @@ static void ggml_compute_forward_pool_1d_sk_p0( const struct ggml_tensor * src = dst->src[0]; - assert(src->type == GGML_TYPE_F32); + assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16); if (params->ith != 0) { return; @@ -15037,28 +15163,27 @@ static void ggml_compute_forward_pool_1d_sk_p0( const int64_t rs = dst->ne[0]; while (cdata < data_end) { - const float * const srow = (const float *)cdata; - + const void * srow = (const void *)cdata; int j = 0; - for (int64_t i = 0; i < rs; ++i) { switch (op) { case GGML_OP_POOL_AVG: drow[i] = 0; break; case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break; - case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } for (int ki = 0; ki < k; ++ki) { + const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); switch (op) { - case GGML_OP_POOL_AVG: drow[i] += srow[j]; break; - case GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break; - case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + case GGML_OP_POOL_AVG: drow[i] += srow_j; break; + case GGML_OP_POOL_MAX: if (srow_j > drow[i]) drow[i] = srow_j; break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } ++j; } switch (op) { case GGML_OP_POOL_AVG: drow[i] /= k; break; case GGML_OP_POOL_MAX: break; - case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } } @@ -15092,7 +15217,7 @@ static void ggml_compute_forward_pool_2d( const struct ggml_tensor * src = dst->src[0]; - GGML_ASSERT(src->type == GGML_TYPE_F32); + assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16); if (params->ith != 0) { return; @@ -15127,7 +15252,7 @@ static void ggml_compute_forward_pool_2d( switch (op) { case GGML_OP_POOL_AVG: *out = 0; break; case GGML_OP_POOL_MAX: *out = -FLT_MAX; break; - case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } const int ix = offset0 + ox * s0; @@ -15135,21 +15260,22 @@ static void ggml_compute_forward_pool_2d( for (int ky = 0; ky < k1; ++ky) { if (iy + ky < 0 || iy + ky >= src->ne[1]) continue; - const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky)); + const void * srow = (const void *)(cdata + src->nb[1] * (iy + ky)); for (int kx = 0; kx < k0; ++kx) { int j = ix + kx; if (j < 0 || j >= src->ne[0]) continue; + const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); switch (op) { - case GGML_OP_POOL_AVG: *out += srow[j]; break; - case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break; - case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + case GGML_OP_POOL_AVG: *out += srow_j; break; + case GGML_OP_POOL_MAX: if (srow_j > *out) *out = srow_j; break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } } } switch (op) { case GGML_OP_POOL_AVG: *out /= ka; break; case GGML_OP_POOL_MAX: break; - case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } } } @@ -15213,8 +15339,8 @@ static void ggml_compute_forward_upscale( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15271,8 +15397,8 @@ static void ggml_compute_forward_pad( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15312,8 +15438,8 @@ static void ggml_compute_forward_arange( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15363,8 +15489,8 @@ static void ggml_compute_forward_timestep_embedding( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15422,8 +15548,8 @@ static void ggml_compute_forward_argsort( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15645,8 +15771,8 @@ static void ggml_compute_forward_flash_attn_ext( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15981,8 +16107,8 @@ static void ggml_compute_forward_flash_attn_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16103,8 +16229,8 @@ static void ggml_compute_forward_ssm_conv( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16224,8 +16350,8 @@ static void ggml_compute_forward_ssm_scan( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16287,8 +16413,8 @@ static void ggml_compute_forward_win_part( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16348,8 +16474,8 @@ static void ggml_compute_forward_win_unpart( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16416,8 +16542,8 @@ static void ggml_compute_forward_unary( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16463,8 +16589,8 @@ static void ggml_compute_forward_get_rel_pos( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16544,8 +16670,8 @@ static void ggml_compute_forward_add_rel_pos( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16590,8 +16716,8 @@ static void ggml_compute_forward_map_unary( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16639,8 +16765,8 @@ static void ggml_compute_forward_map_binary( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16838,8 +16964,8 @@ static void ggml_compute_forward_cross_entropy_loss( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16925,8 +17051,8 @@ static void ggml_compute_forward_cross_entropy_loss_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -17261,14 +17387,32 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_COUNT: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } //////////////////////////////////////////////////////////////////////////////// -static size_t ggml_hash_size(size_t min_sz) { +struct ggml_hash_set ggml_hash_set_new(size_t size) { + size = ggml_hash_size(size); + struct ggml_hash_set result; + result.size = size; + result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size); + result.used = GGML_CALLOC(ggml_bitset_size(size), sizeof(ggml_bitset_t)); + return result; +} + +void ggml_hash_set_reset(struct ggml_hash_set * hash_set) { + memset(hash_set->used, 0, sizeof(ggml_bitset_t) * ggml_bitset_size(hash_set->size)); +} + +void ggml_hash_set_free(struct ggml_hash_set * hash_set) { + GGML_FREE(hash_set->used); + GGML_FREE(hash_set->keys); +} + +size_t ggml_hash_size(size_t min_sz) { // next primes after powers of two static const size_t primes[] = { 2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031, @@ -17279,7 +17423,7 @@ static size_t ggml_hash_size(size_t min_sz) { }; static const size_t n_primes = sizeof(primes)/sizeof(primes[0]); - // find the smallest prime that is larger or equal to min_sz + // find the smallest prime that is larger or equal than min_sz size_t l = 0; size_t r = n_primes; while (l < r) { @@ -17294,67 +17438,6 @@ static size_t ggml_hash_size(size_t min_sz) { return sz; } -static size_t ggml_hash(const void * p) { - return (size_t)p; -} - -size_t ggml_hash_find(const struct ggml_hash_set hash_set, struct ggml_tensor * key) { - size_t h = ggml_hash(key) % hash_set.size; - - // linear probing - size_t i = h; - while (hash_set.keys[i] != NULL && hash_set.keys[i] != key) { - i = (i + 1) % hash_set.size; - if (i == h) { - // visited all hash table entries -> not found - return GGML_HASHTABLE_FULL; - } - } - return i; -} - -bool ggml_hash_contains(struct ggml_hash_set hash_set, struct ggml_tensor * key) { - size_t i = ggml_hash_find(hash_set, key); - return i != GGML_HASHTABLE_FULL && hash_set.keys[i] == key; -} - -size_t ggml_hash_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) { - size_t i = ggml_hash_find(hash_set, key); - - GGML_ASSERT(i != GGML_HASHTABLE_FULL); - - if (hash_set.keys[i] == key) { - return GGML_HASHTABLE_ALREADY_EXISTS; - } - - // insert - GGML_ASSERT(hash_set.keys[i] == NULL); - hash_set.keys[i] = key; - return i; -} - -size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) { - size_t i = ggml_hash_find(hash_set, key); - - GGML_ASSERT(i != GGML_HASHTABLE_FULL); - - hash_set.keys[i] = key; - return i; -} - -struct ggml_hash_set ggml_hash_set_new(size_t size) { - size = ggml_hash_size(size); - struct ggml_hash_set result; - result.size = size; - result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size); - memset(result.keys, 0, sizeof(struct ggml_tensor *) * size); - return result; -} - -static void ggml_hash_set_free(struct ggml_hash_set hash_set) { - GGML_FREE(hash_set.keys); -} - struct hash_map { struct ggml_hash_set set; struct ggml_tensor ** vals; @@ -17363,13 +17446,12 @@ struct hash_map { static struct hash_map * ggml_new_hash_map(size_t size) { struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map)); result->set = ggml_hash_set_new(size); - result->vals = GGML_MALLOC(sizeof(struct ggml_tensor *) * result->set.size); - memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size); + result->vals = GGML_CALLOC(result->set.size, sizeof(struct ggml_tensor *)); return result; } static void ggml_hash_map_free(struct hash_map * map) { - ggml_hash_set_free(map->set); + ggml_hash_set_free(&map->set); GGML_FREE(map->vals); GGML_FREE(map); } @@ -17390,7 +17472,7 @@ static struct ggml_tensor * ggml_recompute_graph_node( return node; } - if (!ggml_hash_contains(graph->visited_hash_table, node)) { + if (!ggml_hash_contains(&graph->visited_hash_set, node)) { return node; } @@ -17405,8 +17487,8 @@ static struct ggml_tensor * ggml_recompute_graph_node( return node; } - size_t i = ggml_hash_find(replacements->set, node); - GGML_ASSERT(i != GGML_HASHTABLE_FULL); // assert that not full + size_t i = ggml_hash_find(&replacements->set, node); + GGML_ASSERT(i != GGML_HASHSET_FULL); // assert that not full if (replacements->set.keys[i] == node) { return replacements->vals[i]; } @@ -17464,8 +17546,8 @@ void ggml_build_backward_gradient_checkpointing( // insert checkpoints in replacements for (int i = 0; i < n_checkpoints; ++i) { - size_t k = ggml_hash_find(replacements->set, checkpoints[i]); - GGML_ASSERT(k != GGML_HASHTABLE_FULL); // assert that not full + size_t k = ggml_hash_find(&replacements->set, checkpoints[i]); + GGML_ASSERT(k != GGML_HASHSET_FULL); // assert that not full GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite replacements->set.keys[k] = checkpoints[i]; replacements->vals[k] = checkpoints[i]; @@ -17493,7 +17575,7 @@ void ggml_build_backward_gradient_checkpointing( // functions to change gradients considering the case that input a might be initial gradient with zero value -static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) { +static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) { if (ggml_hash_contains(zero_table, a)) { return b; } else { @@ -17501,7 +17583,7 @@ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct gg } } -static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) { +static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set * zero_table) { if (ggml_hash_contains(zero_table, a)) { struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f); return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false); @@ -17510,7 +17592,7 @@ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct gg } } -static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) { +static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) { if (ggml_hash_contains(zero_table, a)) { return ggml_repeat(ctx, b, a); } else { @@ -17518,7 +17600,7 @@ static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct g } } -static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) { +static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) { if (ggml_hash_contains(zero_table, a)) { return ggml_neg(ctx, b); } else { @@ -17526,7 +17608,7 @@ static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct gg } } -static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) { +static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set * zero_table) { struct ggml_tensor * src0 = tensor->src[0]; struct ggml_tensor * src1 = tensor->src[1]; struct ggml_tensor * src2 = tensor->src[2]; @@ -17695,8 +17777,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_MEAN: case GGML_OP_ARGMAX: { - GGML_ASSERT(false); // TODO: implement - } break; + GGML_ABORT("fatal error"); // TODO: implement + } case GGML_OP_REPEAT: { // necessary for llama @@ -17719,16 +17801,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_CONCAT: { - GGML_ASSERT(false); // TODO: implement - } break; + GGML_ABORT("fatal error"); // TODO: implement + } case GGML_OP_SILU_BACK: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_NORM: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_RMS_NORM: { // necessary for llama @@ -17744,12 +17826,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_RMS_NORM_BACK: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_GROUP_NORM: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_MUL_MAT: { // https://cs231n.github.io/optimization-2/#staged @@ -17810,12 +17892,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_MUL_MAT_ID: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_OUT_PROD: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_SCALE: { // necessary for llama @@ -17991,12 +18073,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_GET_ROWS_BACK: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_DIAG: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_DIAG_MASK_INF: { // necessary for llama @@ -18034,8 +18116,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_SOFT_MAX_BACK: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_ROPE: { // necessary for llama @@ -18110,52 +18192,52 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_CLAMP: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_CONV_TRANSPOSE_1D: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_IM2COL: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_CONV_TRANSPOSE_2D: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_POOL_1D: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_POOL_2D: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_UPSCALE: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_PAD: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_ARANGE: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_TIMESTEP_EMBEDDING: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_ARGSORT: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_LEAKY_RELU: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_FLASH_ATTN_EXT: { struct ggml_tensor * flash_grad = NULL; @@ -18211,13 +18293,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_FLASH_ATTN_BACK: { - GGML_ASSERT(false); // not supported - } break; + GGML_ABORT("fatal error"); // not supported + } case GGML_OP_SSM_CONV: case GGML_OP_SSM_SCAN: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_WIN_PART: case GGML_OP_WIN_UNPART: case GGML_OP_UNARY: @@ -18255,12 +18337,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_UNARY_OP_TANH: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_UNARY_OP_ELU: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_UNARY_OP_RELU: { if (src0->grad) { @@ -18274,16 +18356,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_UNARY_OP_SIGMOID: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_UNARY_OP_GELU: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_UNARY_OP_GELU_QUICK: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_UNARY_OP_SILU: { // necessary for llama @@ -18295,7 +18377,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } } break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_OP_GET_REL_POS: @@ -18309,8 +18391,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_MAP_CUSTOM2: case GGML_OP_MAP_CUSTOM3: { - GGML_ASSERT(false); // not supported - } break; + GGML_ABORT("fatal error"); // not supported + } case GGML_OP_CROSS_ENTROPY_LOSS: { if (src0->grad) { @@ -18325,16 +18407,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - GGML_ASSERT(false); // not supported - } break; + GGML_ABORT("fatal error"); // not supported + } case GGML_OP_NONE: { // nop } break; case GGML_OP_COUNT: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } for (int i = 0; i < GGML_MAX_SRC; ++i) { @@ -18354,7 +18436,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * } // check if already visited - if (ggml_hash_insert(cgraph->visited_hash_table, node) == GGML_HASHTABLE_ALREADY_EXISTS) { + if (ggml_hash_insert(&cgraph->visited_hash_set, node) == GGML_HASHSET_ALREADY_EXISTS) { return; } @@ -18400,7 +18482,6 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten } const int n0 = cgraph->n_nodes; - UNUSED(n0); ggml_visit_parents(cgraph, tensor); @@ -18436,7 +18517,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size); for (int i = 0; i < gf->n_nodes; i++) { if (gf->grads[i]) { - ggml_hash_insert(zero_table, gf->grads[i]); + ggml_hash_insert(&zero_table, gf->grads[i]); } } @@ -18446,7 +18527,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * // inplace operations to add gradients are not created by ggml_compute_backward // use allocator to automatically make inplace operations if (node->grad) { - ggml_compute_backward(ctx, node, zero_table); + ggml_compute_backward(ctx, node, &zero_table); } } @@ -18459,16 +18540,29 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * } } - ggml_hash_set_free(zero_table); + ggml_hash_set_free(&zero_table); +} + +static void * incr_ptr_aligned(void ** p, size_t size, size_t align) { + void * ptr = *p; + ptr = (void *) GGML_PAD((uintptr_t) ptr, align); + *p = (void *) ((char *) ptr + size); + return ptr; } static size_t ggml_graph_nbytes(size_t size, bool grads) { - size_t nbytes = sizeof(struct ggml_cgraph); - nbytes += size * sizeof(struct ggml_tensor *) * 2; // leafs + nodes + size_t hash_size = ggml_hash_size(size * 2); + void * p = 0; + incr_ptr_aligned(&p, sizeof(struct ggml_cgraph), 1); + incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // nodes + incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // leafs + incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // hash keys if (grads) { - nbytes += size * sizeof(struct ggml_tensor *); // grads + incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grads } - nbytes += ggml_hash_size(size * 2) * sizeof(struct ggml_tensor *); // hash set + incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t)); + + size_t nbytes = (size_t) p; return nbytes; } @@ -18485,19 +18579,19 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_GRAPH, obj_size); struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs); - struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1); - + // the size of the hash table is doubled since it needs to hold both nodes and leafs size_t hash_size = ggml_hash_size(size * 2); - struct ggml_tensor ** nodes_ptr = data_start; - struct ggml_tensor ** leafs_ptr = nodes_ptr + size; - struct ggml_tensor ** hash_keys_ptr = leafs_ptr + size; - struct ggml_tensor ** grads_ptr = grads ? hash_keys_ptr + hash_size : NULL; - // check that we allocated the correct amount of memory - assert(obj_size == (size_t) ( - (grads ? (char *)(grads_ptr + size) : (char *)(hash_keys_ptr + hash_size)) - (char *)cgraph)); + void * p = cgraph + 1; - memset(hash_keys_ptr, 0, hash_size * sizeof(struct ggml_tensor *)); + struct ggml_tensor ** nodes_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); + struct ggml_tensor ** leafs_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); + struct ggml_tensor ** hash_keys_ptr = incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); + struct ggml_tensor ** grads_ptr = grads ? incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL; + ggml_bitset_t * hash_used = incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t)); + + // check that we allocated the correct amount of memory + assert(obj_size == (size_t)((char *)p - (char *)cgraph)); *cgraph = (struct ggml_cgraph) { /*.size =*/ size, @@ -18506,10 +18600,12 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz /*.nodes =*/ nodes_ptr, /*.grads =*/ grads_ptr, /*.leafs =*/ leafs_ptr, - /*.hash_table =*/ { hash_size, hash_keys_ptr }, + /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr }, /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, }; + ggml_hash_set_reset(&cgraph->visited_hash_set); + return cgraph; } @@ -18525,7 +18621,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) /*.nodes =*/ cgraph0->nodes + i0, /*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL, /*.leafs =*/ NULL, - /*.hash_table =*/ { 0, NULL }, + /*.hash_table =*/ { 0, NULL, NULL }, /*.order =*/ cgraph0->order, }; @@ -18535,7 +18631,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) { GGML_ASSERT(dst->size >= src->n_leafs); GGML_ASSERT(dst->size >= src->n_nodes); - GGML_ASSERT(dst->visited_hash_table.size >= src->visited_hash_table.size); + GGML_ASSERT(dst->visited_hash_set.size >= src->visited_hash_set.size); dst->n_leafs = src->n_leafs; dst->n_nodes = src->n_nodes; @@ -18556,9 +18652,9 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) { } } - for (size_t i = 0; i < src->visited_hash_table.size; ++i) { - if (src->visited_hash_table.keys[i]) { - ggml_hash_insert(dst->visited_hash_table, src->visited_hash_table.keys[i]); + for (size_t i = 0; i < src->visited_hash_set.size; ++i) { + if (src->visited_hash_set.keys[i]) { + ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]); } } } @@ -18584,7 +18680,7 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { void ggml_graph_clear(struct ggml_cgraph * cgraph) { cgraph->n_leafs = 0; cgraph->n_nodes = 0; - memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *)); + ggml_hash_set_reset(&cgraph->visited_hash_set); } // @@ -18779,7 +18875,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { n_tasks = MIN(ggml_nrows(node), n_threads); } break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } break; case GGML_OP_SILU_BACK: @@ -18906,8 +19002,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_COUNT: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } default: { fprintf(stderr, "%s: op not implemented: ", __func__); @@ -18916,8 +19012,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } else { fprintf(stderr, "%d\n", node->op); } - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } assert(n_tasks > 0); @@ -19027,7 +19123,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa cur += sizeof(float)*ne00*ne01*ne02; cur += sizeof(float)*ne10*ne11; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_OP_CONV_TRANSPOSE_2D: @@ -19073,8 +19169,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa } break; case GGML_OP_COUNT: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } default: break; } @@ -20308,9 +20404,9 @@ static enum ggml_opt_result linesearch_backtracking( (*step) *= width; } - GGML_ASSERT(false && "line search failed"); + GGML_ABORT("line search failed"); - return GGML_LINESEARCH_FAIL; + //return GGML_LINESEARCH_FAIL; } static enum ggml_opt_result ggml_opt_lbfgs( @@ -20578,9 +20674,9 @@ static enum ggml_opt_result ggml_opt_lbfgs( step[0] = 1.0; } - GGML_ASSERT(false && "lbfgs failed"); + GGML_ABORT("lbfgs failed"); - return GGML_OPT_RESULT_DID_NOT_CONVERGE; + //return GGML_OPT_RESULT_DID_NOT_CONVERGE; } struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { @@ -20925,7 +21021,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_BF16: { size_t elemsize = sizeof(ggml_bf16_t); - ggml_fp32_to_bf16_row(src + start, (ggml_bf16_t *)dst + start, n); + ggml_fp32_to_bf16_row_ref(src + start, (ggml_bf16_t *)dst + start, n); result = n * elemsize; } break; case GGML_TYPE_F32: @@ -21283,10 +21379,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p } } break; case GGUF_TYPE_ARRAY: - default: GGML_ASSERT(false && "invalid type"); break; + default: GGML_ABORT("invalid type"); } } break; - default: GGML_ASSERT(false && "invalid type"); + default: GGML_ABORT("invalid type"); } if (!ok) { @@ -21453,7 +21549,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ctx->infos[i].ne[3], }; - struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne); + int n_dims = ctx->infos[i].n_dims; + if (n_dims == 0 || n_dims > 4) { + n_dims = 4; + for (; n_dims > 1; --n_dims) if (ne[n_dims-1] > 1) break; + } + struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, n_dims, ne); ok = ok && cur != NULL; @@ -21867,12 +21968,12 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) { gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n); GGML_FREE((void *)data); } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) { - GGML_ASSERT(false && "nested arrays not supported"); + GGML_ABORT("nested arrays not supported"); } else { gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n); } } break; - default: GGML_ASSERT(false && "invalid type"); break; + default: GGML_ABORT("invalid type"); } } } @@ -21881,7 +21982,7 @@ void gguf_add_tensor( struct gguf_context * ctx, const struct ggml_tensor * tensor) { if (gguf_find_tensor(ctx, tensor->name) != -1) { - GGML_ASSERT(false && "duplicated tensor name"); + GGML_ABORT("duplicated tensor name"); } const int idx = ctx->header.n_tensors; @@ -21914,7 +22015,7 @@ void gguf_add_tensor( void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) { const int idx = gguf_find_tensor(ctx, name); if (idx < 0) { - GGML_ASSERT(false && "tensor not found"); + GGML_ABORT("tensor not found"); } ctx->infos[idx].type = type; @@ -21923,7 +22024,7 @@ void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggm void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) { const int idx = gguf_find_tensor(ctx, name); if (idx < 0) { - GGML_ASSERT(false && "tensor not found"); + GGML_ABORT("tensor not found"); } ctx->infos[idx].data = data; @@ -22052,10 +22153,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * } } break; case GGUF_TYPE_ARRAY: - default: GGML_ASSERT(false && "invalid type"); break; + default: GGML_ABORT("invalid type"); } } break; - default: GGML_ASSERT(false && "invalid type"); + default: GGML_ABORT("invalid type"); } } @@ -22116,7 +22217,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) { FILE * file = ggml_fopen(fname, "wb"); if (!file) { - GGML_ASSERT(false && "failed to open file for writing"); + GGML_ABORT("failed to open file for writing"); } struct gguf_buf buf = gguf_buf_init(16*1024); diff --git a/ggml/src/vulkan-shaders/CMakeLists.txt b/ggml/src/vulkan-shaders/CMakeLists.txt index 41551e00..10075db3 100644 --- a/ggml/src/vulkan-shaders/CMakeLists.txt +++ b/ggml/src/vulkan-shaders/CMakeLists.txt @@ -1,5 +1,7 @@ +find_package (Threads REQUIRED) set(TARGET vulkan-shaders-gen) add_executable(${TARGET} vulkan-shaders-gen.cpp) install(TARGETS ${TARGET} RUNTIME) target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads) diff --git a/ggml/src/vulkan-shaders/add.comp b/ggml/src/vulkan-shaders/add.comp index 8475b011..3974845d 100644 --- a/ggml/src/vulkan-shaders/add.comp +++ b/ggml/src/vulkan-shaders/add.comp @@ -4,9 +4,11 @@ #include "generic_binary_head.comp" void main() { - if (gl_GlobalInvocationID.x >= p.ne) { + const uint idx = get_idx(); + + if (idx >= p.ne) { return; } - data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) + FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)])); + data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[src1_idx(idx)])); } diff --git a/ggml/src/vulkan-shaders/clamp.comp b/ggml/src/vulkan-shaders/clamp.comp index ca272e22..7071302a 100644 --- a/ggml/src/vulkan-shaders/clamp.comp +++ b/ggml/src/vulkan-shaders/clamp.comp @@ -4,10 +4,12 @@ #include "generic_unary_head.comp" void main() { - if (gl_GlobalInvocationID.x >= p.ne) { + const uint idx = get_idx(); + + if (idx >= p.ne) { return; } - const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]); - data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val)); + const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); + data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val)); } diff --git a/ggml/src/vulkan-shaders/concat.comp b/ggml/src/vulkan-shaders/concat.comp new file mode 100644 index 00000000..08ab5514 --- /dev/null +++ b/ggml/src/vulkan-shaders/concat.comp @@ -0,0 +1,35 @@ +#version 450 + +#include "types.comp" +#include "generic_binary_head.comp" + +void main() { + const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; + const int dim = p.param3; + + if (idx >= p.ne) { + return; + } + + const uint i3 = idx / (p.ne22*p.ne21*p.ne20); + const uint i3_offset = i3 * p.ne22*p.ne21*p.ne20; + const uint i2 = (idx - i3_offset) / (p.ne21*p.ne20); + const uint i2_offset = i2*p.ne21*p.ne20; + const uint i1 = (idx - i3_offset - i2_offset) / p.ne20; + const uint i0 = idx - i3_offset - i2_offset - i1*p.ne20; + + uint o[4] = {0, 0, 0, 0}; + o[dim] = dim == 0 ? p.ne00 : (dim == 1 ? p.ne01 : (dim == 2 ? p.ne02 : p.ne03)); + + const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00; + const uint src1_idx = (i3 - o[3])*p.nb13 + (i2 - o[2])*p.nb12 + (i1 - o[1])*p.nb11 + (i0 - o[0])*p.nb10; + const uint dst_idx = i3*p.nb23 + i2*p.nb22 + i1*p.nb21 + i0*p.nb20; + + const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03; + +#ifndef OPTIMIZATION_ERROR_WORKAROUND + data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]); +#else + data_d[p.d_offset + dst_idx] = is_src0 ? data_a[src0_idx] : data_b[src1_idx]; +#endif +} diff --git a/ggml/src/vulkan-shaders/copy.comp b/ggml/src/vulkan-shaders/copy.comp index efb55876..c26917c0 100644 --- a/ggml/src/vulkan-shaders/copy.comp +++ b/ggml/src/vulkan-shaders/copy.comp @@ -4,13 +4,15 @@ #include "generic_unary_head.comp" void main() { - if (gl_GlobalInvocationID.x >= p.ne) { + const uint idx = get_idx(); + + if (idx >= p.ne) { return; } #ifndef OPTIMIZATION_ERROR_WORKAROUND - data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]); + data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]); #else - data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = data_a[src0_idx(gl_GlobalInvocationID.x)]; + data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)]; #endif } diff --git a/ggml/src/vulkan-shaders/div.comp b/ggml/src/vulkan-shaders/div.comp index 8ee4bfc7..8cfce58b 100644 --- a/ggml/src/vulkan-shaders/div.comp +++ b/ggml/src/vulkan-shaders/div.comp @@ -4,9 +4,11 @@ #include "generic_binary_head.comp" void main() { - if (gl_GlobalInvocationID.x >= p.ne) { + const uint idx = get_idx(); + + if (idx >= p.ne) { return; } - data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) / FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)])); + data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) / FLOAT_TYPE(data_b[src1_idx(idx)])); } diff --git a/ggml/src/vulkan-shaders/gelu.comp b/ggml/src/vulkan-shaders/gelu.comp index 9fe807cc..4cc7a68c 100644 --- a/ggml/src/vulkan-shaders/gelu.comp +++ b/ggml/src/vulkan-shaders/gelu.comp @@ -13,7 +13,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; void main() { const float GELU_COEF_A = 0.044715f; const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; - const uint i = gl_GlobalInvocationID.x; + const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; if (i >= p.KX) { return; diff --git a/ggml/src/vulkan-shaders/gelu_quick.comp b/ggml/src/vulkan-shaders/gelu_quick.comp new file mode 100644 index 00000000..e6e6fcfd --- /dev/null +++ b/ggml/src/vulkan-shaders/gelu_quick.comp @@ -0,0 +1,23 @@ +#version 450 + +#include "generic_head.comp" +#include "types.comp" + +#extension GL_EXT_control_flow_attributes : enable + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; + +void main() { + const float GELU_QUICK_COEF = -1.702f; + const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; + + if (i >= p.KX) { + return; + } + + const float x = float(data_a[i]); + data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x)))); +} diff --git a/ggml/src/vulkan-shaders/generic_binary_head.comp b/ggml/src/vulkan-shaders/generic_binary_head.comp index ab45d256..b6beaff1 100644 --- a/ggml/src/vulkan-shaders/generic_binary_head.comp +++ b/ggml/src/vulkan-shaders/generic_binary_head.comp @@ -7,7 +7,7 @@ layout (push_constant) uniform parameter uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23; uint d_offset; - float param1; float param2; + float param1; float param2; int param3; } p; layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; @@ -16,6 +16,10 @@ layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; +uint get_idx() { + return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; +} + uint src0_idx(uint idx) { const uint i03 = idx / (p.ne02*p.ne01*p.ne00); const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; diff --git a/ggml/src/vulkan-shaders/generic_unary_head.comp b/ggml/src/vulkan-shaders/generic_unary_head.comp index de08de7c..eacdefc7 100644 --- a/ggml/src/vulkan-shaders/generic_unary_head.comp +++ b/ggml/src/vulkan-shaders/generic_unary_head.comp @@ -14,6 +14,10 @@ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; +uint get_idx() { + return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; +} + uint src0_idx(uint idx) { const uint i03 = idx / (p.ne02*p.ne01*p.ne00); const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; diff --git a/ggml/src/vulkan-shaders/group_norm.comp b/ggml/src/vulkan-shaders/group_norm.comp new file mode 100644 index 00000000..5ad9b28d --- /dev/null +++ b/ggml/src/vulkan-shaders/group_norm.comp @@ -0,0 +1,66 @@ +#version 450 + +#include "generic_head.comp" +#include "types.comp" + +#extension GL_EXT_control_flow_attributes : enable +#define BLOCK_SIZE 512 + +layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; + +shared float tmp[BLOCK_SIZE]; + +void main() { + const uint group_size = p.KX; + const float eps = p.param1; + + const uint tid = gl_LocalInvocationID.x; + const uint start = gl_WorkGroupID.x * group_size + tid; + const uint end = start + group_size; + + tmp[tid] = 0.0f; + + // Calculate mean + [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) { + tmp[tid] += float(data_a[col]); + } + + // tmp up partial tmps and write back result + barrier(); + [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(); + } + + const float mean = tmp[0] / group_size; + barrier(); + tmp[tid] = 0.0f; + + // Calculate variance + [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) { + const float xi = float(data_a[col]) - mean; + data_d[col] = D_TYPE(xi); + tmp[tid] += xi * xi; + } + + // sum up partial sums and write back result + barrier(); + [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(); + } + + const float variance = tmp[0] / group_size; + const float scale = inversesqrt(variance + eps); + + [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) { + data_d[col] *= D_TYPE(scale); + } +} diff --git a/ggml/src/vulkan-shaders/im2col.comp b/ggml/src/vulkan-shaders/im2col.comp new file mode 100644 index 00000000..4d48610a --- /dev/null +++ b/ggml/src/vulkan-shaders/im2col.comp @@ -0,0 +1,57 @@ +#version 450 + +#extension GL_EXT_shader_16bit_storage : require + +layout (push_constant) uniform parameter +{ + uint batch_offset; uint offset_delta; + uint IC; + uint IW; uint IH; + uint OW; uint OH; + uint KW; uint KH; + uint pelements; + uint CHW; + int s0; int s1; + int p0; int p1; + int d0; int d1; +} p; + +#include "types.comp" + +#define BLOCK_SIZE 256 + +layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; + +void main() { + const uint i = gl_GlobalInvocationID.x; + if (i >= p.pelements) { + return; + } + + const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1); + const uint kx = i / ksize; + const uint kd = kx * ksize; + const uint ky = (i - kd) / p.OW; + const uint ix = i % p.OW; + + const uint oh = gl_GlobalInvocationID.y; + const uint batch = gl_GlobalInvocationID.z / p.IC; + const uint ic = gl_GlobalInvocationID.z % p.IC; + + const uint iiw = ix * p.s0 + kx * p.d0 - p.p0; + const uint iih = oh * p.s1 + ky * p.d1 - p.p1; + + const uint offset_dst = + ((batch * p.OH + oh) * p.OW + ix) * p.CHW + + (ic * (p.KW * p.KH) + ky * p.KW + kx); + + if (iih < 0 || iih >= p.IH || iiw < 0 || iiw >= p.IW) { + data_d[offset_dst] = D_TYPE(0.0f); + } else { + const uint offset_src = ic * p.offset_delta + batch * p.batch_offset; + data_d[offset_dst] = D_TYPE(data_a[offset_src + iih * p.IW + iiw]); + } +} diff --git a/ggml/src/vulkan-shaders/leaky_relu.comp b/ggml/src/vulkan-shaders/leaky_relu.comp new file mode 100644 index 00000000..d90a99ae --- /dev/null +++ b/ggml/src/vulkan-shaders/leaky_relu.comp @@ -0,0 +1,22 @@ +#version 450 + +#include "generic_head.comp" +#include "types.comp" + +#extension GL_EXT_control_flow_attributes : enable + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; + +void main() { + const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; + + if (i >= p.KX) { + return; + } + + const float val = float(data_a[i]); + data_d[i] = D_TYPE(max(val, 0.0f) + min(val, 0.0f) * p.param1); +} diff --git a/ggml/src/vulkan-shaders/mul.comp b/ggml/src/vulkan-shaders/mul.comp index bbb0aa1d..bfb61c92 100644 --- a/ggml/src/vulkan-shaders/mul.comp +++ b/ggml/src/vulkan-shaders/mul.comp @@ -4,9 +4,11 @@ #include "generic_binary_head.comp" void main() { - if (gl_GlobalInvocationID.x >= p.ne) { + const uint idx = get_idx(); + + if (idx >= p.ne) { return; } - data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)])); + data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(data_b[src1_idx(idx)])); } diff --git a/ggml/src/vulkan-shaders/mul_mat_vec.comp b/ggml/src/vulkan-shaders/mul_mat_vec.comp index 15d2a806..46a6369b 100644 --- a/ggml/src/vulkan-shaders/mul_mat_vec.comp +++ b/ggml/src/vulkan-shaders/mul_mat_vec.comp @@ -16,6 +16,13 @@ void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; const uint tid = gl_LocalInvocationID.x; + // There are not enough cols to use all threads + if (tid >= p.ncols) { + return; + } + + const uint block_size = min(p.ncols, BLOCK_SIZE); + uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); @@ -23,8 +30,8 @@ void main() { tmp[tid] = FLOAT_TYPE(0.0f); - [[unroll]] for (uint i = 0; i < p.ncols/BLOCK_SIZE; i += 2) { - const uint col = i*BLOCK_SIZE + 2*tid; + [[unroll]] for (uint i = 0; i < p.ncols/block_size; i += 2) { + const uint col = i*block_size + 2*tid; const uint ib = (row*p.ncols + col)/QUANT_K; // block index const uint iqs = (col%QUANT_K)/QUANT_R; // quant index const uint iybs = col - col%QUANT_K; // y block start index @@ -38,7 +45,7 @@ void main() { // sum up partial sums and write back result barrier(); - [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) { + [[unroll]] for (uint s = block_size/2; s > 0; s >>= 1) { if (tid < s) { tmp[tid] += tmp[tid + s]; } diff --git a/ggml/src/vulkan-shaders/norm.comp b/ggml/src/vulkan-shaders/norm.comp index 803dbdcb..6627a50b 100644 --- a/ggml/src/vulkan-shaders/norm.comp +++ b/ggml/src/vulkan-shaders/norm.comp @@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; shared vec2 sum[BLOCK_SIZE]; void main() { - const uint row = gl_WorkGroupID.x; + const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; const uint tid = gl_LocalInvocationID.x; sum[tid] = vec2(0.0f, 0.0f); diff --git a/ggml/src/vulkan-shaders/pad.comp b/ggml/src/vulkan-shaders/pad.comp new file mode 100644 index 00000000..a465cd52 --- /dev/null +++ b/ggml/src/vulkan-shaders/pad.comp @@ -0,0 +1,26 @@ +#version 450 + +#include "types.comp" +#include "generic_unary_head.comp" + +void main() { + const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; + + if (idx >= p.ne) { + return; + } + + const uint i3 = idx / (p.ne12*p.ne11*p.ne10); + const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10; + const uint i2 = (idx - i3_offset) / (p.ne11*p.ne10); + const uint i2_offset = i2*p.ne11*p.ne10; + const uint i1 = (idx - i3_offset - i2_offset) / p.ne10; + const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10; + + const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00; + const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10; + + const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03; + + data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f); +} diff --git a/ggml/src/vulkan-shaders/relu.comp b/ggml/src/vulkan-shaders/relu.comp index 7e5baa5b..52a19b62 100644 --- a/ggml/src/vulkan-shaders/relu.comp +++ b/ggml/src/vulkan-shaders/relu.comp @@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; void main() { - const uint i = gl_GlobalInvocationID.x; + const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; if (i >= p.KX) { return; diff --git a/ggml/src/vulkan-shaders/rms_norm.comp b/ggml/src/vulkan-shaders/rms_norm.comp index cfd08d34..b554400b 100644 --- a/ggml/src/vulkan-shaders/rms_norm.comp +++ b/ggml/src/vulkan-shaders/rms_norm.comp @@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; shared FLOAT_TYPE sum[BLOCK_SIZE]; void main() { - const uint row = gl_WorkGroupID.x; + const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; const uint tid = gl_LocalInvocationID.x; sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp diff --git a/ggml/src/vulkan-shaders/scale.comp b/ggml/src/vulkan-shaders/scale.comp index 510cb723..5cd2f668 100644 --- a/ggml/src/vulkan-shaders/scale.comp +++ b/ggml/src/vulkan-shaders/scale.comp @@ -4,9 +4,11 @@ #include "generic_unary_head.comp" void main() { - if (gl_GlobalInvocationID.x >= p.ne) { + const uint idx = get_idx(); + + if (idx >= p.ne) { return; } - data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(p.param1)); + data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1)); } diff --git a/ggml/src/vulkan-shaders/silu.comp b/ggml/src/vulkan-shaders/silu.comp index 15920f06..4d36f88e 100644 --- a/ggml/src/vulkan-shaders/silu.comp +++ b/ggml/src/vulkan-shaders/silu.comp @@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; void main() { - const uint i = gl_GlobalInvocationID.x; + const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; if (i >= p.KX) { return; diff --git a/ggml/src/vulkan-shaders/soft_max.comp b/ggml/src/vulkan-shaders/soft_max.comp index 1b8419c7..0bd51eca 100644 --- a/ggml/src/vulkan-shaders/soft_max.comp +++ b/ggml/src/vulkan-shaders/soft_max.comp @@ -28,7 +28,7 @@ shared FLOAT_TYPE vals[BLOCK_SIZE]; void main() { const uint tid = gl_LocalInvocationID.x; - const uint rowx = gl_WorkGroupID.x; + const uint rowx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; const uint rowy = rowx % p.KY; float slope = 1.0f; diff --git a/ggml/src/vulkan-shaders/square.comp b/ggml/src/vulkan-shaders/square.comp index 8dd19333..1fa118c9 100644 --- a/ggml/src/vulkan-shaders/square.comp +++ b/ggml/src/vulkan-shaders/square.comp @@ -4,10 +4,12 @@ #include "generic_unary_head.comp" void main() { - if (gl_GlobalInvocationID.x >= p.ne) { + const uint idx = get_idx(); + + if (idx >= p.ne) { return; } - const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]); - data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val * val); + const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); + data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val); } diff --git a/ggml/src/vulkan-shaders/sum_rows.comp b/ggml/src/vulkan-shaders/sum_rows.comp index ce2f1e2f..961e5ffa 100644 --- a/ggml/src/vulkan-shaders/sum_rows.comp +++ b/ggml/src/vulkan-shaders/sum_rows.comp @@ -14,7 +14,7 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32; shared FLOAT_TYPE tmp[BLOCK_SIZE]; void main() { - const uint row = gl_WorkGroupID.x; + const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; const uint col = gl_LocalInvocationID.x; tmp[col] = FLOAT_TYPE(0.0f); diff --git a/ggml/src/vulkan-shaders/tanh.comp b/ggml/src/vulkan-shaders/tanh.comp new file mode 100644 index 00000000..74630dc7 --- /dev/null +++ b/ggml/src/vulkan-shaders/tanh.comp @@ -0,0 +1,21 @@ +#version 450 + +#include "generic_head.comp" +#include "types.comp" + +#extension GL_EXT_control_flow_attributes : enable + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; + +void main() { + const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; + + if (i >= p.KX) { + return; + } + + data_d[i] = D_TYPE(tanh(data_a[i])); +} diff --git a/ggml/src/vulkan-shaders/timestep_embedding.comp b/ggml/src/vulkan-shaders/timestep_embedding.comp new file mode 100644 index 00000000..79e065a9 --- /dev/null +++ b/ggml/src/vulkan-shaders/timestep_embedding.comp @@ -0,0 +1,41 @@ +#version 450 + +#extension GL_EXT_shader_16bit_storage : require + +layout (push_constant) uniform parameter +{ + uint nb1; + uint dim; + uint max_period; +} p; + +#include "types.comp" + +#extension GL_EXT_control_flow_attributes : enable +#define BLOCK_SIZE 256 + +layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; + +void main() { + const uint i = gl_WorkGroupID.y; + const uint j = gl_GlobalInvocationID.x; + const uint d_offset = i * p.nb1; + + if (p.dim % 2 != 0 && j == ((p.dim + 1) / 2)) { + data_d[d_offset + p.dim] = 0.f; + } + + const uint half_dim = p.dim / 2; + if (j >= half_dim) { + return; + } + + const float timestep = float(data_a[i]); + const float freq = float(exp(-log(p.max_period) * j / half_dim)); + const float arg = timestep * freq; + data_d[d_offset + j] = D_TYPE(cos(arg)); + data_d[d_offset + j + half_dim] = D_TYPE(sin(arg)); +} diff --git a/ggml/src/vulkan-shaders/types.comp b/ggml/src/vulkan-shaders/types.comp index d24c172c..21dce72f 100644 --- a/ggml/src/vulkan-shaders/types.comp +++ b/ggml/src/vulkan-shaders/types.comp @@ -6,7 +6,7 @@ #define QUANT_K 1 #define QUANT_R 1 -#ifndef LOAD_VEC_A +#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1 #define A_TYPE float #elif LOAD_VEC_A == 4 #define A_TYPE vec4 @@ -19,7 +19,7 @@ #define QUANT_K 1 #define QUANT_R 1 -#ifndef LOAD_VEC_A +#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1 #define A_TYPE float16_t #elif LOAD_VEC_A == 4 #define A_TYPE f16vec4 diff --git a/ggml/src/vulkan-shaders/upscale.comp b/ggml/src/vulkan-shaders/upscale.comp new file mode 100644 index 00000000..511a086e --- /dev/null +++ b/ggml/src/vulkan-shaders/upscale.comp @@ -0,0 +1,36 @@ +#version 450 + +layout (push_constant) uniform parameter +{ + uint ne; uint d_offset; + uint nb00; uint nb01; uint nb02; uint nb03; + uint ne10; uint ne11; uint ne12; uint ne13; + float sf0; float sf1; float sf2; float sf3; +} p; + +#include "types.comp" + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; + +void main() { + const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; + + if (idx >= p.ne) { + return; + } + + const uint i10 = idx % p.ne10; + const uint i11 = (idx / p.ne10) % p.ne11; + const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12; + const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13; + + const uint i00 = uint(i10 / p.sf0); + const uint i01 = uint(i11 / p.sf1); + const uint i02 = uint(i12 / p.sf2); + const uint i03 = uint(i13 / p.sf3); + + data_d[p.d_offset + idx] = D_TYPE(data_a[i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]); +} diff --git a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp index c5be3754..a792e203 100644 --- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp @@ -22,6 +22,7 @@ #ifdef _WIN32 #include <windows.h> #include <direct.h> // For _mkdir on Windows + #include <algorithm> // For std::replace on w64devkit #else #include <unistd.h> #include <sys/wait.h> @@ -179,11 +180,7 @@ bool string_ends_with(const std::string& str, const std::string& suffix) { return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin()); } -#ifdef _WIN32 - static const char path_separator = '\\'; -#else - static const char path_separator = '/'; -#endif +static const char path_separator = '/'; std::string join_paths(const std::string& path1, const std::string& path2) { return path1 + path_separator + path2; @@ -198,7 +195,11 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const std::string out_fname = join_paths(output_dir, name + ".spv"); std::string in_path = join_paths(input_dir, in_fname); - std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname}; + #ifdef _WIN32 + std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""}; + #else + std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname}; + #endif for (const auto& define : defines) { cmd.push_back("-D" + define.first + "=" + define.second); } @@ -269,9 +270,12 @@ void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmu for (const auto& tname : type_names) { std::string data_a_key = "DATA_A_" + to_uppercase(tname); + // For unaligned, load one at a time for f32/f16, or two at a time for quants + std::string load_vec_a_unaligned = (tname == "f32" || tname == "f16") ? "1" : "2"; + // For aligned matmul loads std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2"; tasks.push_back(std::async(std::launch::async, [=] { - string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16); + string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16); })); tasks.push_back(std::async(std::launch::async, [=] { string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16); @@ -341,6 +345,9 @@ void process_shaders(std::vector<std::future<void>>& tasks) { string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); })); tasks.push_back(std::async(std::launch::async, [=] { + string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); + })); + tasks.push_back(std::async(std::launch::async, [=] { string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); })); @@ -357,6 +364,9 @@ void process_shaders(std::vector<std::future<void>>& tasks) { tasks.push_back(std::async(std::launch::async, [] { string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); })); + tasks.push_back(std::async(std::launch::async, [] { + string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}}); + })); tasks.push_back(std::async(std::launch::async, [] { string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {}); @@ -383,14 +393,41 @@ void process_shaders(std::vector<std::future<void>>& tasks) { })); tasks.push_back(std::async(std::launch::async, [] { + string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + })); + + tasks.push_back(std::async(std::launch::async, [] { + string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}); + })); + tasks.push_back(std::async(std::launch::async, [] { + string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}); + })); + tasks.push_back(std::async(std::launch::async, [] { + string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}}); + })); + + tasks.push_back(std::async(std::launch::async, [] { + string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}); + })); + + tasks.push_back(std::async(std::launch::async, [] { string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); })); tasks.push_back(std::async(std::launch::async, [] { + string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + })); + tasks.push_back(std::async(std::launch::async, [] { string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); })); tasks.push_back(std::async(std::launch::async, [] { string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); })); + tasks.push_back(std::async(std::launch::async, [] { + string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + })); + tasks.push_back(std::async(std::launch::async, [] { + string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + })); tasks.push_back(std::async(std::launch::async, [] { string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); @@ -424,6 +461,17 @@ void process_shaders(std::vector<std::future<void>>& tasks) { tasks.push_back(std::async(std::launch::async, [=] { string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); })); + + tasks.push_back(std::async(std::launch::async, [=] { + string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); + })); + tasks.push_back(std::async(std::launch::async, [=] { + string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}})); + })); + + tasks.push_back(std::async(std::launch::async, [=] { + string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); + })); } void write_output_files() { @@ -435,10 +483,16 @@ void write_output_files() { for (const auto& pair : shader_fnames) { const std::string& name = pair.first; - const std::string& path = pair.second; + #ifdef _WIN32 + std::string path = pair.second; + std::replace(path.begin(), path.end(), '/', '\\' ); + #else + const std::string& path = pair.second; + #endif + FILE* spv = fopen(path.c_str(), "rb"); if (!spv) { - std::cerr << "Error opening SPIR-V file: " << path << "\n"; + std::cerr << "Error opening SPIR-V file: " << path << " (" << strerror(errno) << ")\n"; continue; } @@ -450,7 +504,7 @@ void write_output_files() { size_t read_size = fread(data.data(), 1, size, spv); fclose(spv); if (read_size != size) { - std::cerr << "Error reading SPIR-V file: " << path << "\n"; + std::cerr << "Error reading SPIR-V file: " << path << " (" << strerror(errno) << ")\n"; continue; } |