Diffstat (limited to 'ggml')
-rw-r--r--  ggml/CMakeLists.txt | 12
-rw-r--r--  ggml/include/ggml-cuda.h | 3
-rw-r--r--  ggml/include/ggml-metal.h | 2
-rw-r--r--  ggml/include/ggml.h | 47
-rw-r--r--  ggml/src/CMakeLists.txt | 75
-rw-r--r--  ggml/src/ggml-aarch64.c | 30
-rw-r--r--  ggml/src/ggml-alloc.c | 42
-rw-r--r--  ggml/src/ggml-backend.c | 239
-rw-r--r--  ggml/src/ggml-blas.cpp | 3
-rw-r--r--  ggml/src/ggml-cann.cpp | 129
-rw-r--r--  ggml/src/ggml-cann/acl_tensor.cpp | 31
-rw-r--r--  ggml/src/ggml-cann/acl_tensor.h | 36
-rw-r--r--  ggml/src/ggml-cann/aclnn_ops.cpp | 244
-rw-r--r--  ggml/src/ggml-cann/kernels/CMakeLists.txt | 3
-rw-r--r--  ggml/src/ggml-cann/kernels/ascendc_kernels.h | 2
-rw-r--r--  ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp | 278
-rw-r--r--  ggml/src/ggml-common.h | 6
-rw-r--r--  ggml/src/ggml-cuda.cu | 91
-rw-r--r--  ggml/src/ggml-cuda/argsort.cu | 2
-rw-r--r--  ggml/src/ggml-cuda/binbcast.cu | 2
-rw-r--r--  ggml/src/ggml-cuda/common.cuh | 208
-rw-r--r--  ggml/src/ggml-cuda/convert.cu | 1
-rw-r--r--  ggml/src/ggml-cuda/cpy.cu | 4
-rw-r--r--  ggml/src/ggml-cuda/dmmv.cu | 23
-rw-r--r--  ggml/src/ggml-cuda/dmmv.cuh | 2
-rw-r--r--  ggml/src/ggml-cuda/fattn-common.cuh | 6
-rw-r--r--  ggml/src/ggml-cuda/fattn-tile-f16.cu | 2
-rw-r--r--  ggml/src/ggml-cuda/fattn-tile-f32.cu | 2
-rw-r--r--  ggml/src/ggml-cuda/fattn.cu | 10
-rw-r--r--  ggml/src/ggml-cuda/getrows.cu | 3
-rw-r--r--  ggml/src/ggml-cuda/iqk_mmvq.cu | 1
-rw-r--r--  ggml/src/ggml-cuda/mmq.cu | 2
-rw-r--r--  ggml/src/ggml-cuda/mmq.cuh | 4
-rw-r--r--  ggml/src/ggml-cuda/mmvq.cu | 6
-rw-r--r--  ggml/src/ggml-cuda/norm.cu | 9
-rw-r--r--  ggml/src/ggml-cuda/quantize.cu | 2
-rw-r--r--  ggml/src/ggml-cuda/rope.cu | 4
-rw-r--r--  ggml/src/ggml-cuda/vendors/cuda.h | 14
-rw-r--r--  ggml/src/ggml-cuda/vendors/hip.h | 177
-rw-r--r--  ggml/src/ggml-cuda/vendors/musa.h | 171
-rw-r--r--  ggml/src/ggml-impl.h | 126
-rw-r--r--  ggml/src/ggml-kompute.cpp | 8
-rw-r--r--  ggml/src/ggml-metal.m | 117
-rw-r--r--  ggml/src/ggml-quants.c | 60
-rw-r--r--  ggml/src/ggml-quants.h | 4
-rw-r--r--  ggml/src/ggml-rpc.cpp | 36
-rw-r--r--  ggml/src/ggml-sycl.cpp | 32
-rw-r--r--  ggml/src/ggml-sycl/backend.hpp | 2
-rw-r--r--  ggml/src/ggml-sycl/common.hpp | 2
-rw-r--r--  ggml/src/ggml-sycl/conv.cpp | 99
-rw-r--r--  ggml/src/ggml-sycl/conv.hpp | 21
-rw-r--r--  ggml/src/ggml-sycl/dmmv.cpp | 2
-rw-r--r--  ggml/src/ggml-sycl/dpct/helper.hpp | 21
-rw-r--r--  ggml/src/ggml-sycl/mmq.cpp | 22
-rw-r--r--  ggml/src/ggml-sycl/mmvq.cpp | 4
-rw-r--r--  ggml/src/ggml-sycl/norm.cpp | 9
-rw-r--r--  ggml/src/ggml-sycl/presets.hpp | 2
-rw-r--r--  ggml/src/ggml-sycl/rope.cpp | 4
-rw-r--r--  ggml/src/ggml-sycl/tsembd.cpp | 71
-rw-r--r--  ggml/src/ggml-sycl/tsembd.hpp | 21
-rw-r--r--  ggml/src/ggml-vulkan.cpp | 965
-rw-r--r--  ggml/src/ggml.c | 985
-rw-r--r--  ggml/src/vulkan-shaders/CMakeLists.txt | 2
-rw-r--r--  ggml/src/vulkan-shaders/add.comp | 6
-rw-r--r--  ggml/src/vulkan-shaders/clamp.comp | 8
-rw-r--r--  ggml/src/vulkan-shaders/concat.comp | 35
-rw-r--r--  ggml/src/vulkan-shaders/copy.comp | 8
-rw-r--r--  ggml/src/vulkan-shaders/div.comp | 6
-rw-r--r--  ggml/src/vulkan-shaders/gelu.comp | 2
-rw-r--r--  ggml/src/vulkan-shaders/gelu_quick.comp | 23
-rw-r--r--  ggml/src/vulkan-shaders/generic_binary_head.comp | 6
-rw-r--r--  ggml/src/vulkan-shaders/generic_unary_head.comp | 4
-rw-r--r--  ggml/src/vulkan-shaders/group_norm.comp | 66
-rw-r--r--  ggml/src/vulkan-shaders/im2col.comp | 57
-rw-r--r--  ggml/src/vulkan-shaders/leaky_relu.comp | 22
-rw-r--r--  ggml/src/vulkan-shaders/mul.comp | 6
-rw-r--r--  ggml/src/vulkan-shaders/mul_mat_vec.comp | 13
-rw-r--r--  ggml/src/vulkan-shaders/norm.comp | 2
-rw-r--r--  ggml/src/vulkan-shaders/pad.comp | 26
-rw-r--r--  ggml/src/vulkan-shaders/relu.comp | 2
-rw-r--r--  ggml/src/vulkan-shaders/rms_norm.comp | 2
-rw-r--r--  ggml/src/vulkan-shaders/scale.comp | 6
-rw-r--r--  ggml/src/vulkan-shaders/silu.comp | 2
-rw-r--r--  ggml/src/vulkan-shaders/soft_max.comp | 2
-rw-r--r--  ggml/src/vulkan-shaders/square.comp | 8
-rw-r--r--  ggml/src/vulkan-shaders/sum_rows.comp | 2
-rw-r--r--  ggml/src/vulkan-shaders/tanh.comp | 21
-rw-r--r--  ggml/src/vulkan-shaders/timestep_embedding.comp | 41
-rw-r--r--  ggml/src/vulkan-shaders/types.comp | 4
-rw-r--r--  ggml/src/vulkan-shaders/upscale.comp | 36
-rw-r--r--  ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp | 74
91 files changed, 3527 insertions, 1506 deletions
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 66505753..1bb3d1ee 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -50,9 +50,15 @@ else()
set(GGML_BLAS_VENDOR_DEFAULT "Generic")
endif()
+if (CMAKE_CROSSCOMPILING)
+ set(GGML_NATIVE_DEFAULT OFF)
+else()
+ set(GGML_NATIVE_DEFAULT ON)
+endif()
+
# general
option(GGML_STATIC "ggml: static link libraries" OFF)
-option(GGML_NATIVE "ggml: enable -march=native flag" ON)
+option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT})
option(GGML_LTO "ggml: enable link time optimization" OFF)
option(GGML_CCACHE "ggml: use ccache if available" ON)
@@ -70,7 +76,7 @@ option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF)
option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
# instruction set specific
-if (GGML_NATIVE)
+if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT)
set(INS_ENB OFF)
else()
set(INS_ENB ON)
@@ -108,6 +114,7 @@ option(GGML_LLAMAFILE "ggml: use LLAMAFILE"
option(GGML_IQK_MUL_MAT "ggml: use optimized iqk matrix multiplications" ON)
option(GGML_CUDA "ggml: use CUDA" OFF)
+option(GGML_MUSA "ggml: use MUSA" OFF)
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
@@ -197,6 +204,7 @@ set(GGML_PUBLIC_HEADERS
include/ggml-alloc.h
include/ggml-backend.h
include/ggml-blas.h
+ include/ggml-cann.h
include/ggml-cuda.h
include/ggml.h
include/ggml-kompute.h
diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h
index d7903c66..71bb6dcf 100644
--- a/ggml/include/ggml-cuda.h
+++ b/ggml/include/ggml-cuda.h
@@ -6,6 +6,9 @@
#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm"
#define GGML_CUBLAS_NAME "hipBLAS"
+#elif defined(GGML_USE_MUSA)
+#define GGML_CUDA_NAME "MUSA"
+#define GGML_CUBLAS_NAME "muBLAS"
#else
#define GGML_CUDA_NAME "CUDA"
#define GGML_CUBLAS_NAME "cuBLAS"
diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h
index 6c3226c3..d483cf1a 100644
--- a/ggml/include/ggml-metal.h
+++ b/ggml/include/ggml-metal.h
@@ -50,6 +50,8 @@ GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
+GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
+
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
// helper to check if the device supports a specific family
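
Note: the hunk above adds an abort-callback hook to the Metal backend. A minimal usage sketch, assuming the usual ggml_abort_callback signature (bool (*)(void *)) from ggml.h; the flag and wrapper below are illustrative and not part of the patch:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include "ggml-metal.h"

    static atomic_bool g_stop_requested = false;

    // Returning true asks the backend to abort the current graph computation.
    static bool my_abort_cb(void * user_data) {
        (void) user_data;
        return atomic_load(&g_stop_requested);
    }

    static void install_abort_cb(ggml_backend_t metal_backend) {
        ggml_backend_metal_set_abort_callback(metal_backend, my_abort_cb, /*user_data=*/NULL);
    }
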
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index a0bcc67f..b9b0284b 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -254,18 +254,8 @@
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
-#define GGML_ASSERT(x) \
- do { \
- if (!(x)) { \
- fflush(stdout); \
- fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
- ggml_print_backtrace(); \
- abort(); \
- } \
- } while (0)
-
#ifndef NDEBUG
-#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
+#define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
#elif defined(__GNUC__)
#define GGML_UNREACHABLE() __builtin_unreachable()
#elif defined(_MSC_VER)
@@ -274,6 +264,17 @@
#define GGML_UNREACHABLE() ((void) 0)
#endif
+#ifdef __cplusplus
+#define GGML_NORETURN [[noreturn]]
+#elif defined(_MSC_VER)
+#define GGML_NORETURN __declspec(noreturn)
+#else
+#define GGML_NORETURN _Noreturn
+#endif
+
+#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
+#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
+
// used to copy the number of elements and stride in bytes of tensors into local variables.
// main purpose is to reduce code duplication and improve readability.
//
@@ -322,6 +323,9 @@
extern "C" {
#endif
+ GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
+ GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
+
enum ggml_status {
GGML_STATUS_ALLOC_FAILED = -2,
GGML_STATUS_FAILED = -1,
@@ -345,6 +349,7 @@ extern "C" {
GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+ GGML_API void ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
struct ggml_object;
@@ -653,8 +658,11 @@ extern "C" {
GGML_CGRAPH_EVAL_ORDER_COUNT
};
+ typedef uint32_t ggml_bitset_t;
+
struct ggml_hash_set {
size_t size;
+ ggml_bitset_t * used;
struct ggml_tensor ** keys;
};
@@ -668,7 +676,7 @@ extern "C" {
struct ggml_tensor ** grads;
struct ggml_tensor ** leafs;
- struct ggml_hash_set visited_hash_table;
+ struct ggml_hash_set visited_hash_set;
enum ggml_cgraph_eval_order order;
};
@@ -715,8 +723,6 @@ extern "C" {
GGML_API int64_t ggml_cycles(void);
GGML_API int64_t ggml_cycles_per_ms(void);
- GGML_API void ggml_print_backtrace(void);
-
// accepts a UTF-8 path, even on Windows
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
@@ -1151,16 +1157,17 @@ extern "C" {
// group normalize along ne0*ne1*n_groups
// used in stable-diffusion
- // TODO: eps is hardcoded to 1e-6 for now
GGML_API struct ggml_tensor * ggml_group_norm(
struct ggml_context * ctx,
struct ggml_tensor * a,
- int n_groups);
+ int n_groups,
+ float eps);
GGML_API struct ggml_tensor * ggml_group_norm_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
- int n_groups);
+ int n_groups,
+ float eps);
// a - x
// b - dy
@@ -1467,7 +1474,6 @@ extern "C" {
// if mode & 2 == 1, GPT-NeoX style
//
// b is an int32 vector with size a->ne[2], it contains the positions
- // c is freq factors (e.g. phi3-128k), (optional)
GGML_API struct ggml_tensor * ggml_rope(
struct ggml_context * ctx,
struct ggml_tensor * a,
@@ -1484,6 +1490,7 @@ extern "C" {
int mode);
// custom RoPE
+ // c is freq factors (e.g. phi3-128k), (optional)
GGML_API struct ggml_tensor * ggml_rope_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
@@ -2022,8 +2029,8 @@ extern "C" {
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
- GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
- GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+ GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+ GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
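
Note: the ggml.h hunks above replace the old GGML_ASSERT macro with GGML_ABORT, which forwards __FILE__/__LINE__ plus a printf-style message to the new ggml_abort(). A minimal usage sketch; the helper function is illustrative, not part of the patch:

    #include <inttypes.h>
    #include "ggml.h"

    // Illustrative validation helper using the new formatted abort path.
    static void check_rows(const struct ggml_tensor * t, int64_t expected_rows) {
        GGML_ASSERT(t != NULL); // aborts with "GGML_ASSERT(t != NULL) failed"
        if (t->ne[1] != expected_rows) {
            // formatted message is printed by ggml_abort(), then abort() is called
            GGML_ABORT("unexpected row count: got %" PRId64 ", want %" PRId64,
                       t->ne[1], expected_rows);
        }
    }
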
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 0b1ad48a..be6c2cc6 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -139,6 +139,17 @@ if (GGML_METAL)
)
endif()
+if (GGML_MUSA)
+ set(CMAKE_C_COMPILER clang)
+ set(CMAKE_C_EXTENSIONS OFF)
+ set(CMAKE_CXX_COMPILER clang++)
+ set(CMAKE_CXX_EXTENSIONS OFF)
+
+ set(GGML_CUDA ON)
+
+ list(APPEND GGML_CDEF_PUBLIC GGML_USE_MUSA)
+endif()
+
if (GGML_OPENMP)
find_package(OpenMP)
if (OpenMP_FOUND)
@@ -147,6 +158,11 @@ if (GGML_OPENMP)
add_compile_definitions(GGML_USE_OPENMP)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+
+ if (GGML_MUSA)
+ set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} "/usr/lib/llvm-10/include/openmp")
+ set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} "/usr/lib/llvm-10/lib/libomp.so")
+ endif()
else()
message(WARNING "OpenMP not found")
endif()
@@ -257,11 +273,16 @@ endif()
if (GGML_CUDA)
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
- find_package(CUDAToolkit)
-
- set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_INCLUDES 0)
- set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_LIBRARIES 0)
- set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_OBJECTS 0)
+ if (GGML_MUSA)
+ list(APPEND CMAKE_MODULE_PATH "/usr/local/musa/cmake/")
+ find_package(MUSAToolkit)
+ set(CUDAToolkit_FOUND ${MUSAToolkit_FOUND})
+ else()
+ find_package(CUDAToolkit)
+ set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_INCLUDES 0)
+ set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_LIBRARIES 0)
+ set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_OBJECTS 0)
+ endif()
if (CUDAToolkit_FOUND)
message(STATUS "CUDA found")
@@ -280,7 +301,11 @@ if (GGML_CUDA)
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
- enable_language(CUDA)
+ if (GGML_MUSA)
+ set(CMAKE_CUDA_COMPILER ${MUSAToolkit_MCC_EXECUTABLE})
+ else()
+ enable_language(CUDA)
+ endif()
file(GLOB GGML_HEADERS_CUDA "ggml-cuda/*.cuh")
list(APPEND GGML_HEADERS_CUDA "../include/ggml-cuda.h")
@@ -344,21 +369,40 @@ if (GGML_CUDA)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
+ if (GGML_MUSA)
+ set_source_files_properties(${GGML_SOURCES_CUDA} PROPERTIES LANGUAGE CXX)
+ foreach(SOURCE ${GGML_SOURCES_CUDA})
+ set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_22")
+ endforeach()
+ endif()
+
if (GGML_STATIC)
if (WIN32)
# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
else ()
- set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+ if (GGML_MUSA)
+ set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart_static MUSA::mublas_static)
+ else()
+ set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+ endif()
endif()
else()
- set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
+ if (GGML_MUSA)
+ set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart MUSA::mublas)
+ else()
+ set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
+ endif()
endif()
if (GGML_CUDA_NO_VMM)
# No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
else()
- set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+ if (GGML_MUSA)
+ set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ...
+ else()
+ set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+ endif()
endif()
else()
message(WARNING "CUDA not found")
@@ -816,11 +860,6 @@ if (GGML_CANN)
${CANN_INSTALL_DIR}/acllib/include
)
- # TODO: find libs
- link_directories(
- ${CANN_INSTALL_DIR}/lib64
- )
-
add_subdirectory(ggml-cann/kernels)
list(APPEND CANN_LIBRARIES
ascendcl
@@ -839,6 +878,7 @@ if (GGML_CANN)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CANN_LIBRARIES} )
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS})
+ set(GGML_EXTRA_LIBDIRS ${GGML_EXTRA_LIBDIRS} ${CANN_INSTALL_DIR}/lib64)
list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN)
endif()
else()
@@ -869,8 +909,10 @@ function(get_flags CCID CCVER)
set(C_FLAGS -Wdouble-promotion)
set(CXX_FLAGS -Wno-array-bounds)
- if (CCVER VERSION_GREATER_EQUAL 7.1.0)
- list(APPEND CXX_FLAGS -Wno-format-truncation)
+ if (NOT GGML_MUSA)
+ if (CCVER VERSION_GREATER_EQUAL 7.1.0)
+ list(APPEND CXX_FLAGS -Wno-format-truncation)
+ endif()
endif()
if (CCVER VERSION_GREATER_EQUAL 8.1.0)
list(APPEND CXX_FLAGS -Wextra-semi)
@@ -1278,6 +1320,7 @@ endif()
target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC})
target_include_directories(ggml PUBLIC ../include)
target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
+target_link_directories(ggml PRIVATE ${GGML_EXTRA_LIBDIRS})
target_compile_features (ggml PRIVATE c_std_11) # don't bump
target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS})
diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c
index af53dea1..7adaadc9 100644
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -16,6 +16,8 @@
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Woverlength-strings"
+#elif defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#define UNUSED GGML_UNUSED
@@ -384,8 +386,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE)
- if (svcntw() == 8) {
- GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ if (ggml_sve_cnt_b == QK8_0) {
+ GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
@@ -496,8 +498,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE)
- if (svcntw() == 8) {
- GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ if (ggml_sve_cnt_b == QK8_0) {
+ GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
@@ -614,7 +616,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
- if (svcntw() == 8) {
+ if (ggml_sve_cnt_b == QK8_0) {
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
@@ -680,12 +682,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
return;
}
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
- GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
"performance");
}
else if (ggml_cpu_has_neon()) {
- GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+ GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
"quantization format for optimal performance");
}
@@ -745,8 +747,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
- if (svcntw() == 8) {
- GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ if (ggml_sve_cnt_b == QK8_0) {
+ GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
@@ -1266,8 +1268,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
- if (svcntw() == 8) {
- GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ if (ggml_sve_cnt_b == QK8_0) {
+ GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
@@ -1728,7 +1730,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
- if (svcntw() == 8) {
+ if (ggml_sve_cnt_b == QK8_0) {
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
@@ -2139,12 +2141,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
return;
}
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
- GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+ GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
"performance");
}
else if (ggml_cpu_has_neon()) {
- GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+ GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
"quantization format for optimal performance");
}
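
Note: the ggml-aarch64.c hunks above replace the runtime check svcntw() == 8 with ggml_sve_cnt_b == QK8_0, where ggml_sve_cnt_b is assumed to cache the SVE register width in bytes (svcntb()). The arithmetic sketch below only illustrates why the two checks are equivalent for a 256-bit SVE implementation:

    #include <stdio.h>

    int main(void) {
        const int QK8_0         = 32;              // elements per q8_0 block
        const int sve_bits      = 256;             // example: 256-bit SVE vectors
        const int words_per_vec = sve_bits / 32;   // what svcntw() would return -> 8
        const int bytes_per_vec = sve_bits / 8;    // what svcntb() would return -> 32
        printf("old check (words == 8):     %d\n", words_per_vec == 8);
        printf("new check (bytes == QK8_0): %d\n", bytes_per_vec == QK8_0);
        return 0;
    }
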
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index e176b883..e485326a 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -91,8 +91,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
- GGML_ASSERT(!"not enough space in the buffer");
- return;
+ GGML_ABORT("not enough space in the buffer");
}
void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
@@ -133,7 +132,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset,
return;
}
}
- GGML_ASSERT(!"out of allocated_tensors");
+ GGML_ABORT("out of allocated_tensors");
}
static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) {
@@ -142,8 +141,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
return;
}
}
- fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
- GGML_ASSERT(!"tensor not found");
+ GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
}
#endif
@@ -176,8 +174,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
// this should never happen
fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail);
- GGML_ASSERT(!"not enough space in the buffer");
- GGML_UNREACHABLE();
+ GGML_ABORT("not enough space in the buffer");
}
}
@@ -443,7 +440,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
}
}
- free(galloc->hash_set.keys);
+ ggml_hash_set_free(&galloc->hash_set);
free(galloc->hash_values);
free(galloc->bufts);
free(galloc->buffers);
@@ -456,7 +453,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
typedef struct ggml_gallocr * ggml_gallocr_t;
static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
- size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
+ size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
return &galloc->hash_values[i];
}
@@ -565,8 +562,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
// clear hash tables
- memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
- memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
+ ggml_hash_set_reset(&galloc->hash_set);
+ memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
// allocate leafs
// these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
@@ -671,21 +668,19 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
}
bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
- size_t hash_size = graph->visited_hash_table.size;
+ size_t min_hash_size = graph->n_nodes + graph->n_leafs;
+ // add 25% margin to avoid hash collisions
+ min_hash_size += min_hash_size / 4;
// initialize hash table
- if (galloc->hash_set.size < hash_size) {
- free(galloc->hash_set.keys);
- free(galloc->hash_values);
- galloc->hash_set.size = hash_size;
- galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
- galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
+ if (galloc->hash_set.size < min_hash_size) {
+ ggml_hash_set_free(&galloc->hash_set);
+ galloc->hash_set = ggml_hash_set_new(min_hash_size);
GGML_ASSERT(galloc->hash_set.keys != NULL);
+
+ free(galloc->hash_values);
+ galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
GGML_ASSERT(galloc->hash_values != NULL);
- } else {
- // reset hash table
- memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
- memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
}
// reset allocators
@@ -817,8 +812,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
}
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
- ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
- size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
+ size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
return talloc->size_max >= node_size;
}
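
Note: the allocator hunks above switch from hand-rolled calloc/memset of hash_set.keys to the ggml_hash_set_new/ggml_hash_set_reset/ggml_hash_set_free helpers and pass the set by pointer to ggml_hash_find_or_insert. A sketch of that usage pattern, with signatures inferred from the hunks (ggml-impl.h in this patch is the authoritative source; the wrappers are illustrative):

    #include "ggml.h"
    #include "ggml-impl.h" // internal hash-set helpers used by the allocator

    // Hash a tensor into the set and use the returned slot index to address a
    // parallel side array, as ggml_gallocr does with hash_values.
    static void track_tensor(struct ggml_hash_set * set, int * values, struct ggml_tensor * t) {
        size_t i = ggml_hash_find_or_insert(set, t);
        values[i] += 1;
    }

    static struct ggml_hash_set make_set(size_t n_tensors) {
        // mirror the allocator's 25% margin to reduce collisions
        return ggml_hash_set_new(n_tensors + n_tensors / 4);
    }
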
diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index d39cfed8..e1651cc6 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -351,15 +351,10 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
}
// an async copy would normally happen after all the queued operations on both backends are completed
- // sync src, set_async dst
- if (ggml_backend_buffer_is_host(src->buffer)) {
- ggml_backend_synchronize(backend_src);
- ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
- } else {
- ggml_backend_synchronize(backend_src);
- ggml_backend_tensor_copy(src, dst);
- ggml_backend_synchronize(backend_dst);
- }
+ // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
+ ggml_backend_synchronize(backend_src);
+ ggml_backend_synchronize(backend_dst);
+ ggml_backend_tensor_copy(src, dst);
}
// events
@@ -1055,11 +1050,10 @@ struct ggml_backend_sched {
ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
ggml_gallocr_t galloc;
- // hash keys of the nodes in the graph
- struct ggml_hash_set hash_set;
- // hash values
- int * tensor_backend_id;
- struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
+ // hash map of the nodes in the graph
+ struct ggml_hash_set hash_set;
+ int * hv_tensor_backend_ids; // [hash_set.size]
+ struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
int * node_backend_ids; // [graph_size]
int * leaf_backend_ids; // [graph_size]
@@ -1068,7 +1062,7 @@ struct ggml_backend_sched {
int * prev_leaf_backend_ids; // [graph_size]
// copy of the graph with modified inputs
- struct ggml_cgraph * graph;
+ struct ggml_cgraph graph;
// graph splits
struct ggml_backend_sched_split * splits;
@@ -1087,19 +1081,16 @@ struct ggml_backend_sched {
ggml_backend_sched_eval_callback callback_eval;
void * callback_eval_user_data;
- bool debug;
+ char * context_buffer;
+ size_t context_buffer_size;
- // align context_buffer to GGML_MEM_ALIGN
-#ifdef _MSC_VER
- __declspec(align(GGML_MEM_ALIGN))
-#else
- __attribute__((aligned(GGML_MEM_ALIGN)))
-#endif
- char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
+ bool debug;
};
-#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
-#define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]
+#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
+#define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
+#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
+#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
// returns the priority of the backend, lower id is higher priority
static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
@@ -1169,7 +1160,6 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
return cur_backend_id;
}
- // assign nodes that use weights to the backend of the weights
// operations with weights are preferably run on the same backend as the weights
for (int i = 0; i < GGML_MAX_SRC; i++) {
const struct ggml_tensor * src = tensor->src[i];
@@ -1275,7 +1265,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
sched->is_reset = false;
struct ggml_init_params params = {
- /* .mem_size = */ sizeof(sched->context_buffer),
+ /* .mem_size = */ sched->context_buffer_size,
/* .mem_buffer = */ sched->context_buffer,
/* .no_alloc = */ true
};
@@ -1284,39 +1274,43 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
sched->ctx = ggml_init(params);
if (sched->ctx == NULL) {
- fprintf(stderr, "%s: failed to initialize context\n", __func__);
- GGML_ASSERT(false);
+ GGML_ABORT("%s: failed to initialize context\n", __func__);
}
// pass 1: assign backends to ops with pre-allocated inputs
for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i];
int * leaf_backend_id = &tensor_backend_id(leaf);
- if (*leaf_backend_id != -1) {
- // do not overwrite user assignments
- continue;
+ // do not overwrite user assignments
+ if (*leaf_backend_id == -1) {
+ *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
}
- *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
}
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
int * node_backend_id = &tensor_backend_id(node);
- if (*node_backend_id != -1) {
- // do not overwrite user assignments
- continue;
- }
- *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
- // src
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * src = node->src[j];
- if (src == NULL) {
+ // do not overwrite user assignments
+ if (*node_backend_id == -1) {
+ *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
+
+#if 0
+ // src
+ if (node->op == GGML_OP_NONE) {
continue;
}
- int * src_backend_id = &tensor_backend_id(src);
- if (*src_backend_id == -1) {
- *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
+
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * src = node->src[j];
+ if (src == NULL) {
+ continue;
+ }
+ int * src_backend_id = &tensor_backend_id(src);
+ if (*src_backend_id == -1) {
+ *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
+ }
}
+#endif
}
}
@@ -1488,12 +1482,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
}
- // pass 4: split graph, find tensors that need to be copied
+ // pass 5: split graph, find tensors that need to be copied
{
int i_split = 0;
struct ggml_backend_sched_split * split = &sched->splits[0];
// find the backend of the first split, skipping view ops
- for (int i = 0; i < graph->n_nodes; i++) {
+ int i = 0;
+ for (; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
if (!ggml_is_view_op(node->op)) {
split->backend_id = tensor_backend_id(node);
@@ -1502,9 +1497,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
split->i_start = 0;
split->n_inputs = 0;
- memset(split->inputs, 0, sizeof(split->inputs)); //HACK
int cur_backend_id = split->backend_id;
- for (int i = 0; i < graph->n_nodes; i++) {
+ for (; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
if (ggml_is_view_op(node->op)) {
@@ -1513,7 +1507,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
const int node_backend_id = tensor_backend_id(node);
- GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
+ assert(node_backend_id != -1); // all nodes should be assigned by now
// check if we should start a new split based on the sources of the current node
bool need_new_split = false;
@@ -1527,7 +1521,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
// by starting a new split, the memory of the previously offloaded weights can be reused
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = tensor_backend_id(src);
- if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
+ if (src_backend_id != cur_backend_id) {
need_new_split = true;
break;
}
@@ -1536,9 +1530,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
// FIXME: count the number of inputs instead of only checking when full
if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
const size_t id = hash_id(src);
- int src_backend_id = sched->tensor_backend_id[id];
+ int src_backend_id = sched->hv_tensor_backend_ids[id];
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
- if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
+ if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
need_new_split = true;
break;
@@ -1570,12 +1564,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
continue;
}
- const int src_backend_id = tensor_backend_id(src);
+ size_t src_id = hash_id(src);
+ const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
assert(src_backend_id != -1); // all inputs should be assigned by now
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
- size_t id = hash_id(src);
- if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
+ if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
ggml_backend_t backend = sched->backends[src_backend_id];
for (int c = 0; c < sched->n_copies; c++) {
struct ggml_tensor * tensor_copy;
@@ -1589,7 +1583,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
ggml_set_input(tensor_copy);
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
}
- sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
+ tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
SET_CAUSE(tensor_copy, "4.cpy");
}
int n_graph_inputs = sched->n_graph_inputs++;
@@ -1598,11 +1592,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
}
- bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
- if (src_backend_id != cur_backend_id && !supported) {
+ if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
// create a copy of the input in the split's backend
- const size_t id = hash_id(src);
- if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
+ if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
ggml_backend_t backend = sched->backends[cur_backend_id];
for (int c = 0; c < sched->n_copies; c++) {
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
@@ -1611,14 +1603,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
ggml_set_input(tensor_copy);
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
}
- sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
+ tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
SET_CAUSE(tensor_copy, "4.cpy");
}
int n_inputs = split->n_inputs++;
GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
split->inputs[n_inputs] = src;
}
- node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
+ node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
}
}
}
@@ -1630,7 +1622,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
ggml_backend_sched_print_assignments(sched, graph);
}
- // swap node_backend_ids and leaf_backend_ids and prevs
+ // swap node_backend_ids and leaf _backend_ids with prevs
{
int * tmp = sched->node_backend_ids;
sched->node_backend_ids = sched->prev_node_backend_ids;
@@ -1641,9 +1633,19 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
sched->prev_leaf_backend_ids = tmp;
}
- // create copies of the graph for each split
- // TODO: avoid this copy
- struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
+ int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+ if (sched->graph.size < graph_size) {
+ sched->graph.size = graph_size;
+ sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
+ sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
+ GGML_ASSERT(sched->graph.nodes != NULL);
+ GGML_ASSERT(sched->graph.leafs != NULL);
+ }
+ sched->graph.n_nodes = 0;
+ sched->graph.n_leafs = 0;
+
+ struct ggml_cgraph * graph_copy = &sched->graph;
+
for (int i = 0; i < sched->n_splits; i++) {
struct ggml_backend_sched_split * split = &sched->splits[i];
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
@@ -1654,12 +1656,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
struct ggml_tensor * input = split->inputs[j];
const size_t input_id = hash_id(input);
- struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
+ struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
// add a dependency to the input source so that it is not freed before the copy is done
struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
input_dep->src[0] = input;
- sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
+ sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
// add a dependency to the input copy so that it is allocated at the start of the split
@@ -1681,7 +1683,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
size_t id = hash_id(input);
int backend_id = tensor_backend_id(input);
for (int c = 0; c < sched->n_copies; c++) {
- struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+ struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
}
@@ -1694,7 +1696,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
struct ggml_tensor * input = split->inputs[j];
size_t id = hash_id(input);
for (int c = 0; c < sched->n_copies; c++) {
- struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+ struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
}
@@ -1708,13 +1710,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
}
-
- sched->graph = graph_copy;
}
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
bool backend_ids_changed = false;
- for (int i = 0; i < sched->graph->n_nodes; i++) {
+ for (int i = 0; i < sched->graph.n_nodes; i++) {
if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
backend_ids_changed = true;
@@ -1722,7 +1722,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
}
}
if (!backend_ids_changed) {
- for (int i = 0; i < sched->graph->n_leafs; i++) {
+ for (int i = 0; i < sched->graph.n_leafs; i++) {
if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
backend_ids_changed = true;
@@ -1732,14 +1732,14 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
}
// allocate graph
- if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+ if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
// the re-allocation may cause the split inputs to be moved to a different address
ggml_backend_sched_synchronize(sched);
#ifndef NDEBUG
- fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
+ fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
#endif
- ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
- if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+ ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
+ if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
fprintf(stderr, "%s: failed to allocate graph\n", __func__);
return false;
}
@@ -1760,7 +1760,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
for (int j = 0; j < split->n_inputs; j++) {
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
struct ggml_tensor * input = split->inputs[j];
- struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
+ struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
if (input->flags & GGML_TENSOR_FLAG_INPUT) {
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
@@ -1777,7 +1777,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
} else {
ggml_backend_synchronize(split_backend);
}
- ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
+ // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
+ // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
+ if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+ ggml_backend_synchronize(input_backend);
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+ ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+ } else {
+ ggml_backend_synchronize(split_backend);
+ }
+ ggml_backend_tensor_copy(input, input_cpy);
+ }
}
}
@@ -1846,21 +1856,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
+ sched->n_backends = n_backends;
+ sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
// initialize hash table
- sched->hash_set = ggml_hash_set_new(graph_size);
- sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
- sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
+ // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
+ sched->hash_set = ggml_hash_set_new(graph_size);
+ sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
+ sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
- sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
- sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+ sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+ sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
- sched->n_backends = n_backends;
-
- sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
+ sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
+ sched->context_buffer = malloc(sched->context_buffer_size);
const int initial_splits_capacity = 16;
sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
@@ -1895,37 +1907,37 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
}
ggml_gallocr_free(sched->galloc);
ggml_free(sched->ctx);
+ ggml_hash_set_free(&sched->hash_set);
free(sched->splits);
- free(sched->hash_set.keys);
- free(sched->tensor_backend_id);
- free(sched->tensor_copies);
+ free(sched->hv_tensor_backend_ids);
+ free(sched->hv_tensor_copies);
free(sched->node_backend_ids);
free(sched->leaf_backend_ids);
free(sched->prev_node_backend_ids);
free(sched->prev_leaf_backend_ids);
+ free(sched->context_buffer);
+ free(sched->graph.nodes);
+ free(sched->graph.leafs);
free(sched);
}
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
// reset state for the next run
if (!sched->is_reset) {
- size_t hash_size = sched->hash_set.size;
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
- memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
- memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
-
+ ggml_hash_set_reset(&sched->hash_set);
+ memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
+ memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
sched->is_reset = true;
}
sched->is_alloc = false;
}
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
- GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
+ GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
ggml_backend_sched_split_graph(sched, measure_graph);
- // TODO: extract this to a separate function
- if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
+ if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
return false;
}
@@ -1936,10 +1948,11 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
}
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
- GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
+ GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
ggml_backend_sched_split_graph(sched, graph);
+
if (!ggml_backend_sched_alloc_splits(sched)) {
return false;
}
@@ -2009,6 +2022,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
tensor_backend_id(node) = backend_index;
SET_CAUSE(node, "usr");
+ sched->is_reset = false;
}
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
@@ -2051,9 +2065,9 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
GGML_ASSERT(src != NULL);
GGML_ASSERT(src->data && "graph must be allocated");
- size_t id = ggml_hash_insert(hash_set, src);
- if (id == GGML_HASHTABLE_ALREADY_EXISTS) {
- return node_copies[ggml_hash_find(hash_set, src)];
+ size_t id = ggml_hash_insert(&hash_set, src);
+ if (id == GGML_HASHSET_ALREADY_EXISTS) {
+ return node_copies[ggml_hash_find(&hash_set, src)];
}
struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
@@ -2078,7 +2092,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
return dst;
}
-static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
+static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
size_t id = ggml_hash_find(hash_set, src);
if (node_init[id]) {
return;
@@ -2105,10 +2119,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
}
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
- struct ggml_hash_set hash_set = {
- /* .size = */ graph->visited_hash_table.size,
- /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
- };
+ struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
@@ -2123,7 +2134,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
if (ctx_allocated == NULL || ctx_unallocated == NULL) {
fprintf(stderr, "failed to allocate context for graph copy\n");
- free(hash_set.keys);
+ ggml_hash_set_free(&hash_set);
free(node_copies);
free(node_init);
ggml_free(ctx_allocated);
@@ -2146,7 +2157,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
if (buffer == NULL) {
fprintf(stderr, "failed to allocate buffer for graph copy\n");
- free(hash_set.keys);
+ ggml_hash_set_free(&hash_set);
free(node_copies);
free(node_init);
ggml_free(ctx_allocated);
@@ -2164,19 +2175,19 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
// copy data and init views
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
- graph_copy_init_tensor(hash_set, node_copies, node_init, node);
+ graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
}
// build graph copy
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
- struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)];
+ struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)];
graph_copy->nodes[i] = node_copy;
}
graph_copy->n_nodes = graph->n_nodes;
- free(hash_set.keys);
+ ggml_hash_set_free(&hash_set);
free(node_copies);
free(node_init);
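
Note: in the scheduler hunks above, the fixed 2-D/3-D arrays tensor_backend_id and tensor_copies are replaced by flat heap allocations (hv_tensor_backend_ids, hv_tensor_copies) addressed through the tensor_id_copy macro. A small sketch of the indexing scheme that macro implements (hash slot id, then backend, then copy):

    #include <stddef.h>

    // hv_tensor_copies holds hash_set.size * n_backends * n_copies pointers,
    // laid out as a row-major 3-D array.
    static size_t tensor_copy_index(size_t id, int backend_id, int copy_id,
                                    int n_backends, int n_copies) {
        return id * (size_t)(n_backends * n_copies)
             + (size_t)(backend_id * n_copies)
             + (size_t)copy_id;
    }
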
diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp
index a37aa407..71373173 100644
--- a/ggml/src/ggml-blas.cpp
+++ b/ggml/src/ggml-blas.cpp
@@ -275,8 +275,7 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t
break;
default:
- fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node));
- GGML_ASSERT(false);
+ GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node));
}
}
diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp
index 9bf7e332..06930ba2 100644
--- a/ggml/src/ggml-cann.cpp
+++ b/ggml/src/ggml-cann.cpp
@@ -120,7 +120,7 @@ static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
file, line);
GGML_CANN_LOG_ERROR(" %s\n", stmt);
// abort with GGML_ASSERT to get a stack trace
- GGML_ASSERT(!"CANN error");
+ GGML_ABORT("CANN error");
}
/**
@@ -342,7 +342,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
// memory should always buffered. these memory may still needed by
// tasks in stream.
// TODO, fix me.
- GGML_ASSERT(!"Cann buffer pool full, increase MAX_CANN_BUFFERS\n");
+ GGML_ABORT("Cann buffer pool full, increase MAX_CANN_BUFFERS\n");
}
};
@@ -627,7 +627,6 @@ GGML_CALL static void* ggml_backend_cann_buffer_get_base(
GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
const void* src,
void* dst) {
- GGML_ASSERT(tensor->op == GGML_OP_NONE);
int64_t n_elems = ggml_nelements(tensor);
int64_t groups = n_elems / QK4_0;
@@ -679,7 +678,6 @@ GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
*/
GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
const ggml_tensor* tensor, void* src, void* dst) {
- GGML_ASSERT(tensor->op == GGML_OP_NONE);
int64_t n_elems = ggml_nelements(tensor);
int64_t groups = n_elems / QK4_0;
@@ -898,11 +896,10 @@ GGML_CALL static void ggml_backend_cann_buffer_init_tensor(
* @param size Size of the data to be copied, in bytes.
*/
GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
- ggml_backend_buffer_t buffer, ggml_tensor* tensor, const void* data,
+ ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data,
size_t offset, size_t size) {
- // GGML_ASSERT(size == ggml_nbytes(tensor));
- ggml_backend_cann_buffer_context* ctx =
- (ggml_backend_cann_buffer_context*)buffer->context;
+ ggml_backend_cann_buffer_context *ctx =
+ (ggml_backend_cann_buffer_context *)buffer->context;
ggml_cann_set_device(ctx->device);
// TODO: refer to cann(#6017), it use thread's default stream.
@@ -910,22 +907,21 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
// Why aclrtSynchronizeDevice?
if (!need_transform(tensor->type)) {
- ACL_CHECK(aclrtMemcpy(tensor->data, size, (const char*)data + offset,
- size, ACL_MEMCPY_HOST_TO_DEVICE));
+ ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
+ ACL_MEMCPY_HOST_TO_DEVICE));
} else {
- void* transform_buffer = malloc(size);
- ggml_backend_cann_transform(tensor, (const char*)data + offset,
- transform_buffer);
+ void *transform_buffer = malloc(size);
+ ggml_backend_cann_transform(tensor, data, transform_buffer);
#ifndef NDEBUG
- void* check_buffer = malloc(size);
+ void *check_buffer = malloc(size);
ggml_backend_cann_transform_back(tensor, transform_buffer,
check_buffer);
- GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size) ==
- 0);
+ GGML_ASSERT(memcmp(data, check_buffer, size) == 0);
free(check_buffer);
#endif
- ACL_CHECK(aclrtMemcpy(tensor->data, size, transform_buffer, size,
+ ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
+ transform_buffer, size,
ACL_MEMCPY_HOST_TO_DEVICE));
free(transform_buffer);
}
@@ -947,21 +943,20 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
GGML_CALL static void ggml_backend_cann_buffer_get_tensor(
ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data,
size_t offset, size_t size) {
- GGML_ASSERT(size == ggml_nbytes(tensor));
ggml_backend_cann_buffer_context* ctx =
(ggml_backend_cann_buffer_context*)buffer->context;
ggml_cann_set_device(ctx->device);
if (!need_transform(tensor->type)) {
- ACL_CHECK(aclrtMemcpy((char*)data + offset, size, tensor->data, size,
+ ACL_CHECK(aclrtMemcpy(data, size, (char*)tensor->data + offset, size,
ACL_MEMCPY_DEVICE_TO_HOST));
} else {
void* transform_buffer = malloc(size);
- ACL_CHECK(aclrtMemcpy(transform_buffer, size, tensor->data, size,
+ ACL_CHECK(aclrtMemcpy(transform_buffer, size,
+ (char*)tensor->data + offset, size,
ACL_MEMCPY_DEVICE_TO_HOST));
- ggml_backend_cann_transform_back(tensor, transform_buffer,
- (char*)data + offset);
+ ggml_backend_cann_transform_back(tensor, transform_buffer, data);
free(transform_buffer);
}
}
@@ -1450,42 +1445,41 @@ ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
* @param size Size of the data to copy in bytes.
*/
GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
- ggml_tensor* tensor,
- const void* data,
+ ggml_tensor *tensor,
+ const void *data,
size_t offset,
size_t size) {
- ggml_backend_cann_context* cann_ctx =
- (ggml_backend_cann_context*)backend->context;
+ ggml_backend_cann_context *cann_ctx =
+ (ggml_backend_cann_context *)backend->context;
if (!need_transform(tensor->type)) {
- ACL_CHECK(aclrtMemcpyAsync(
- tensor->data, size, (const char*)data + offset, size,
- ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
+ ACL_CHECK(aclrtMemcpyAsync((char *)tensor->data + offset, size, data,
+ size, ACL_MEMCPY_HOST_TO_DEVICE,
+ cann_ctx->stream()));
} else {
- void* transform_buffer = malloc(size);
- ggml_backend_cann_transform(tensor, (const char*)data + offset,
- transform_buffer);
+ void *transform_buffer = malloc(size);
+ ggml_backend_cann_transform(tensor, data, transform_buffer);
#ifndef NDEBUG
- void* check_buffer = malloc(size);
+ void *check_buffer = malloc(size);
ggml_backend_cann_transform_back(tensor, transform_buffer,
check_buffer);
- GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size));
+ GGML_ASSERT(memcmp(data, check_buffer, size));
free(check_buffer);
#endif
- ACL_CHECK(aclrtMemcpyAsync(tensor->data, size, transform_buffer, size,
- ACL_MEMCPY_HOST_TO_DEVICE,
- cann_ctx->stream()));
+ ACL_CHECK(aclrtMemcpyAsync(
+ (char *)tensor->data + offset, size, transform_buffer, size,
+ ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
free(transform_buffer);
}
}
GGML_CALL static void ggml_backend_cann_get_tensor_async(
- ggml_backend_t backend, const ggml_tensor* tensor, void* data,
+ ggml_backend_t backend, const ggml_tensor *tensor, void *data,
size_t offset, size_t size) {
- ggml_backend_cann_context* cann_ctx =
- (ggml_backend_cann_context*)backend->context;
+ ggml_backend_cann_context *cann_ctx =
+ (ggml_backend_cann_context *)backend->context;
ggml_backend_buffer_t buf =
tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@@ -1493,17 +1487,16 @@ GGML_CALL static void ggml_backend_cann_get_tensor_async(
"unsupported buffer type");
if (!need_transform(tensor->type)) {
- ACL_CHECK(aclrtMemcpyAsync((char*)data + offset, size, tensor->data,
+ ACL_CHECK(aclrtMemcpyAsync(data, size, (char *)tensor->data + offset,
size, ACL_MEMCPY_DEVICE_TO_HOST,
cann_ctx->stream()));
} else {
- void* transform_buffer = malloc(size);
- ACL_CHECK(aclrtMemcpyAsync(transform_buffer, size, tensor->data, size,
- ACL_MEMCPY_DEVICE_TO_HOST,
- cann_ctx->stream()));
+ void *transform_buffer = malloc(size);
+ ACL_CHECK(aclrtMemcpyAsync(
+ transform_buffer, size, (char *)tensor->data + offset, size,
+ ACL_MEMCPY_DEVICE_TO_HOST, cann_ctx->stream()));
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
- ggml_backend_cann_transform_back(tensor, transform_buffer,
- (char*)data + offset);
+ ggml_backend_cann_transform_back(tensor, transform_buffer, data);
free(transform_buffer);
}
}
@@ -1559,23 +1552,18 @@ GGML_CALL static bool ggml_backend_cann_cpy_tensor_async(
return false;
}
+ // Peer access needs to be enabled in both directions for aclrtMemcpyAsync between devices.
+ ggml_cann_set_device(cann_ctx_dst->device);
+ ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
ggml_cann_set_device(cann_ctx_src->device);
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
+
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
ACL_MEMCPY_DEVICE_TO_DEVICE,
- cann_ctx_dst->stream()));
-
- // record event on src stream
- if (!cann_ctx_src->copy_event) {
- ACL_CHECK(aclrtCreateEvent(&cann_ctx_src->copy_event));
- }
-
- ACL_CHECK(
- aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
+ cann_ctx_src->stream()));
- // wait on dst stream for the copy to complete
- ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(),
- cann_ctx_src->copy_event));
+ // TODO: workaround because events do not work here.
+ aclrtSynchronizeStream(cann_ctx_src->stream());
} else {
// src and dst are on the same backend
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
@@ -1671,10 +1659,13 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
}
case GGML_OP_MUL_MAT: {
switch (op->src[0]->type) {
- // case GGML_TYPE_Q4_0:
case GGML_TYPE_F16:
case GGML_TYPE_F32:
case GGML_TYPE_Q8_0:
+ // TODO: fix me
+ // The current group size should not be greater than k-1 in
+ // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
+ case GGML_TYPE_Q4_0:
return true;
default:
return false;
@@ -1699,6 +1690,7 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q4_0:
return true;
default:
return false;
@@ -1763,8 +1755,8 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
*
* This function determines whether the CANN backend supports the given backend
* buffer type by comparing the device context of the backend and buffer type.
- * It returns true if the device associated with the buffer type matches the
- * device associated with the backend.
+ * It returns true if the device of the backend context matches the device of
+ * the buffer type context.
*
* @param backend Pointer to the CANN backend.
* @param buft Pointer to the backend buffer type to check.
@@ -1773,9 +1765,14 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
*/
GGML_CALL static bool ggml_backend_cann_supports_buft(
ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
- return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
-
- GGML_UNUSED(backend);
+ if (ggml_backend_buft_is_cann(buft)) {
+ ggml_backend_cann_context * cann_ctx =
+ (ggml_backend_cann_context *)backend->context;
+ ggml_backend_cann_buffer_type_context * buft_ctx =
+ (ggml_backend_cann_buffer_type_context *)buft->context;
+ return buft_ctx->device == cann_ctx->device;
+ }
+ return false;
}
/**
@@ -1874,7 +1871,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
(aclrtEvent)event->context));
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
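
A side note on the device-to-device path above: instead of recording an event on the source stream and waiting for it on the destination stream, the patch enables peer access in both directions and synchronizes the source stream after the copy. A minimal sketch of that pattern, assuming two already-initialized ggml_backend_cann_context pointers named cann_ctx_src and cann_ctx_dst; dst, src and copy_size stand for the tensors and byte count of the surrounding function (illustration only, not part of the commit):

    // Enable peer access in both directions before issuing the async copy.
    ggml_cann_set_device(cann_ctx_dst->device);
    ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
    ggml_cann_set_device(cann_ctx_src->device);
    ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));

    // Copy on the source stream, then block until it completes
    // (workaround while events are not usable here).
    ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
                               ACL_MEMCPY_DEVICE_TO_DEVICE, cann_ctx_src->stream()));
    ACL_CHECK(aclrtSynchronizeStream(cann_ctx_src->stream()));
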
diff --git a/ggml/src/ggml-cann/acl_tensor.cpp b/ggml/src/ggml-cann/acl_tensor.cpp
index 960ce9a0..d120ce6a 100644
--- a/ggml/src/ggml-cann/acl_tensor.cpp
+++ b/ggml/src/ggml-cann/acl_tensor.cpp
@@ -37,6 +37,10 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
return ACL_INT16;
case GGML_TYPE_I32:
return ACL_INT32;
+ case GGML_TYPE_Q4_0:
+ return ACL_INT4;
+ case GGML_TYPE_Q8_0:
+ return ACL_INT8;
default:
return ACL_DT_UNDEFINED;
}
@@ -89,33 +93,6 @@ bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
return false;
}
-aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
- size_t type_size, int64_t* ne, size_t* nb,
- int64_t dims, aclFormat format,
- size_t offset) {
- int64_t tmp_ne[GGML_MAX_DIMS * 2];
- int64_t tmp_stride[GGML_MAX_DIMS * 2];
-
- memcpy(tmp_ne, ne, dims * sizeof(int64_t));
- for (int i = 0; i < dims; i++) {
- tmp_stride[i] = nb[i] / type_size;
- }
-
- std::reverse(tmp_ne, tmp_ne + dims);
- std::reverse(tmp_stride, tmp_stride + dims);
-
- int64_t acl_storage_len = 0;
- for (int i = 0; i < dims; i++) {
- acl_storage_len += (ne[i] - 1) * nb[i];
- }
-
- aclTensor* acl_tensor =
- aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
- format, &acl_storage_len, 1, data_ptr);
-
- return acl_tensor;
-}
-
int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
const ggml_tensor* src1,
int64_t* bcast_src0_ne,
diff --git a/ggml/src/ggml-cann/acl_tensor.h b/ggml/src/ggml-cann/acl_tensor.h
index 7d0bf04e..4734a9cb 100644
--- a/ggml/src/ggml-cann/acl_tensor.h
+++ b/ggml/src/ggml-cann/acl_tensor.h
@@ -23,6 +23,9 @@
#ifndef CANN_ACL_TENSOR_H
#define CANN_ACL_TENSOR_H
+#include <algorithm>
+#include <cstring>
+
#include <aclnn/aclnn_base.h>
#include "common.h"
@@ -65,7 +68,8 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
size_t offset = 0);
/**
- * @brief Creates an ACL tensor from provided parameters.
+ * @brief Template for creating an ACL tensor from the provided parameters.
+ * The typename TYPE should be size_t or float.
*
* @details This function creates an ACL tensor using the provided data pointer,
* data type, dimensions, strides, format, offset, and additional parameters.
@@ -83,10 +87,34 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
* @param offset Offset in bytes for the ACL tensor data. Defaults to 0.
* @return Pointer to the created ACL tensor.
*/
+template<typename TYPE>
aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
- size_t type_size, int64_t* ne, size_t* nb,
- int64_t dims, aclFormat format = ACL_FORMAT_ND,
- size_t offset = 0);
+ TYPE type_size, int64_t* ne, TYPE* nb,
+ int64_t dims,
+ aclFormat format = ACL_FORMAT_ND,
+ size_t offset = 0) {
+ int64_t tmp_ne[GGML_MAX_DIMS * 2];
+ int64_t tmp_stride[GGML_MAX_DIMS * 2];
+
+ memcpy(tmp_ne, ne, dims * sizeof(int64_t));
+ for (int i = 0; i < dims; i++) {
+ tmp_stride[i] = nb[i] / type_size;
+ }
+
+ std::reverse(tmp_ne, tmp_ne + dims);
+ std::reverse(tmp_stride, tmp_stride + dims);
+
+ int64_t acl_storage_len = 0;
+ for (int i = 0; i < dims; i++) {
+ acl_storage_len += (ne[i] - 1) * nb[i];
+ }
+
+ aclTensor* acl_tensor =
+ aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
+ format, &acl_storage_len, 1, data_ptr);
+
+ return acl_tensor;
+}
/**
* @brief Checks if tensors require broadcasting based on their shapes.
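
The reason TYPE is templated as size_t or float is that quantized types such as Q4_0 have a fractional element size: one int4 value occupies half a byte, so byte strides cannot be expressed exactly in size_t. A hedged usage sketch mirroring how aclnn_ops.cpp builds the quantized weight tensor (k, n and weight_data are hypothetical placeholders, not names from the patch):

    // Element size of a Q4_0 value is half a byte, hence float strides.
    float   weight_elem_size = float(sizeof(uint8_t)) / 2;           // 0.5 bytes per int4
    int64_t weight_ne[]      = {n, k};                               // transposed [K, N] weight
    float   weight_nb[]      = {weight_elem_size * k, weight_elem_size};

    aclTensor * acl_weight = ggml_cann_create_tensor(
        weight_data, ggml_cann_type_mapping(GGML_TYPE_Q4_0),         // maps to ACL_INT4, see acl_tensor.cpp
        weight_elem_size, weight_ne, weight_nb, 2, ACL_FORMAT_ND);
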
diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index a02efc82..8c4132f5 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -464,9 +464,11 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
aclTensor* acl_src = ggml_cann_create_tensor(src);
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
- const float eps = 1e-6f; // TODO: make this a parameter
int n_groups = dst->op_params[0];
+ float eps;
+ memcpy(&eps, dst->op_params + 1, sizeof(float));
+
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
@@ -844,7 +846,7 @@ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_cann_max_pool2d(ctx, dst);
break;
case GGML_OP_POOL_COUNT:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
break;
}
}
@@ -910,6 +912,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->ne);
return;
}
+ if (dst->type == GGML_TYPE_Q4_0) {
+ aclrtlaunch_ascendc_quantize_f16_to_q4_0(
+ 24, ctx.stream(), src->data, dst->data,
+ ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
+ ((ggml_tensor*)dst->extra)->ne);
+ return;
+ }
if (dst->type == GGML_TYPE_F16) {
if (ggml_are_same_shape(src, dst)) {
cann_copy(ctx, acl_src, acl_dst);
@@ -931,9 +940,9 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb);
return;
}
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
if (dst->type == GGML_TYPE_F32) {
if (ggml_are_same_shape(src, dst)) {
@@ -955,12 +964,12 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb);
return;
}
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
// TODO
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
} else if (src->type == GGML_TYPE_F32) {
// TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size
// && nb0 == type_size)
@@ -971,6 +980,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->ne);
return;
}
+ if (dst->type == GGML_TYPE_Q4_0) {
+ aclrtlaunch_ascendc_quantize_f32_to_q4_0(
+ 24, ctx.stream(), src->data, dst->data,
+ ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
+ ((ggml_tensor*)dst->extra)->ne);
+ return;
+ }
if (dst->type == GGML_TYPE_F32) {
if (ggml_are_same_shape(src, dst)) {
cann_copy(ctx, acl_src, acl_dst);
@@ -991,10 +1007,10 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb);
return;
}
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
} else {
// TODO: dst not contiguous
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
if (dst->type == GGML_TYPE_F16) {
@@ -1017,11 +1033,11 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb);
return;
}
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
// TODO
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
} else {
if (ggml_are_same_shape(src, dst)) {
cann_copy(ctx, acl_src, acl_dst);
@@ -1029,7 +1045,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ACL_CHECK(aclDestroyTensor(acl_dst));
return;
}
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
@@ -1312,6 +1328,111 @@ aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize,
#ifdef __cplusplus
}
#endif
+
+static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx,
+ ggml_tensor* dst,
+ ggml_tensor* src1,
+ aclTensor* tmp_cast_tensor,
+ aclTensor* tmp_im2col_tensor) {
+ // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
+ int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
+ size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
+ aclTensor* acl_dst =
+ ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
+
+ int64_t permute_dim[] = {0, 2, 1};
+ if (src1->type != dst->type) {
+ aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
+ } else {
+ aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
+ }
+
+ // release
+ ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+static void ggml_cann_im2col_1d_post_process(
+ ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_tensor* src1,
+ aclTensor* tmp_cast_tensor, aclTensor* tmp_im2col_tensor,
+ const std::vector<int64_t>& im2col_op_params) {
+ // get params
+ const int64_t KH = im2col_op_params[0];
+ const int64_t KW = im2col_op_params[1];
+ const int64_t IW = im2col_op_params[2];
+ const int64_t IC = im2col_op_params[3];
+ const int64_t N = im2col_op_params[4];
+ const int64_t OH = im2col_op_params[5];
+ const int64_t OW = im2col_op_params[6];
+ const int64_t s0 = im2col_op_params[7];
+ const int64_t p0 = im2col_op_params[8];
+ const int64_t d0 = im2col_op_params[9];
+ const int64_t n_bytes_factor = im2col_op_params[10];
+
+ // Permute: [N, IC * KH * KW, OW * OH] ->
+ // [N, OW * OH * n_bytes_factor, IC * KH * KW]
+ aclTensor* tmp_permute_tensor = nullptr;
+ ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
+ tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
+ void* tmp_permute_buffer = tmp_permute_allocator.get();
+
+ int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N};
+ size_t tmp_permute_nb[GGML_MAX_DIMS - 1];
+ tmp_permute_nb[0] = ggml_type_size(dst->type);
+ for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
+ tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
+ }
+
+ tmp_permute_tensor = ggml_cann_create_tensor(
+ tmp_permute_buffer, ggml_cann_type_mapping(dst->type),
+ ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb,
+ GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
+
+ int64_t permute_dim[] = {0, 2, 1};
+ if (src1->type != dst->type) {
+ aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3);
+ } else {
+ aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim,
+ 3);
+ }
+
+ // number of times the kernel moves in W dimension
+ const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
+ size_t offset;
+ void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
+
+ // memory copy with offset to restore the 1D im2col result from the 2D one
+ if (IC > 1) {
+ offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
+ size_t size_cpy = KH * KW * ggml_type_size(dst->type);
+
+ for (int c = 0; c < IC; c++) {
+ cur_permute_buffer = (char*)tmp_permute_buffer + offset +
+ KH * KW * c * ggml_type_size(dst->type);
+ cur_dst_buffer = (char*)dst->data +
+ c * KH * KW * n_step_w * ggml_type_size(dst->type);
+
+ for (int i = 0; i < n_step_w; i++) {
+ ACL_CHECK(aclrtMemcpyAsync(
+ cur_dst_buffer, size_cpy, cur_permute_buffer, size_cpy,
+ ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+ cur_dst_buffer =
+ (char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
+ cur_permute_buffer = (char*)cur_permute_buffer +
+ KH * KW * IC * ggml_type_size(dst->type);
+ }
+ }
+ } else {
+ offset = KH * KW * n_step_w *
+ ggml_type_size(dst->type); // equal to ggml_nbytes(dst)
+ ACL_CHECK(aclrtMemcpyAsync(dst->data, offset,
+ (char*)tmp_permute_buffer + offset, offset,
+ ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+ }
+
+ // release
+ ACL_CHECK(aclDestroyTensor(tmp_permute_tensor));
+}
+
void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src0 = dst->src[0]; // kernel
ggml_tensor* src1 = dst->src[1]; // input
@@ -1320,21 +1441,23 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
+ GGML_TENSOR_BINARY_OP_LOCALS;
+
+ // aclnnIm2col only works in 2D. Set s1, p1 and d1 to 1 to perform a 2D
+ // im2col, then post-process the result to restore the 1D case.
+ const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
- const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+ const int32_t s1 = is_2D ? ((const int32_t*)(dst->op_params))[1] : 1;
const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
- const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+ const int32_t p1 = is_2D ? ((const int32_t*)(dst->op_params))[3] : 1;
const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
- const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
- const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+ const int32_t d1 = is_2D ? ((const int32_t*)(dst->op_params))[5] : 1;
- GGML_TENSOR_BINARY_OP_LOCALS;
-
- const int64_t N = is_2D ? ne13 : ne12;
- const int64_t IC = is_2D ? ne12 : ne11;
-
- const int64_t KH = is_2D ? ne01 : 1;
+ const int64_t N = ne13;
+ const int64_t IC = ne12;
+ const int64_t KH = ne01;
const int64_t KW = ne00;
+ const int64_t IW = ne10;
const int64_t OH = is_2D ? ne2 : 1;
const int64_t OW = ne1;
@@ -1342,9 +1465,12 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
- // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH]
+ // the memory allocation is increased to 3x when is_2D == false
+ const int64_t n_bytes_factor = is_2D ? 1 : 3;
+
+ // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor]
aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
- int64_t tmp_im2col_ne[] = {OW * OH, IC * KH * KW, N};
+ int64_t tmp_im2col_ne[] = {OW * OH * n_bytes_factor, IC * KH * KW, N};
size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
tmp_im2col_nb[0] = ggml_type_size(src1->type);
@@ -1356,8 +1482,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
// If dst is f16, tmp_buffer is f32, we need alloc src.typesize *
// dst.elemcount.
ggml_cann_pool_alloc im2col_allocator(
- ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1));
+ ctx.pool(),
+ ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
void* tmp_im2col_buffer = im2col_allocator.get();
+
aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor(
tmp_im2col_buffer, ggml_cann_type_mapping(src1->type),
ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb,
@@ -1380,8 +1508,9 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
paddings, strides, tmp_im2col_tensor,
&workspaceSize, &executor));
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool());
if (workspaceSize > 0) {
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+ workspace_allocator.alloc(workspaceSize);
workspaceAddr = workspace_allocator.get();
}
@@ -1391,9 +1520,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
// Cast if dst is f16.
aclTensor* tmp_cast_tensor = nullptr;
ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool());
+ void* tmp_cast_buffer = nullptr;
if (src1->type != dst->type) {
- tmp_cast_allocator.alloc(ggml_nbytes(dst));
- void* tmp_cast_buffer = tmp_cast_allocator.get();
+ tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
+ tmp_cast_buffer = tmp_cast_allocator.get();
size_t temp_cast_nb[GGML_MAX_DIMS - 1];
temp_cast_nb[0] = ggml_type_size(dst->type);
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
@@ -1408,24 +1538,21 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_cann_type_mapping(dst->type));
}
- // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
- int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
- size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
- aclTensor* acl_dst =
- ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
-
- int64_t permute_dim[] = {0, 2, 1};
- if (src1->type != dst->type) {
- aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
+ // post-processing
+ if (is_2D) {
+ ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor,
+ tmp_im2col_tensor);
} else {
- aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
+ std::vector<int64_t> im2col_op_params = {
+ KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor};
+ ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor,
+ tmp_im2col_tensor, im2col_op_params);
}
// release
ACL_CHECK(aclDestroyTensor(acl_src1));
ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor));
ACL_CHECK(aclDestroyTensor(tmp_cast_tensor));
- ACL_CHECK(aclDestroyTensor(acl_dst));
ACL_CHECK(aclDestroyIntArray(kernel_size));
ACL_CHECK(aclDestroyIntArray(dilations));
ACL_CHECK(aclDestroyIntArray(paddings));
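
For the 1D case, ggml_cann_im2col_1d_post_process above walks the output in steps of n_step_w, the number of kernel positions along W. This is the standard 1-D convolution output length; a small worked example with hypothetical values (illustration only, not part of the commit):

    // n_step_w = (IW + 2*p0 - d0*(KW - 1) - 1) / s0 + 1
    // e.g. IW = 10, KW = 3, s0 = 2, p0 = 1, d0 = 1:
    //      (10 + 2 - 2 - 1) / 2 + 1 = 9 / 2 + 1 = 4 + 1 = 5 kernel positions
    const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
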
@@ -2219,7 +2346,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb);
break;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
break;
}
}
@@ -2352,21 +2479,33 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
* @param dst The destination tensor where the result of the matrix
* multiplication will be stored.
*/
-static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
- ggml_tensor* dst) {
+static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
+ ggml_tensor* dst,
+ const enum ggml_type type) {
ggml_tensor* src0 = dst->src[0]; // weight
ggml_tensor* src1 = dst->src[1]; // input
// The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC
// is regarded as batch. weight need transpose.
int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
- size_t weight_elem_size = sizeof(uint8_t);
- size_t weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
+ float weight_elem_size;
+ if (type == GGML_TYPE_Q4_0) {
+ weight_elem_size = float(sizeof(uint8_t)) / 2;
+ }
+ else if (type == GGML_TYPE_Q8_0) {
+ weight_elem_size = float(sizeof(uint8_t));
+ }
+ else {
+ GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
+ }
+ float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
+
// size of one matrix is element_size * height * width.
size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
// scale stored at the end of weight. Also need transpose.
+ GGML_ASSERT(QK4_0 == QK8_0);
int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
size_t scale_elem_size = sizeof(uint16_t);
size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
@@ -2381,10 +2520,10 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]};
size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1];
+ ggml_cann_pool_alloc input_alloctor(ctx.pool());
if (src1->type != GGML_TYPE_F16) {
aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
- ggml_cann_pool_alloc input_alloctor(
- ctx.pool(), ggml_nelements(src1) * input_elem_size);
+ input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
input_buffer = input_alloctor.get();
int64_t* input_cast_ne = src1->ne;
@@ -2430,8 +2569,9 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
(char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
input_elem_size, input_ne, input_nb, 2);
aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
- (char*)src0->data + batch0 * weight_stride, ACL_INT8,
- weight_elem_size, weight_ne, weight_nb, 2);
+ (char*)src0->data + batch0 * weight_stride,
+ ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
+ weight_nb, 2);
aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
scale_offset + batch0 * scale_stride, ACL_FLOAT16,
scale_elem_size, scale_ne, scale_nb, 2);
@@ -2485,14 +2625,12 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
case GGML_TYPE_F16:
ggml_cann_mat_mul_fp(ctx, dst);
break;
- // case GGML_TYPE_Q4_0:
- // ggml_cann_mul_mat_q4_0(ctx, dst);
- // break;
+ case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
- ggml_cann_mul_mat_q8_0(ctx, dst);
+ ggml_cann_mul_mat_quant(ctx, dst, type);
break;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
break;
}
}
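
The quantized MUL_MAT path above relies on the CANN weight layout in which the fp16 group scales are stored contiguously after the quantized data. A hedged sketch of the offset arithmetic for one weight tensor src0; the scale_stride expression and the scale_base name are inferred from the surrounding context rather than shown in the hunks, the rest reproduces the patch:

    // Per-element size of the quantized weight, in bytes.
    float weight_elem_size = (type == GGML_TYPE_Q4_0) ? 0.5f : 1.0f;

    // One [K, N] matrix, and all ne[2] * ne[3] batched matrices.
    size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
    size_t weight_size   = weight_stride * src0->ne[2] * src0->ne[3];

    // fp16 scales follow the data, one per group of QK8_0 (== QK4_0) values in a row.
    size_t scale_elem_size = sizeof(uint16_t);
    size_t scale_stride    = scale_elem_size * (src0->ne[0] / QK8_0) * src0->ne[1];
    char * scale_base      = (char *) src0->data + weight_size;

    // Batch `batch0` then reads:
    //   weights at (char *) src0->data + batch0 * weight_stride
    //   scales  at scale_base          + batch0 * scale_stride
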
diff --git a/ggml/src/ggml-cann/kernels/CMakeLists.txt b/ggml/src/ggml-cann/kernels/CMakeLists.txt
index f12a4d43..5b4fef91 100644
--- a/ggml/src/ggml-cann/kernels/CMakeLists.txt
+++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt
@@ -9,6 +9,7 @@ file(GLOB SRC_FILES
get_row_q8_0.cpp
quantize_f32_q8_0.cpp
quantize_f16_q8_0.cpp
+ quantize_float_to_q4_0.cpp
dup.cpp
)
@@ -29,4 +30,4 @@ ascendc_library(ascendc_kernels STATIC
${SRC_FILES}
)
-#ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
+# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
diff --git a/ggml/src/ggml-cann/kernels/ascendc_kernels.h b/ggml/src/ggml-cann/kernels/ascendc_kernels.h
index bf891475..7e153208 100644
--- a/ggml/src/ggml-cann/kernels/ascendc_kernels.h
+++ b/ggml/src/ggml-cann/kernels/ascendc_kernels.h
@@ -8,6 +8,8 @@
#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
+#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
+#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
diff --git a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
new file mode 100644
index 00000000..9c8c86b6
--- /dev/null
+++ b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
@@ -0,0 +1,278 @@
+#include "kernel_operator.h"
+
+using namespace AscendC;
+
+#define BUFFER_NUM 2
+#define Group_Size 32
+
+template <typename SRC_T>
+class QUANTIZE_FLOAT_TO_Q4_0 {
+ public:
+ __aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {}
+ __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
+ int64_t *input_ne_ub, size_t *input_nb_ub,
+ int64_t *output_ne_ub) {
+ // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
+ // permute=[0,0,0,0]):
+ // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
+ int64_t op_block_num = GetBlockNum();
+ int64_t op_block_idx = GetBlockIdx();
+
+ // input stride of data elements
+ for (int i = 0; i < 4; i++) {
+ input_ne[i] = input_ne_ub[i];
+ input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
+ output_ne[i] = output_ne_ub[i];
+ }
+
+ // output stride of data elements
+ output_stride[0] = 1;
+ for (int i = 1; i < 4; i++) {
+ output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
+ }
+
+ // scales are saved one by one after the data: [group1_scale, group2_scale, ...]
+ scale_ne = input_ne;
+ scale_stride[0] = 1;
+ scale_stride[1] = input_ne[0] / Group_Size;
+ for (int i = 2; i < 4; i++) {
+ scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
+ }
+
+ // split input tensor by rows.
+ uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
+ dr = nr / op_block_num;
+
+ uint64_t tails = nr % op_block_num;
+ if (op_block_idx < tails) {
+ dr += 1;
+ ir = dr * op_block_idx;
+ } else {
+ ir = dr * op_block_idx + tails;
+ }
+
+ group_size_in_row = scale_stride[1];
+ int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] *
+ output_ne[3] * sizeof(uint8_t) / 2;
+
+ input_gm.SetGlobalBuffer((__gm__ SRC_T *)input);
+ output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
+ scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir *
+ group_size_in_row *
+ sizeof(half)));
+
+ pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
+ pipe.InitBuffer(output_queue, BUFFER_NUM,
+ Group_Size * sizeof(int8_t) / 2);
+ pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float));
+ pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
+ pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
+ pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
+ pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
+ pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t));
+ pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
+ }
+
+ __aicore__ inline void copy_in(uint32_t offset) {
+ LocalTensor<SRC_T> input_local = input_queue.AllocTensor<SRC_T>();
+ DataCopy(input_local, input_gm[offset], Group_Size);
+ input_queue.EnQue(input_local);
+ }
+
+ __aicore__ inline void copy_out(uint32_t offset) {
+ // reinterpret-cast Group_Size (32) int4b_t values as Group_Size / 2 int8_t values,
+ // and use DataCopyPad to avoid the 32-byte alignment requirement.
+ LocalTensor<int4b_t> output_local = output_queue.DeQue<int4b_t>();
+ LocalTensor<int8_t> output_int8_local =
+ output_local.ReinterpretCast<int8_t>();
+
+ DataCopyExtParams dataCopyParams;
+ dataCopyParams.blockCount = 1;
+ dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t);
+ DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams);
+
+ output_queue.FreeTensor(output_local);
+ }
+
+ __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
+ LocalTensor<float> input_local) {
+ DataCopy(cast_local, input_local, Group_Size);
+ }
+
+ __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
+ LocalTensor<half> input_local) {
+ Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size);
+ }
+
+ __aicore__ inline half calculate_group(int64_t row, int64_t group) {
+ const int64_t i3 = row / (input_ne[1] * input_ne[2]);
+ const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
+ const int64_t i1 =
+ row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
+
+ const int64_t input_offset = i1 * input_stride[1] +
+ i2 * input_stride[2] +
+ i3 * input_stride[3] + Group_Size * group;
+
+ // output_offset is the offset into output_gm, whose datatype is int8_t;
+ // it is divided by 2 because the stored values are int4b_t.
+ const int64_t output_offset = (i1 * output_stride[1] +
+ i2 * output_stride[2] +
+ i3 * output_stride[3] +
+ Group_Size * group) / 2;
+ copy_in(input_offset);
+
+ LocalTensor<SRC_T> input_local = input_queue.DeQue<SRC_T>();
+ LocalTensor<int4b_t> output_local = output_queue.AllocTensor<int4b_t>();
+ LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
+ LocalTensor<float> work_local = work_queue.AllocTensor<float>();
+ LocalTensor<float> max_local = max_queue.AllocTensor<float>();
+ LocalTensor<float> min_local = min_queue.AllocTensor<float>();
+ LocalTensor<int8_t> int8_local = int8_queue.AllocTensor<int8_t>();
+ LocalTensor<half> half_local = half_queue.AllocTensor<half>();
+
+ input_to_cast(cast_local, input_local);
+
+ ReduceMax(max_local, cast_local, work_local, Group_Size);
+ ReduceMin(min_local, cast_local, work_local, Group_Size);
+ const float max_value = max_local.GetValue(0);
+ const float min_value = min_local.GetValue(0);
+ float d = max_value;
+ if (min_value < 0 && (-1 * min_value) > max_value) {
+ d = min_value;
+ }
+
+ d = d / (-8);
+ if (d != 0) {
+ Muls(cast_local, cast_local, 1.0f / d, Group_Size);
+ }
+
+ // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
+ float scalar = 8.5f;
+ Adds(cast_local, cast_local, scalar, Group_Size);
+ Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size);
+ scalar = 15.0f;
+ Mins(cast_local, cast_local, scalar, Group_Size);
+ scalar = -8.0f;
+ Adds(cast_local, cast_local, scalar, Group_Size);
+
+ // float->half->int4b
+ Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size);
+ Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size);
+
+ output_queue.EnQue(output_local);
+ copy_out(output_offset);
+
+ input_queue.FreeTensor(input_local);
+ work_queue.FreeTensor(work_local);
+ max_queue.FreeTensor(max_local);
+ min_queue.FreeTensor(min_local);
+ int8_queue.FreeTensor(int8_local);
+ half_queue.FreeTensor(half_local);
+ cast_queue.FreeTensor(cast_local);
+ return (half)d;
+ }
+
+ __aicore__ inline void calculate() {
+ LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
+ uint32_t scale_local_offset = 0;
+ uint32_t scale_global_offset = 0;
+ for (int64_t i = ir; i < ir + dr; i++) {
+ for (int64_t j = 0; j < group_size_in_row; j++) {
+ half scale = calculate_group(i, j);
+ scale_local.SetValue(scale_local_offset++, scale);
+ // Copy Group_Size/2 elements at a time.
+ if (scale_local_offset == Group_Size / 2) {
+ scale_local_offset = 0;
+ // TODO: OPTIMIZE ME
+ pipe_barrier(PIPE_ALL);
+ DataCopy(scale_gm[scale_global_offset], scale_local,
+ Group_Size / 2);
+ pipe_barrier(PIPE_ALL);
+ scale_global_offset += Group_Size / 2;
+ }
+ }
+ }
+
+ if (scale_local_offset != 0) {
+ pipe_barrier(PIPE_ALL);
+ DataCopyExtParams dataCopyParams;
+ dataCopyParams.blockCount = 1;
+ dataCopyParams.blockLen = scale_local_offset * sizeof(half);
+ DataCopyPad(scale_gm[scale_global_offset], scale_local,
+ dataCopyParams);
+ pipe_barrier(PIPE_ALL);
+ }
+ scale_queue.FreeTensor(scale_local);
+ }
+
+ private:
+ int64_t input_ne[4];
+ size_t input_stride[4];
+
+ int64_t *scale_ne;
+ size_t scale_stride[4];
+
+ int64_t output_ne[4];
+ size_t output_stride[4];
+
+ int64_t group_size_in_row;
+
+ int64_t ir;
+ int64_t dr;
+
+ TPipe pipe;
+ GlobalTensor<SRC_T> input_gm;
+ GlobalTensor<half> scale_gm;
+ GlobalTensor<int8_t> output_gm;
+ TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
+ TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+ TQue<QuePosition::VECIN, BUFFER_NUM> work_queue;
+ TQue<QuePosition::VECOUT, BUFFER_NUM> max_queue;
+ TQue<QuePosition::VECOUT, BUFFER_NUM> min_queue;
+ TQue<QuePosition::VECOUT, BUFFER_NUM> scale_queue;
+ TQue<QuePosition::VECOUT, BUFFER_NUM> cast_queue;
+ TQue<QuePosition::VECOUT, BUFFER_NUM> int8_queue;
+ TQue<QuePosition::VECOUT, BUFFER_NUM> half_queue;
+};
+
+template <typename T>
+__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
+ auto gm_ptr = (__gm__ uint8_t *)gm;
+ auto ub_ptr = (uint8_t *)(ub);
+ for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
+ *ub_ptr = *gm_ptr;
+ }
+}
+
+extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
+ GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+ GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+ int64_t input_ne_ub[4];
+ size_t input_nb_ub[4];
+ int64_t output_ne_ub[4];
+
+ copy_to_ub(input_ne_gm, input_ne_ub, 32);
+ copy_to_ub(input_nb_gm, input_nb_ub, 32);
+ copy_to_ub(output_ne_gm, output_ne_ub, 32);
+
+ QUANTIZE_FLOAT_TO_Q4_0<half> op;
+ op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
+ op.calculate();
+}
+
+extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
+ GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+ GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+ int64_t input_ne_ub[4];
+ size_t input_nb_ub[4];
+ int64_t output_ne_ub[4];
+
+ copy_to_ub(input_ne_gm, input_ne_ub, 32);
+ copy_to_ub(input_nb_gm, input_nb_ub, 32);
+ copy_to_ub(output_ne_gm, output_ne_ub, 32);
+
+ QUANTIZE_FLOAT_TO_Q4_0<float> op;
+ op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
+ op.calculate();
+}
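
The per-group math in calculate_group() follows ggml's Q4_0 scheme: take the value with the largest magnitude (keeping its sign), derive the scale d = max / -8, then map every value to an int4 in [-8, 7]. A plain host-side C++ restatement of that math for a single group of 32 floats, for reference only (this is not AscendC code and not part of the kernel):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Quantize one group of 32 floats to int4 values in [-8, 7]; returns the scale d.
    static float quantize_group_q4_0_ref(const float * x, int8_t * q) {
        const float max_v = *std::max_element(x, x + 32);
        const float min_v = *std::min_element(x, x + 32);
        float d = (min_v < 0.0f && -min_v > max_v) ? min_v : max_v;   // largest magnitude, keeping its sign
        d /= -8.0f;
        const float id = (d != 0.0f) ? 1.0f / d : 0.0f;
        for (int i = 0; i < 32; ++i) {
            // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
            const float v = std::floor(x[i] * id + 8.5f);
            q[i] = (int8_t) (std::min(v, 15.0f) - 8.0f);
        }
        return d;   // the kernel stores this as fp16 after the quantized data
    }
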
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index b58e3e4c..57fdeb82 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -26,7 +26,11 @@ typedef half2 ggml_half2;
#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CUDA)
+#if defined(GGML_COMMON_DECL_MUSA)
+#include <musa_fp16.h>
+#else
#include <cuda_fp16.h>
+#endif
#include <cstdint>
typedef half ggml_half;
@@ -527,7 +531,7 @@ static_assert(sizeof(block_iq6_k) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K
#define GGML_TABLE_END() };
#define GGML_COMMON_IMPL
-#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP)
+#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP) || defined(GGML_COMMON_IMPL_MUSA)
#include <cstdint>
#define GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = {
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 7641d5b5..f594cd26 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -98,7 +98,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
GGML_CUDA_LOG_ERROR(" %s\n", stmt);
// abort with GGML_ASSERT to get a stack trace
- GGML_ASSERT(!"CUDA error");
+ GGML_ABORT("CUDA error");
}
// this is faster on Windows
@@ -130,7 +130,22 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
}
return res;
#else
+
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+ cudaError_t err;
+ if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
+ {
+ err = cudaMallocManaged(ptr, size);
+ }
+ else
+ {
+ err = cudaMalloc(ptr, size);
+ }
+ return err;
+#else
return cudaMalloc(ptr, size);
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+
#endif
}
@@ -167,7 +182,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0;
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
CUdevice device;
CU_CHECK(cuDeviceGet(&device, id));
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -179,7 +194,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
alloc_prop.location.id = id;
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
}
-#endif // !defined(GGML_USE_HIPBLAS)
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
info.devices[id].vmm = !!device_vmm;
cudaDeviceProp prop;
@@ -315,7 +330,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
};
// pool with virtual memory
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
@@ -409,14 +424,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
}
};
-#endif // !defined(GGML_USE_HIPBLAS)
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
if (ggml_cuda_info().devices[device].vmm) {
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
}
-#endif
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
}
@@ -1341,7 +1356,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
cudaMemcpy3DPeerParms p = {};
p.dstDevice = dstDevice;
@@ -1355,7 +1370,7 @@ static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
GGML_UNUSED(dstDevice);
GGML_UNUSED(srcDevice);
return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
-#endif // !defined(GGML_USE_HIPBLAS)
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
}
static void ggml_cuda_op_mul_mat(
@@ -1486,7 +1501,7 @@ static void ggml_cuda_op_mul_mat(
}
// If src0 is on a temporary compute buffers (partial offloading) there may be some padding that needs to be cleared:
- if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
+ if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
const int64_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream));
@@ -1596,7 +1611,7 @@ static void ggml_cuda_op_mul_mat(
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
if (quantize_src1 && !src1_is_contiguous) {
@@ -1828,6 +1843,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
}
}
#else
+#ifdef GGML_USE_MUSA
+ GGML_ASSERT(false);
+#else // !GGML_USE_MUSA
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
// use cublasGemmStridedBatchedEx
@@ -1870,6 +1888,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
+#endif // GGML_USE_MUSA
#endif
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
@@ -1881,10 +1900,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
- bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
+ bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
- && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2
- && src1->ne[1] == 1;
+ && src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
@@ -2340,33 +2358,35 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
}
GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
- GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
-
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
- if (!ggml_backend_buffer_is_cuda(src->buffer)) {
+ if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
return false;
}
- if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
+ if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
return false;
}
- // device -> device
+ // device -> device copy
ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
- if (backend_src != backend_dst) {
- ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
- ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
+ ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
+ ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
- GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
- GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
+ if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
+#ifndef NDEBUG
+ GGML_CUDA_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
+#endif
+ return false;
+ }
+ if (backend_src != backend_dst) {
// copy on src stream
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
- CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
} else {
#ifdef GGML_CUDA_NO_PEER_COPY
return false;
@@ -2375,7 +2395,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
#endif
}
- // record event on src stream
+ // record event on src stream after the copy
if (!cuda_ctx_src->copy_event) {
ggml_cuda_set_device(cuda_ctx_src->device);
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
@@ -2387,7 +2407,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
} else {
// src and dst are on the same backend
- CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
}
return true;
}
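
The reworked ggml_backend_cuda_cpy_tensor_async keeps the device-to-device copies on the source stream and, for the cross-backend case, orders the destination stream behind an event recorded after the copy. A minimal sketch of that ordering pattern in plain CUDA host code; stream_src, stream_dst, copy_event and the pointers are assumed to already exist (illustration only, not part of the commit):

    // Issue the device-to-device copy on the source stream.
    CUDA_CHECK(cudaMemcpyAsync(dst_ptr, src_ptr, nbytes,
                               cudaMemcpyDeviceToDevice, stream_src));

    // Record an event on the source stream after the copy and make the
    // destination stream wait for it, so later work on stream_dst sees the data.
    if (copy_event == nullptr) {
        CUDA_CHECK(cudaEventCreateWithFlags(&copy_event, cudaEventDisableTiming));
    }
    CUDA_CHECK(cudaEventRecord(copy_event, stream_src));
    CUDA_CHECK(cudaStreamWaitEvent(stream_dst, copy_event, 0));
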
@@ -2724,11 +2744,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_MUL_MAT_ID:
{
struct ggml_tensor * a = op->src[0];
- if (op->op == GGML_OP_MUL_MAT) {
- struct ggml_tensor * b = op->src[1];
- if (a->ne[3] != b->ne[3]) {
- return false;
- }
+ struct ggml_tensor * b = op->src[1];
+ if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
+ return false;
+ }
+ if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) {
+ return false;
}
switch (a->type) {
case GGML_TYPE_F32:
@@ -2867,7 +2888,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
return true;
case GGML_OP_FLASH_ATTN_EXT:
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
- return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
+ return (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) || op->src[0]->ne[0] == 128;
#else
if (op->src[0]->ne[0] == 128) {
return true;
@@ -2953,7 +2974,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
#endif
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
@@ -3035,7 +3056,7 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
return false;
}
-#if CUDART_VERSION >= 11100
+#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
if (err != cudaSuccess) {
// clear the error
diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
index 15757ca1..607ded85 100644
--- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu
@@ -81,7 +81,7 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
} else if (order == GGML_SORT_ORDER_DESC) {
k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu
index 76cc01b2..62d115f1 100644
--- a/ggml/src/ggml-cuda/binbcast.cu
+++ b/ggml/src/ggml-cuda/binbcast.cu
@@ -259,7 +259,7 @@ static void ggml_cuda_op_bin_bcast(
} else {
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 07a53bcd..9aff6c13 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -12,6 +12,10 @@
#else
#define GGML_COMMON_DECL_CUDA
#define GGML_COMMON_IMPL_CUDA
+#if defined(GGML_USE_MUSA)
+#define GGML_COMMON_DECL_MUSA
+#define GGML_COMMON_IMPL_MUSA
+#endif
#endif
#include "ggml-common.h"
@@ -23,111 +27,11 @@
#include <vector>
#if defined(GGML_USE_HIPBLAS)
-#include <hip/hip_runtime.h>
-#include <hipblas/hipblas.h>
-#include <hip/hip_fp16.h>
-#ifdef __HIP_PLATFORM_AMD__
-// for rocblas_initialize()
-#include "rocblas/rocblas.h"
-#endif // __HIP_PLATFORM_AMD__
-#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
-#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
-#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
-#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_OP_N HIPBLAS_OP_N
-#define CUBLAS_OP_T HIPBLAS_OP_T
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_TF32_TENSOR_OP_MATH 0
-#define CUDA_R_16F HIPBLAS_R_16F
-#define CUDA_R_32F HIPBLAS_R_32F
-#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
-#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
-#define cublasCreate hipblasCreate
-#define cublasDestroy hipblasDestroy
-#define cublasGemmEx hipblasGemmEx
-#define cublasGemmBatchedEx hipblasGemmBatchedEx
-#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-#define cublasHandle_t hipblasHandle_t
-#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
-#define cublasSetStream hipblasSetStream
-#define cublasSgemm hipblasSgemm
-#define cublasStatus_t hipblasStatus_t
-#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
-#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
-#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
-#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
-#define cudaDeviceProp hipDeviceProp_t
-#define cudaDeviceSynchronize hipDeviceSynchronize
-#define cudaError_t hipError_t
-#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
-#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
-#define cudaEventCreateWithFlags hipEventCreateWithFlags
-#define cudaEventDisableTiming hipEventDisableTiming
-#define cudaEventRecord hipEventRecord
-#define cudaEventSynchronize hipEventSynchronize
-#define cudaEvent_t hipEvent_t
-#define cudaEventDestroy hipEventDestroy
-#define cudaFree hipFree
-#define cudaFreeHost hipHostFree
-#define cudaGetDevice hipGetDevice
-#define cudaGetDeviceCount hipGetDeviceCount
-#define cudaGetDeviceProperties hipGetDeviceProperties
-#define cudaGetErrorString hipGetErrorString
-#define cudaGetLastError hipGetLastError
-#define cudaHostRegister hipHostRegister
-#define cudaHostRegisterPortable hipHostRegisterPortable
-#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
-#define cudaHostUnregister hipHostUnregister
-#define cudaLaunchHostFunc hipLaunchHostFunc
-#define cudaMalloc hipMalloc
-#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
-#define cudaMemcpy hipMemcpy
-#define cudaMemcpyAsync hipMemcpyAsync
-#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
-#define cudaMemcpy2DAsync hipMemcpy2DAsync
-#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
-#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
-#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
-#define cudaMemcpyKind hipMemcpyKind
-#define cudaMemset hipMemset
-#define cudaMemsetAsync hipMemsetAsync
-#define cudaMemGetInfo hipMemGetInfo
-#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
-#define cudaSetDevice hipSetDevice
-#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
-#define cudaStreamDestroy hipStreamDestroy
-#define cudaStreamFireAndForget hipStreamFireAndForget
-#define cudaStreamNonBlocking hipStreamNonBlocking
-#define cudaStreamPerThread hipStreamPerThread
-#define cudaStreamSynchronize hipStreamSynchronize
-#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
-#define cudaStream_t hipStream_t
-#define cudaSuccess hipSuccess
-#define __trap() do { abort(); __builtin_unreachable(); } while(0)
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
-#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
-#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
-#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
-#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
-#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
-#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
-#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
+#include "vendors/hip.h"
+#elif defined(GGML_USE_MUSA)
+#include "vendors/musa.h"
#else
-#include <cuda_runtime.h>
-#include <cuda.h>
-#include <cublas_v2.h>
-#include <cuda_fp16.h>
-
-#if CUDART_VERSION < 11020
-#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
-#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
-#define CUBLAS_COMPUTE_16F CUDA_R_16F
-#define CUBLAS_COMPUTE_32F CUDA_R_32F
-#define cublasComputeType_t cudaDataType_t
-#endif // CUDART_VERSION < 11020
-
+#include "vendors/cuda.h"
#endif // defined(GGML_USE_HIPBLAS)
#define STRINGIZE_IMPL(...) #__VA_ARGS__
@@ -168,7 +72,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
-#if CUDART_VERSION >= 12000
+#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
static const char * cublas_get_error_str(const cublasStatus_t err) {
return cublasGetStatusString(err);
}
@@ -200,7 +104,7 @@ static const char * cu_get_error_str(CUresult err) {
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif
-#if CUDART_VERSION >= 11100
+#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else
#define GGML_CUDA_ASSUME(x)
@@ -212,93 +116,7 @@ typedef half2 dfloat2;
#else
typedef float dfloat; // dequantize float
typedef float2 dfloat2;
-#endif //GGML_CUDA_F16
-
-#if defined(GGML_USE_HIPBLAS)
-#define __CUDA_ARCH__ 1300
-
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
- defined(__gfx1150__) || defined(__gfx1151__)
-#define RDNA3
-#endif
-
-#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
- defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
-#define RDNA2
-#endif
-
-#if defined(__gfx1010__) || defined(__gfx1012__)
-#define RDNA1
-#endif
-
-#ifndef __has_builtin
- #define __has_builtin(x) 0
-#endif
-
-typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
-typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
-static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
- const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
- const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-#if __has_builtin(__builtin_elementwise_sub_sat)
- const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
- return reinterpret_cast<const int &>(c);
-#else
- int8x4_t c;
- int16_t tmp;
-#pragma unroll
- for (int i = 0; i < 4; i++) {
- tmp = va[i] - vb[i];
- if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
- if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
- c[i] = tmp;
- }
- return reinterpret_cast<int &>(c);
-#endif // __has_builtin(__builtin_elementwise_sub_sat)
-}
-
-static __device__ __forceinline__ int __vsub4(const int a, const int b) {
- return __vsubss4(a, b);
-}
-
-static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
- const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
- const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
- unsigned int c;
- uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
-#pragma unroll
- for (int i = 0; i < 4; ++i) {
- vc[i] = va[i] == vb[i] ? 0xff : 0x00;
- }
- return c;
-}
-
-static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
- const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
- const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
- unsigned int c;
- uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
-#pragma unroll
- for (int i = 0; i < 4; ++i) {
- vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
- }
- return c;
-}
-
-#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
-// __shfl_xor() for half2 was added in ROCm 5.6
-static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
- typedef union half2_b32 {
- half2 val;
- int b32;
- } half2_b32_t;
- half2_b32_t tmp;
- tmp.val = var;
- tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
- return tmp.val;
-}
-#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
-#endif // defined(GGML_USE_HIPBLAS)
+#endif // GGML_CUDA_F16
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#define FP16_AVAILABLE
@@ -348,7 +166,7 @@ static __device__ void no_device_code(
#ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
-#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
+#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__
static __device__ __forceinline__ float warp_reduce_sum(float x) {
@@ -455,7 +273,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
return mask_low | mask_high;
}
-#endif // CUDART_VERSION < 12000
+#endif // CUDART_VERSION < CUDART_HMASK
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
index f76c80dc..70305404 100644
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -162,7 +162,6 @@ static __global__ void dequantize_block_iq2_tn(const void * __restrict__ vx, dst
const int64_t tid = threadIdx.x;
const int64_t n = tid/32;
const int64_t l = tid - 32*n;
- const int64_t is = 8*n + l/16;
const uint8_t q = x[i].qs[32*n + l];
dst_t * y = yy + i*QK_K + 128*n;
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 3db57034..aad34bfe 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -451,7 +451,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
} else {
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
@@ -484,6 +484,6 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
} else {
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
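
From here on the patch replaces bare GGML_ASSERT(false) calls with GGML_ABORT(...), which carries a printf-style message. The macro itself lives in ggml.h rather than in these hunks; the shape assumed below is a sketch, not a quote of that header.

// assumed shape of the macro used throughout this patch (defined in ggml.h, not here):
#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
// void ggml_abort(const char * file, int line, const char * fmt, ...);
//   formats the message, reports file:line, then calls abort(), so every converted
//   call site can state a reason instead of asserting on a literal `false`.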
diff --git a/ggml/src/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu
index 174489e0..96a5adef 100644
--- a/ggml/src/ggml-cuda/dmmv.cu
+++ b/ggml/src/ggml-cuda/dmmv.cu
@@ -500,7 +500,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
}
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
const dim3 block_nums(block_num_y, 1, 1);
@@ -510,7 +510,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
}
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -519,7 +519,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
}
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -528,7 +528,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
}
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -537,7 +537,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
}
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -588,7 +588,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
}
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -662,7 +662,7 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
break;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
break;
}
@@ -672,3 +672,12 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
GGML_UNUSED(src1_ncols);
GGML_UNUSED(src1_padded_row_size);
}
+
+bool ggml_cuda_dmmv_type_supported(ggml_type src0_type) {
+ return src0_type == GGML_TYPE_Q4_0 || src0_type == GGML_TYPE_Q4_1 ||
+ src0_type == GGML_TYPE_Q5_0 || src0_type == GGML_TYPE_Q5_1 ||
+ src0_type == GGML_TYPE_Q8_0 || src0_type == GGML_TYPE_Q2_K ||
+ src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q4_K ||
+ src0_type == GGML_TYPE_Q5_K || src0_type == GGML_TYPE_Q6_K ||
+ src0_type == GGML_TYPE_F16;
+}
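
The tightened ncols % (GGML_CUDA_DMMV_X*2) == 0 assertions and the new ggml_cuda_dmmv_type_supported() predicate let callers decide up front whether the dequantize-mul-mat-vec path applies. A hypothetical gating helper (the name and placement are illustrative, not part of the patch):

// hypothetical helper for a dispatcher deciding whether to take the DMMV path
static bool can_use_dmmv(const ggml_tensor * src0) {
    const int64_t ne00 = src0->ne[0];            // number of columns of the quantized matrix
    return ggml_cuda_dmmv_type_supported(src0->type) &&
           ne00 % (GGML_CUDA_DMMV_X*2) == 0;     // mirrors the asserts tightened above
}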
diff --git a/ggml/src/ggml-cuda/dmmv.cuh b/ggml/src/ggml-cuda/dmmv.cuh
index 4c5ebd47..e727eb97 100644
--- a/ggml/src/ggml-cuda/dmmv.cuh
+++ b/ggml/src/ggml-cuda/dmmv.cuh
@@ -16,3 +16,5 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream);
+
+bool ggml_cuda_dmmv_type_supported(ggml_type src0_type);
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index f24312dd..950fd93d 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -564,7 +564,7 @@ static void on_no_fattn_vec_case(const int D) {
fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
fprintf(stderr, "By default only f16 KV cache is supported.\n");
fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for V cache quantization support.\n");
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
} else if (D == 128) {
fprintf(stderr, "Unsupported KV type combination for head_size 128.\n");
fprintf(stderr, "Supported combinations:\n");
@@ -572,11 +572,11 @@ static void on_no_fattn_vec_case(const int D) {
fprintf(stderr, " - K == q8_0, V == q8_0, 8.50 BPV\n");
fprintf(stderr, " - K == f16, V == f16, 16.00 BPV\n");
fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
} else {
fprintf(stderr, "Unsupported KV type combination for head_size 256.\n");
fprintf(stderr, "Only f16 is supported.\n");
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
diff --git a/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu
index c6c35134..1b2fd500 100644
--- a/ggml/src/ggml-cuda/fattn-tile-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu
@@ -287,7 +287,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
} break;
default: {
- GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
+ GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
} break;
}
}
diff --git a/ggml/src/ggml-cuda/fattn-tile-f32.cu b/ggml/src/ggml-cuda/fattn-tile-f32.cu
index 15e22f49..f3e68dbf 100644
--- a/ggml/src/ggml-cuda/fattn-tile-f32.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu
@@ -284,7 +284,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
} break;
default: {
- GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
+ GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
} break;
}
}
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
index 38d30b21..29f608b0 100644
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -38,7 +38,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst);
break;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
break;
}
} else {
@@ -63,7 +63,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
// ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
// break;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
break;
}
}
@@ -86,7 +86,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
break;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
break;
}
return;
@@ -114,7 +114,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
break;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
break;
}
return;
@@ -141,7 +141,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
break;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
break;
}
}
diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu
index 55af195f..4c370323 100644
--- a/ggml/src/ggml-cuda/getrows.cu
+++ b/ggml/src/ggml-cuda/getrows.cu
@@ -171,8 +171,7 @@ void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
break;
default:
// TODO: k-quants
- fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
- GGML_ASSERT(false);
+ GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
break;
}
}
diff --git a/ggml/src/ggml-cuda/iqk_mmvq.cu b/ggml/src/ggml-cuda/iqk_mmvq.cu
index 29721cdd..c567ad1a 100644
--- a/ggml/src/ggml-cuda/iqk_mmvq.cu
+++ b/ggml/src/ggml-cuda/iqk_mmvq.cu
@@ -466,7 +466,6 @@ __device__ __forceinline__ float vec_dot_iq3_k_q8_1(
const int * q8;
int sumi[4] = {0, 0, 0, 0};
- uint16_t v1, v2;
int v;
for (int i = 0; i < 2; ++i) {
uint32_t vl = ql[2*i+0] | (ql[2*i+1] << 16);
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 84f6387e..78d70cd7 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -84,7 +84,7 @@ void ggml_cuda_op_mul_mat_q(
mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
break;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
break;
}
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index f08a4758..e8a95744 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -75,7 +75,7 @@ static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
case GGML_TYPE_IQ4_NL:
return MMQ_Q8_1_DS_LAYOUT_D4;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
break;
}
}
@@ -2898,7 +2898,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
break;
default:
fprintf(stderr, "mmq_x_best=%d\n", mmq_x_best);
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
break;
}
}
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index 9eb3fa4f..2586ab7e 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -165,7 +165,7 @@ static void mul_mat_vec_q_cuda(
rows_per_cuda_block = 2;
break;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
break;
}
}
@@ -199,7 +199,7 @@ static void mul_mat_vec_q_cuda(
mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
break;
}
}
@@ -454,7 +454,7 @@ void ggml_cuda_op_mul_mat_vec_q(
mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
break;
}
diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu
index 30866d51..133e219f 100644
--- a/ggml/src/ggml-cuda/norm.cu
+++ b/ggml/src/ggml-cuda/norm.cu
@@ -142,8 +142,7 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
}
}
-static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
- static const float eps = 1e-6f;
+static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const float eps, const int group_size, const int ne_elements, cudaStream_t stream) {
if (group_size < 1024) {
const dim3 block_dims(WARP_SIZE, 1, 1);
group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
@@ -196,8 +195,12 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
GGML_ASSERT( dst->type == GGML_TYPE_F32);
int num_groups = dst->op_params[0];
+
+ float eps;
+ memcpy(&eps, dst->op_params + 1, sizeof(float));
+
int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
- group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], group_size, ggml_nelements(src0), stream);
+ group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], eps, group_size, ggml_nelements(src0), stream);
}
void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
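
The group-norm epsilon is no longer hard-coded to 1e-6f; it is read from op_params[1] next to the group count in op_params[0]. The producer side is not part of this diff, but the packing it implies looks roughly like this (the setter call is an assumption):

// assumed producer side in ggml_group_norm(): op_params[0] = n_groups, op_params[1] = eps bits
int32_t params[2] = { n_groups, 0 };
memcpy(&params[1], &eps, sizeof(float));              // bit-copy the float into the int32 slot
ggml_set_op_params(result, params, sizeof(params));   // hypothetical call site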
diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu
index aa7f1eff..45408ce8 100644
--- a/ggml/src/ggml-cuda/quantize.cu
+++ b/ggml/src/ggml-cuda/quantize.cu
@@ -163,7 +163,7 @@ void quantize_mmq_q8_1_cuda(
<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
break;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
break;
}
}
diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu
index 596fb7c1..99ec1dd9 100644
--- a/ggml/src/ggml-cuda/rope.cu
+++ b/ggml/src/ggml-cuda/rope.cu
@@ -251,7 +251,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
attn_factor, corr_dims, freq_factors, stream
);
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
} else {
if (src0->type == GGML_TYPE_F32) {
@@ -265,7 +265,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
attn_factor, corr_dims, freq_factors, stream
);
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
}
diff --git a/ggml/src/ggml-cuda/vendors/cuda.h b/ggml/src/ggml-cuda/vendors/cuda.h
new file mode 100644
index 00000000..db9f6a16
--- /dev/null
+++ b/ggml/src/ggml-cuda/vendors/cuda.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include <cublas_v2.h>
+#include <cuda_fp16.h>
+
+#if CUDART_VERSION < 11020
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
+#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
+#define CUBLAS_COMPUTE_16F CUDA_R_16F
+#define CUBLAS_COMPUTE_32F CUDA_R_32F
+#define cublasComputeType_t cudaDataType_t
+#endif // CUDART_VERSION < 11020
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
new file mode 100644
index 00000000..d0c37725
--- /dev/null
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -0,0 +1,177 @@
+#pragma once
+
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#ifdef __HIP_PLATFORM_AMD__
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+#endif // __HIP_PLATFORM_AMD__
+#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N HIPBLAS_OP_N
+#define CUBLAS_OP_T HIPBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH 0
+#define CUDA_R_16F HIPBLAS_R_16F
+#define CUDA_R_32F HIPBLAS_R_32F
+#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
+#define cublasCreate hipblasCreate
+#define cublasDestroy hipblasDestroy
+#define cublasGemmEx hipblasGemmEx
+#define cublasGemmBatchedEx hipblasGemmBatchedEx
+#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
+#define cublasHandle_t hipblasHandle_t
+#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+#define cublasSetStream hipblasSetStream
+#define cublasSgemm hipblasSgemm
+#define cublasStatus_t hipblasStatus_t
+#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
+#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
+#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
+#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaError_t hipError_t
+#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
+#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDisableTiming hipEventDisableTiming
+#define cudaEventRecord hipEventRecord
+#define cudaEventSynchronize hipEventSynchronize
+#define cudaEvent_t hipEvent_t
+#define cudaEventDestroy hipEventDestroy
+#define cudaFree hipFree
+#define cudaFreeHost hipHostFree
+#define cudaGetDevice hipGetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#define cudaHostRegister hipHostRegister
+#define cudaHostRegisterPortable hipHostRegisterPortable
+#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
+#define cudaHostUnregister hipHostUnregister
+#define cudaLaunchHostFunc hipLaunchHostFunc
+#define cudaMalloc hipMalloc
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
+#define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyKind hipMemcpyKind
+#define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
+#define cudaMemGetInfo hipMemGetInfo
+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+#define cudaSetDevice hipSetDevice
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamDestroy hipStreamDestroy
+#define cudaStreamFireAndForget hipStreamFireAndForget
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamPerThread hipStreamPerThread
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
+#define __trap() do { abort(); __builtin_unreachable(); } while(0)
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
+#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
+#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
+#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
+#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
+#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
+#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
+#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
+
+#define __CUDA_ARCH__ 1300
+
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
+ defined(__gfx1150__) || defined(__gfx1151__)
+#define RDNA3
+#endif
+
+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
+ defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
+#define RDNA2
+#endif
+
+#if defined(__gfx1010__) || defined(__gfx1012__)
+#define RDNA1
+#endif
+
+#ifndef __has_builtin
+ #define __has_builtin(x) 0
+#endif
+
+typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
+static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+ const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+ const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
+ const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+ return reinterpret_cast<const int &>(c);
+#else
+ int8x4_t c;
+ int16_t tmp;
+#pragma unroll
+ for (int i = 0; i < 4; i++) {
+ tmp = va[i] - vb[i];
+ if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+ if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+ c[i] = tmp;
+ }
+ return reinterpret_cast<int &>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
+}
+
+static __device__ __forceinline__ int __vsub4(const int a, const int b) {
+ return __vsubss4(a, b);
+}
+
+static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
+ const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
+ const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
+ unsigned int c;
+ uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+ for (int i = 0; i < 4; ++i) {
+ vc[i] = va[i] == vb[i] ? 0xff : 0x00;
+ }
+ return c;
+}
+
+static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
+ const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
+ const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
+ unsigned int c;
+ uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+ for (int i = 0; i < 4; ++i) {
+ vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
+ }
+ return c;
+}
+
+#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
+// __shfl_xor() for half2 was added in ROCm 5.6
+static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
+ typedef union half2_b32 {
+ half2 val;
+ int b32;
+ } half2_b32_t;
+ half2_b32_t tmp;
+ tmp.val = var;
+ tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
+ return tmp.val;
+}
+#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
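
For readers unfamiliar with the CUDA per-byte SIMD intrinsics being emulated here: each 32-bit word is treated as four byte lanes, __vcmpeq4/__vcmpne4 produce a 0xFF/0x00 mask per lane, and __vsub4 is simply aliased to the saturating __vsubss4. A quick illustration (values are arbitrary):

//   __vcmpeq4(0x01020304, 0x01FF0304) == 0xFF00FFFF   // only the 0x02 vs 0xFF lane differs
//   __vcmpne4(0x01020304, 0x01FF0304) == 0x00FF0000   // the complementary mask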
diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h
new file mode 100644
index 00000000..e50a103a
--- /dev/null
+++ b/ggml/src/ggml-cuda/vendors/musa.h
@@ -0,0 +1,171 @@
+#pragma once
+
+#include <musa_runtime.h>
+#include <musa.h>
+#include <mublas.h>
+#include <musa_fp16.h>
+#define CUBLAS_COMPUTE_16F CUDA_R_16F
+#define CUBLAS_COMPUTE_32F CUDA_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
+#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N MUBLAS_OP_N
+#define CUBLAS_OP_T MUBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
+#define CUDA_R_16F MUSA_R_16F
+#define CUDA_R_32F MUSA_R_32F
+#define cublasComputeType_t cudaDataType_t
+#define cublasCreate mublasCreate
+#define cublasDestroy mublasDestroy
+#define cublasGemmEx mublasGemmEx
+#define cublasGemmBatchedEx mublasGemmBatchedEx
+#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
+#define cublasHandle_t mublasHandle_t
+#define cublasSetMathMode mublasSetMathMode
+#define cublasSetStream mublasSetStream
+#define cublasSgemm mublasSgemm
+#define cublasStatus_t mublasStatus_t
+#define cublasGetStatusString mublasStatus_to_string
+#define cudaDataType_t musaDataType_t
+#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
+#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
+#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
+#define cudaDeviceProp musaDeviceProp
+#define cudaDeviceSynchronize musaDeviceSynchronize
+#define cudaError_t musaError_t
+#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
+#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
+#define cudaEventCreateWithFlags musaEventCreateWithFlags
+#define cudaEventDisableTiming musaEventDisableTiming
+#define cudaEventRecord musaEventRecord
+#define cudaEventSynchronize musaEventSynchronize
+#define cudaEvent_t musaEvent_t
+#define cudaEventDestroy musaEventDestroy
+#define cudaFree musaFree
+#define cudaFreeHost musaFreeHost
+#define cudaGetDevice musaGetDevice
+#define cudaGetDeviceCount musaGetDeviceCount
+#define cudaGetDeviceProperties musaGetDeviceProperties
+#define cudaGetErrorString musaGetErrorString
+#define cudaGetLastError musaGetLastError
+#define cudaHostRegister musaHostRegister
+#define cudaHostRegisterPortable musaHostRegisterPortable
+#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
+#define cudaHostUnregister musaHostUnregister
+#define cudaLaunchHostFunc musaLaunchHostFunc
+#define cudaMalloc musaMalloc
+#define cudaMallocHost musaMallocHost
+#define cudaMemcpy musaMemcpy
+#define cudaMemcpyAsync musaMemcpyAsync
+#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
+#define cudaMemcpy2DAsync musaMemcpy2DAsync
+#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
+#define cudaMemcpyKind musaMemcpyKind
+#define cudaMemset musaMemset
+#define cudaMemsetAsync musaMemsetAsync
+#define cudaMemGetInfo musaMemGetInfo
+#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
+#define cudaSetDevice musaSetDevice
+#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
+#define cudaStreamDestroy musaStreamDestroy
+#define cudaStreamFireAndForget musaStreamFireAndForget
+#define cudaStreamNonBlocking musaStreamNonBlocking
+#define cudaStreamPerThread musaStreamPerThread
+#define cudaStreamSynchronize musaStreamSynchronize
+#define cudaStreamWaitEvent musaStreamWaitEvent
+#define cudaStream_t musaStream_t
+#define cudaSuccess musaSuccess
+
+// Additional mappings for MUSA virtual memory pool
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
+#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
+#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
+#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
+#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
+#define CUdevice MUdevice
+#define CUdeviceptr MUdeviceptr
+#define CUmemAccessDesc MUmemAccessDesc
+#define CUmemAllocationProp MUmemAllocationProp
+#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
+#define cuDeviceGet muDeviceGet
+#define cuDeviceGetAttribute muDeviceGetAttribute
+#define cuMemAddressFree muMemAddressFree
+#define cuMemAddressReserve muMemAddressReserve
+#define cuMemCreate muMemCreate
+#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
+#define cuMemMap muMemMap
+#define cuMemRelease muMemRelease
+#define cuMemSetAccess muMemSetAccess
+#define cuMemUnmap muMemUnmap
+#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
+#define cudaFuncSetAttribute musaFuncSetAttribute
+#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
+#define make_cudaExtent make_musaExtent
+#define make_cudaPitchedPtr make_musaPitchedPtr
+
+// Additional mappings for MUSA graphs
+#define CUDA_SUCCESS MUSA_SUCCESS
+#define CUresult MUresult
+#define cuGetErrorString muGetErrorString
+#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
+#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
+#define cudaGraphDestroy musaGraphDestroy
+#define cudaGraphExecDestroy musaGraphExecDestroy
+#define cudaGraphExec_t musaGraphExec_t
+#define cudaGraphExecUpdate musaGraphExecUpdate
+#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
+#define cudaGraphGetNodes musaGraphGetNodes
+#define cudaGraphInstantiate musaGraphInstantiate
+#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
+#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
+#define cudaGraphLaunch musaGraphLaunch
+#define cudaGraphNodeGetType musaGraphNodeGetType
+#define cudaGraphNode_t musaGraphNode_t
+#define cudaGraphNodeType musaGraphNodeType
+#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
+#define cudaGraph_t musaGraph_t
+#define cudaKernelNodeParams musaKernelNodeParams
+#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
+#define cudaStreamEndCapture musaStreamEndCapture
+
+// XXX: Clang builtins mapping
+#define __vsub4 __vsub4_musa
+#define __vcmpeq4 __vcmpeq4_musa
+#define __vcmpne4 __vcmpne4_musa
+
+#ifndef __has_builtin
+ #define __has_builtin(x) 0
+#endif
+
+typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
+
+static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
+ return __vsubss4(a, b);
+}
+
+static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
+ const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
+ const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
+ unsigned int c;
+ uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+ for (int i = 0; i < 4; ++i) {
+ vc[i] = va[i] == vb[i] ? 0xff : 0x00;
+ }
+ return c;
+}
+
+static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
+ const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
+ const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
+ unsigned int c;
+ uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+ for (int i = 0; i < 4; ++i) {
+ vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
+ }
+ return c;
+}
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index a2c8dbec..190af081 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -80,8 +80,9 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
/**
* Converts float32 to brain16.
*
- * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
- * Subnormals shall be flushed to zero, and NANs will be quiet.
+ * This is binary identical with Google Brain float conversion.
+ * Floats shall round to nearest even, and NANs shall be quiet.
+ * Subnormals aren't flushed to zero, except perhaps when used.
* This code should vectorize nicely if using modern compilers.
*/
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
@@ -95,10 +96,6 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
h.bits = (u.i >> 16) | 64; /* force to quiet */
return h;
}
- if (!(u.i & 0x7f800000)) { /* subnormal */
- h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
- return h;
- }
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
return h;
}
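
The conversion above now always applies the round-to-nearest-even bias; the flush-to-zero branch for subnormal inputs is gone. For reference, how the bias term behaves on two tie cases:

// h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
// u.i = 0x40048000: discarded half is exactly 0x8000, kept half 0x4004 is even
//   0x40048000 + 0x7fff + 0 = 0x4004ffff  ->  >> 16 = 0x4004  (tie stays on the even value)
// u.i = 0x40058000: same tie, but kept half 0x4005 is odd
//   0x40058000 + 0x7fff + 1 = 0x40060000  ->  >> 16 = 0x4006  (tie rounds up to the even value)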
@@ -146,6 +143,7 @@ extern "C" {
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
+#include <sys/prctl.h>
#endif
// 16-bit float
@@ -634,21 +632,121 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
-#define GGML_HASHTABLE_FULL ((size_t)-1)
-#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
+// bitset
+
+static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
+#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
+#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
+
+static size_t ggml_bitset_size(size_t n) {
+ return (n + BITSET_MASK) >> BITSET_SHR;
+}
+
+static inline bool ggml_bitset_get(const ggml_bitset_t * bitset, size_t i) {
+ return !!(bitset[i >> BITSET_SHR] & (1u << (i & BITSET_MASK)));
+}
+
+static inline void ggml_bitset_set(ggml_bitset_t * bitset, size_t i) {
+ bitset[i >> BITSET_SHR] |= (1u << (i & BITSET_MASK));
+}
+
+static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
+ bitset[i >> BITSET_SHR] &= ~(1u << (i & BITSET_MASK));
+}
+
+// hash set
+
+#define GGML_HASHSET_FULL ((size_t)-1)
+#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
struct ggml_hash_set ggml_hash_set_new(size_t size);
+void ggml_hash_set_free(struct ggml_hash_set * hash_set);
-bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
+// returns the minimum size for a hash set that can hold min_sz elements
+size_t ggml_hash_size(size_t min_sz);
-// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
-size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
+// remove all elements from the hash set
+void ggml_hash_set_reset(struct ggml_hash_set * hash_set);
-// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
-size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key);
+// returns true if key is in the hash set
+static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
+
+// returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
+static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
+
+// returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
+static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
// return index, asserts if table is full
-size_t ggml_hash_find_or_insert( struct ggml_hash_set hash_set, struct ggml_tensor * key);
+static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
+
+// hash function for ggml_tensor
+static inline size_t ggml_hash(const struct ggml_tensor * p) {
+ // the last 4 bits are always zero due to alignment
+ return (size_t)(uintptr_t)p >> 4;
+}
+
+static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+ size_t h = ggml_hash(key) % hash_set->size;
+
+ // linear probing
+ size_t i = h;
+ while (ggml_bitset_get(hash_set->used, i) && hash_set->keys[i] != key) {
+ i = (i + 1) % hash_set->size;
+ if (i == h) {
+ // visited all hash table entries -> not found
+ return GGML_HASHSET_FULL;
+ }
+ }
+ return i;
+}
+
+static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+ size_t i = ggml_hash_find(hash_set, key);
+ return i != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, i);
+}
+
+static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+ size_t h = ggml_hash(key) % hash_set->size;
+
+ // linear probing
+ size_t i = h;
+ do {
+ if (!ggml_bitset_get(hash_set->used, i)) {
+ ggml_bitset_set(hash_set->used, i);
+ hash_set->keys[i] = key;
+ return i;
+ }
+ if (hash_set->keys[i] == key) {
+ return GGML_HASHSET_ALREADY_EXISTS;
+ }
+ i = (i + 1) % hash_set->size;
+ } while (i != h);
+
+ // visited all hash table entries -> not found
+ GGML_ABORT("fatal error");
+}
+
+static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+ size_t h = ggml_hash(key) % hash_set->size;
+
+ // linear probing
+ size_t i = h;
+ do {
+ if (!ggml_bitset_get(hash_set->used, i)) {
+ ggml_bitset_set(hash_set->used, i);
+ hash_set->keys[i] = key;
+ return i;
+ }
+ if (hash_set->keys[i] == key) {
+ return i;
+ }
+ i = (i + 1) % hash_set->size;
+ } while (i != h);
+
+ // visited all hash table entries -> not found
+ GGML_ABORT("fatal error");
+}
#ifdef __cplusplus
}
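
The hash-set API is now pointer-based and backed by the bitset helpers above, using linear probing over a fixed capacity. A minimal usage sketch, assuming a tensor pointer t obtained from a graph and an expected element count n:

struct ggml_hash_set hs = ggml_hash_set_new(n);          // capacity derived from n
if (ggml_hash_insert(&hs, t) == GGML_HASHSET_ALREADY_EXISTS) {
    // t was already recorded
}
bool seen = ggml_hash_contains(&hs, t);                  // true after the insert above
ggml_hash_set_reset(&hs);                                // clear entries, keep the allocation
ggml_hash_set_free(&hs);                                 // release keys and the used bitset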
diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp
index ed5f2e34..41ac63fa 100644
--- a/ggml/src/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute.cpp
@@ -566,7 +566,7 @@ uint32_t safe_divide(uint32_t a, uint32_t b) {
}
if ((a % b) != 0) {
fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b);
- GGML_ASSERT(!"safe_divide result would've had remainder");
+ GGML_ABORT("safe_divide result would've had remainder");
}
return a / b;
}
@@ -1460,7 +1460,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
if (!ggml_vk_supports_op(dst)) {
fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
- GGML_ASSERT(!"unsupported op");
+ GGML_ABORT("unsupported op");
}
const int32_t ne00 = src0 ? src0->ne[0] : 0;
@@ -1562,7 +1562,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
default:
{
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
} break;
@@ -1745,7 +1745,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
continue;
not_implemented: {}
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
- //GGML_ASSERT(false);
+ //GGML_ABORT("fatal error");
}
// Evaluate sequence
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 7d592c22..292f9ac7 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -260,7 +260,7 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_COUNT
};
-struct ggml_metal_context {
+struct ggml_backend_metal_context {
int n_cb;
id<MTLDevice> device;
@@ -274,6 +274,10 @@ struct ggml_metal_context {
bool support_simdgroup_mm;
bool should_capture_next_compute;
+
+ // abort ggml_metal_graph_compute if callback returns true
+ ggml_abort_callback abort_callback;
+ void * abort_callback_data;
};
// MSL code
@@ -339,7 +343,7 @@ static void * ggml_metal_host_malloc(size_t n) {
return data;
}
-static struct ggml_metal_context * ggml_metal_init(int n_cb) {
+static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
#if TARGET_OS_OSX && !GGML_METAL_NDEBUG
@@ -356,7 +360,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
// Configure context
- struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+ struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context));
ctx->device = device;
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
ctx->queue = [ctx->device newCommandQueue];
@@ -761,7 +765,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
return ctx;
}
-static void ggml_metal_free(struct ggml_metal_context * ctx) {
+static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
@@ -827,7 +831,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs
return nil;
}
-static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
+static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx, const struct ggml_tensor * op) {
for (size_t i = 0, n = 3; i < n; ++i) {
if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
return false;
@@ -938,7 +942,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
}
static enum ggml_status ggml_metal_graph_compute(
- struct ggml_metal_context * ctx,
+ struct ggml_backend_metal_context * ctx,
struct ggml_cgraph * gf) {
@autoreleasepool {
@@ -962,7 +966,7 @@ static enum ggml_status ggml_metal_graph_compute(
NSError * error = nil;
if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
- GGML_ASSERT(!"capture failed");
+ GGML_ABORT("capture failed");
}
}
@@ -971,8 +975,11 @@ static enum ggml_status ggml_metal_graph_compute(
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
command_buffer_builder[cb_idx] = command_buffer;
- // enqueue the command buffers in order to specify their execution order
- [command_buffer enqueue];
+ // always enqueue the first two command buffers
+ // enqueue all of the command buffers if we don't need to abort
+ if (cb_idx < 2 || ctx->abort_callback == NULL) {
+ [command_buffer enqueue];
+ }
}
const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;
@@ -1024,7 +1031,7 @@ static enum ggml_status ggml_metal_graph_compute(
if (!ggml_metal_supports_op(ctx, dst)) {
GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
- GGML_ASSERT(!"unsupported op");
+ GGML_ABORT("unsupported op");
}
if (should_capture) {
@@ -1207,7 +1214,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
- default: GGML_ASSERT(false);
+ default: GGML_ABORT("fatal error");
}
bcast_row = true;
@@ -1216,7 +1223,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
- default: GGML_ASSERT(false);
+ default: GGML_ABORT("fatal error");
}
}
@@ -1270,7 +1277,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break;
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break;
case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break;
- default: GGML_ASSERT(false);
+ default: GGML_ABORT("fatal error");
}
[encoder setComputePipelineState:pipeline];
@@ -1526,7 +1533,7 @@ static enum ggml_status ggml_metal_graph_compute(
default:
{
GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
} break;
case GGML_OP_SQR:
@@ -1756,7 +1763,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_TYPE_IQ4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_K_F32 ].pipeline; break;
case GGML_TYPE_IQ5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ5_K_F32 ].pipeline; break;
case GGML_TYPE_IQ6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ6_K_F32 ].pipeline; break;
- default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
+ default: GGML_ABORT("MUL MAT-MAT not implemented");
}
[encoder setComputePipelineState:pipeline];
@@ -1977,7 +1984,7 @@ static enum ggml_status ggml_metal_graph_compute(
default:
{
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
- GGML_ASSERT(false && "not implemented");
+ GGML_ABORT("not implemented");
}
};
@@ -2117,7 +2124,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_TYPE_IQ4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_K_F32 ].pipeline; break;
case GGML_TYPE_IQ5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ5_K_F32 ].pipeline; break;
case GGML_TYPE_IQ6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ6_K_F32 ].pipeline; break;
- default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
+ default: GGML_ABORT("MUL_MAT_ID not implemented");
}
[encoder setComputePipelineState:pipeline];
@@ -2332,7 +2339,7 @@ static enum ggml_status ggml_metal_graph_compute(
default:
{
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
- GGML_ASSERT(false && "not implemented");
+ GGML_ABORT("not implemented");
}
};
@@ -2443,7 +2450,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_TYPE_IQ5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ5_K ].pipeline; break;
case GGML_TYPE_IQ6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ6_K ].pipeline; break;
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break;
- default: GGML_ASSERT(false && "not implemented");
+ default: GGML_ABORT("not implemented");
}
[encoder setComputePipelineState:pipeline];
@@ -2494,10 +2501,8 @@ static enum ggml_status ggml_metal_graph_compute(
GGML_ASSERT(ne00 % 4 == 0);
GGML_ASSERT(ggml_is_contiguous(src0));
- //float eps;
- //memcpy(&eps, dst->op_params, sizeof(float));
-
- const float eps = 1e-6f; // TODO: temporarily hardcoded
+ float eps;
+ memcpy(&eps, dst->op_params + 1, sizeof(float));
const int32_t n_groups = ((int32_t *) dst->op_params)[0];
@@ -2581,13 +2586,13 @@ static enum ggml_status ggml_metal_graph_compute(
switch (src0->type) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
- default: GGML_ASSERT(false);
+ default: GGML_ABORT("fatal error");
};
} else {
switch (src0->type) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
- default: GGML_ASSERT(false);
+ default: GGML_ABORT("fatal error");
};
}
@@ -2664,7 +2669,7 @@ static enum ggml_status ggml_metal_graph_compute(
switch (dst->type) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break;
- default: GGML_ASSERT(false);
+ default: GGML_ABORT("fatal error");
};
[encoder setComputePipelineState:pipeline];
@@ -2821,7 +2826,7 @@ static enum ggml_status ggml_metal_graph_compute(
switch (order) {
case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break;
case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
- default: GGML_ASSERT(false);
+ default: GGML_ABORT("fatal error");
};
[encoder setComputePipelineState:pipeline];
@@ -2910,7 +2915,7 @@ static enum ggml_status ggml_metal_graph_compute(
{
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
GGML_METAL_LOG_ERROR("add template specialization for this size\n");
- GGML_ASSERT(false && "add template specialization for this size");
+ GGML_ABORT("add template specialization for this size");
}
}
} else {
@@ -2923,7 +2928,7 @@ static enum ggml_status ggml_metal_graph_compute(
{
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
GGML_METAL_LOG_ERROR("add template specialization for this size\n");
- GGML_ASSERT(false && "add template specialization for this size");
+ GGML_ABORT("add template specialization for this size");
}
}
}
@@ -3044,7 +3049,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break;
case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break;
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL].pipeline; break;
- default: GGML_ASSERT(false && "not implemented");
+ default: GGML_ABORT("not implemented");
};
} break;
case GGML_TYPE_F16:
@@ -3052,10 +3057,10 @@ static enum ggml_status ggml_metal_graph_compute(
switch (dstt) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break;
- default: GGML_ASSERT(false && "not implemented");
+ default: GGML_ABORT("not implemented");
};
} break;
- default: GGML_ASSERT(false && "not implemented");
+ default: GGML_ABORT("not implemented");
}
[encoder setComputePipelineState:pipeline];
@@ -3083,7 +3088,7 @@ static enum ggml_status ggml_metal_graph_compute(
default:
{
GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
@@ -3094,7 +3099,9 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder endEncoding];
- [command_buffer commit];
+ if (cb_idx < 2 || ctx->abort_callback == NULL) {
+ [command_buffer commit];
+ }
});
// Wait for completion and check status of each command buffer
@@ -3114,6 +3121,23 @@ static enum ggml_status ggml_metal_graph_compute(
return GGML_STATUS_FAILED;
}
+
+ id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? command_buffers[i + 1] : nil);
+ if (!next_buffer) {
+ continue;
+ }
+
+ bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
+ if (next_queued) {
+ continue;
+ }
+
+ if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
+ GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i);
+ return GGML_STATUS_ABORTED;
+ }
+
+ [next_buffer commit];
}
if (should_capture) {
@@ -3417,7 +3441,7 @@ GGML_CALL static const char * ggml_backend_metal_name(ggml_backend_t backend) {
}
GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) {
- struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+ struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
ggml_metal_free(ctx);
free(backend);
}
@@ -3429,13 +3453,13 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe
}
GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
- struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
+ struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context;
return ggml_metal_graph_compute(metal_ctx, cgraph);
}
GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
- struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
+ struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context;
return ggml_metal_supports_op(metal_ctx, op);
}
@@ -3480,9 +3504,9 @@ static ggml_guid_t ggml_backend_metal_guid(void) {
}
ggml_backend_t ggml_backend_metal_init(void) {
- struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
-
+ struct ggml_backend_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
if (ctx == NULL) {
+ GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
return NULL;
}
@@ -3504,15 +3528,24 @@ bool ggml_backend_is_metal(ggml_backend_t backend) {
void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
GGML_ASSERT(ggml_backend_is_metal(backend));
- struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+ struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
}
+void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data) {
+ GGML_ASSERT(ggml_backend_is_metal(backend));
+
+ struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
+
+ ctx->abort_callback = abort_callback;
+ ctx->abort_callback_data = user_data;
+}
+
bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
GGML_ASSERT(ggml_backend_is_metal(backend));
- struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+ struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
}
@@ -3520,7 +3553,7 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
GGML_ASSERT(ggml_backend_is_metal(backend));
- struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+ struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
ctx->should_capture_next_compute = true;
}
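
With only the first two command buffers enqueued eagerly, the Metal backend can consult the abort callback before committing each later buffer and return GGML_STATUS_ABORTED early. A minimal client-side sketch of wiring this up (flag and helper names are illustrative):

static volatile bool g_abort_requested = false;

static bool metal_abort_cb(void * data) {
    (void) data;
    return g_abort_requested;   // e.g. flipped from a signal handler or another thread
}

static ggml_backend_t init_metal_with_abort(void) {
    ggml_backend_t backend = ggml_backend_metal_init();
    ggml_backend_metal_set_abort_callback(backend, metal_abort_cb, NULL);
    return backend;             // graph computes on this backend can now be interrupted
}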
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 99bd682f..41362dee 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -3952,7 +3952,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
float sumf = 0;
#if defined(__ARM_FEATURE_SVE)
- if (svcntb() == QK8_0) {
+ if (ggml_sve_cnt_b == QK8_0) {
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
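
svcntb() is replaced by the global ggml_sve_cnt_b, so the dot-product kernels read a cached SVE vector length instead of re-querying it; the <sys/prctl.h> include added to ggml-impl.h suggests the cache is filled via prctl. The definition is not in these hunks; the initializer below is a hedged guess at its shape:

extern int ggml_sve_cnt_b;                // defined elsewhere in ggml (not in this diff)

// assumed one-time setup, e.g. during ggml initialization (hypothetical helper):
static void ggml_init_sve_cnt_b(void) {
#if defined(__ARM_FEATURE_SVE)
    if (ggml_sve_cnt_b == 0) {
        ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
    }
#endif
}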
@@ -4324,15 +4324,18 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
#endif
for (; ib < nb; ++ib) {
- int sumi = 0;
+ int sumi0 = 0;
+ int sumi1 = 0;
for (int j = 0; j < qk/2; ++j) {
const int v0 = (x[ib].qs[j] & 0x0F) - 8;
const int v1 = (x[ib].qs[j] >> 4) - 8;
- sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]);
+ sumi0 += (v0 * y[ib].qs[j]);
+ sumi1 += (v1 * y[ib].qs[j + qk/2]);
}
+ int sumi = sumi0 + sumi1;
sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
}
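
The scalar tail now accumulates the low-nibble and high-nibble products separately (sumi0/sumi1) and combines them once per block. For reference, the nibble decode inside that loop:

// one q4_0 byte packs two 4-bit codes, each stored with a bias of 8:
const uint8_t byte = 0xC3;          // illustrative value
const int v0 = (byte & 0x0F) - 8;   // low nibble  0x3 ->  3 - 8 = -5
const int v1 = (byte >> 4)   - 8;   // high nibble 0xC -> 12 - 8 =  4
// v0 multiplies y[ib].qs[j], v1 multiplies y[ib].qs[j + qk/2]; the block scales
// d(x)*d(y) are applied once to the combined sum sumi0 + sumi1.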
@@ -4613,15 +4616,18 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
sumf = hsum_float_8(acc) + summs;
#endif
for (; ib < nb; ++ib) {
- int sumi = 0;
+ int sumi0 = 0;
+ int sumi1 = 0;
for (int j = 0; j < qk/2; ++j) {
const int v0 = (x[ib].qs[j] & 0x0F);
const int v1 = (x[ib].qs[j] >> 4);
- sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]);
+ sumi0 += (v0 * y[ib].qs[j]);
+ sumi1 += (v1 * y[ib].qs[j + qk/2]);
}
+ int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
}
@@ -4967,18 +4973,21 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
uint32_t qh;
memcpy(&qh, x[ib].qh, sizeof(qh));
- int sumi = 0;
+ int sumi0 = 0;
+ int sumi1 = 0;
for (int j = 0; j < qk/2; ++j) {
const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
- const int32_t x0 = ((x[ib].qs[j] & 0x0F) | xh_0) - 16;
- const int32_t x1 = ((x[ib].qs[j] >> 4) | xh_1) - 16;
+ const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
+ const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
- sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]);
+ sumi0 += (x0 * y[ib].qs[j]);
+ sumi1 += (x1 * y[ib].qs[j + qk/2]);
}
+ int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
}
@@ -5343,7 +5352,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
uint32_t qh;
memcpy(&qh, x[ib].qh, sizeof(qh));
- int sumi = 0;
+ int sumi0 = 0;
+ int sumi1 = 0;
for (int j = 0; j < qk/2; ++j) {
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
@@ -5352,9 +5362,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
- sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]);
+ sumi0 += (x0 * y[ib].qs[j]);
+ sumi1 += (x1 * y[ib].qs[j + qk/2]);
}
+ int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
}
@@ -5445,7 +5457,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
float sumf = 0;
#if defined(__ARM_FEATURE_SVE)
- if (svcntb() == QK8_0) {
+ if (ggml_sve_cnt_b == QK8_0) {
svfloat32_t sumv0 = svdup_n_f32(0.0f);
svfloat32_t sumv1 = svdup_n_f32(0.0f);
@@ -6591,22 +6603,22 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
// compute mask for subtraction
vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
- vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
+ vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
m <<= 1;
vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
- vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
+ vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
m <<= 1;
vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
- vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
+ vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
m <<= 1;
vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
- vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
+ vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
m <<= 1;
// load Q8 and take product with Q3
@@ -7862,13 +7874,13 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
- vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
+ vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
m <<= 1;
vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
- vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
+ vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
m <<= 1;
vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
@@ -12851,7 +12863,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
printf("Oops: found point %u not on grid:", u);
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
printf("\n");
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
q2[2*ib+0] |= ((uint32_t) grid_index << 8*k);
q2[2*ib+1] |= (block_signs[k] << 7*k);
@@ -13030,7 +13042,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
printf("Oops: found point %u not on grid:", u);
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
printf("\n");
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
q2[2*ib+k] = grid_index | (block_signs[k] << 9);
}
@@ -13473,7 +13485,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
printf("Oops: found point %u not on grid:", u);
for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
printf("\n");
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
if (grid_size == 256) {
q3[8*ib+k] = grid_index;
@@ -13686,7 +13698,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
printf("Oops: found point %u not on grid:", u);
for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
printf("\n");
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
qs[k] = grid_index & 255;
qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8));
@@ -14662,7 +14674,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
printf("Oops: found point %u not on grid:", u);
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
printf("\n");
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
const int i8 = 2*ib + k;
y[ibl].qs[i8] = grid_index & 255;
@@ -14782,7 +14794,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
}
if (nbytes % ggml_type_size(type) != 0) {
- fprintf(stderr, "%s: invalid size %zu for type %d\n", __func__, nbytes, type);
+ fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type));
return false;
}
diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
index 91063633..775aa875 100644
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -146,6 +146,10 @@ void iq2xs_free_impl(enum ggml_type type);
void iq3xs_init_impl(int grid_size);
void iq3xs_free_impl(int grid_size);
+#if defined(__ARM_FEATURE_SVE)
+extern int ggml_sve_cnt_b;
+#endif
+
#ifdef __cplusplus
}
#endif
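
Exporting ggml_sve_cnt_b lets the SVE kernels compare against a cached vector length instead of calling svcntb() on every dot product. A rough sketch of the declare-in-header / define-once pattern; the init function is illustrative (the real value is set during ggml initialization), while svcntb() is the ACLE intrinsic from <arm_sve.h>.

    // "header" part (sketch): declaration shared by the kernel translation units
    #if defined(__ARM_FEATURE_SVE)
    extern int ggml_sve_cnt_b;
    #endif

    // "source" part (sketch): single definition, filled in once at init time
    #if defined(__ARM_FEATURE_SVE)
    #include <arm_sve.h>
    int ggml_sve_cnt_b = 0;
    #endif

    void init_sve_cnt() {
    #if defined(__ARM_FEATURE_SVE)
        if (ggml_sve_cnt_b == 0) {
            ggml_sve_cnt_b = (int) svcntb(); // SVE vector length in bytes (e.g. 32 on 256-bit SVE)
        }
    #endif
    }

    // the kernels then branch on the cached value instead of re-querying per call:
    //   if (ggml_sve_cnt_b == QK8_0) { ... 256-bit SVE path ... }
    int main() { init_sve_cnt(); return 0; }
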
diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp
index b01ad267..7757615f 100644
--- a/ggml/src/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc.cpp
@@ -197,6 +197,10 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
fprintf(stderr, "Failed to set SO_REUSEADDR\n");
return nullptr;
}
+ if (inet_addr(host) == INADDR_NONE) {
+ fprintf(stderr, "Invalid host address: %s\n", host);
+ return nullptr;
+ }
struct sockaddr_in serv_addr;
serv_addr.sin_family = AF_INET;
serv_addr.sin_addr.s_addr = inet_addr(host);
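
The added guard rejects host strings that inet_addr() cannot parse before they would otherwise be bound as INADDR_NONE. A standalone illustration of that check (POSIX sockets headers; the host strings are made up):

    #include <arpa/inet.h>
    #include <cstdio>

    static bool valid_ipv4(const char * host) {
        // inet_addr() returns INADDR_NONE for anything it cannot parse; note that this
        // is also the encoding of 255.255.255.255, which the guard therefore rejects too.
        return inet_addr(host) != INADDR_NONE;
    }

    int main() {
        const char * hosts[] = { "127.0.0.1", "0.0.0.0", "not-an-ip", "999.1.1.1" };
        for (const char * h : hosts) {
            std::printf("%-12s -> %s\n", h, valid_ipv4(h) ? "ok" : "rejected");
        }
        return 0;
    }
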
@@ -879,6 +883,14 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp
if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
return nullptr;
}
+
+ // require that the tensor data does not go beyond the buffer end
+ uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
+ uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
+ uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
+ GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
+ GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
+
result->op = (ggml_op) tensor->op;
for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
result->op_params[i] = tensor->op_params[i];
@@ -898,7 +910,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
uint64_t offset;
memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
- size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset);
+ const size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset);
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead(),
@@ -913,6 +925,17 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
return false;
}
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
+
+ // sanitize tensor->data
+ {
+ const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
+ const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
+
+ if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
+ GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
+ }
+ }
+
const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset);
ggml_backend_tensor_set(tensor, data, offset, size);
ggml_free(ctx);
@@ -943,6 +966,17 @@ bool rpc_server::get_tensor(const std::vector<uint8_t> & input, std::vector<uint
return false;
}
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
+
+ // sanitize tensor->data
+ {
+ const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
+ const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
+
+ if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
+ GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
+ }
+ }
+
// output serialization format: | data (size bytes) |
output.resize(size, 0);
ggml_backend_tensor_get(tensor, output.data(), offset, size);
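
Both set_tensor and get_tensor now refuse requests whose (data + offset, size) window falls outside the destination buffer. The same bounds check, written as a standalone helper over integer addresses (names are illustrative; the real code aborts via GGML_ABORT rather than returning false):

    #include <cstdint>
    #include <cstdio>

    // Returns true when [data + offset, data + offset + size) lies inside [base, base + buf_size).
    static bool range_in_buffer(uint64_t base, uint64_t buf_size,
                                uint64_t data, uint64_t offset, uint64_t size) {
        const uint64_t p0    = base;
        const uint64_t p1    = base + buf_size;
        const uint64_t start = data + offset;
        if (start < data)            return false; // offset overflowed
        if (start < p0 || start >= p1) return false; // start outside the buffer
        return size <= p1 - start;                  // end stays inside the buffer
    }

    int main() {
        const uint64_t base = 0x1000, buf = 256;
        std::printf("%d\n", range_in_buffer(base, buf, base,      0,          256)); // 1: exact fit
        std::printf("%d\n", range_in_buffer(base, buf, base + 16, 0,          256)); // 0: runs past the end
        std::printf("%d\n", range_in_buffer(base, buf, base,      UINT64_MAX, 1));   // 0: offset overflow
        return 0;
    }
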
diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
index 36518ff9..d8eb86c2 100644
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -1723,7 +1723,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
});
});
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
@@ -2075,8 +2075,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
// GGML_SYCL_DEBUG("current device index %d\n", id);
src_ptr = (char *) extra->data_device[id];
} else {
- // GGML_SYCL_DEBUG("GGML_ASSERT(false)\n");
- GGML_ASSERT(false);
+ // GGML_SYCL_DEBUG("GGML_ABORT("fatal error")\n");
+ GGML_ABORT("fatal error");
}
char * dst_ptr = (char *) dst;
@@ -2163,7 +2163,7 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_te
default:
// TODO: k-quants
fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
break;
}
}
@@ -2192,7 +2192,7 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
} else {
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
@@ -2476,7 +2476,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
case GGML_TYPE_Q6_K:
return 64;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
@@ -3101,7 +3101,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
if (convert_src1_to_q8_1 && !src1_is_contiguous) {
@@ -3896,7 +3896,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
} else {
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
(void) dst;
@@ -3981,6 +3981,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
ggml_sycl_func_t func;
switch (tensor->op) {
+ case GGML_OP_CONV_TRANSPOSE_1D:
+ func = ggml_sycl_op_conv_transpose_1d;
+ break;
case GGML_OP_REPEAT:
func = ggml_sycl_repeat;
break;
@@ -4105,6 +4108,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
case GGML_OP_ARGSORT:
func = ggml_sycl_argsort;
break;
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ func = ggml_sycl_op_timestep_embedding;
+ break;
default:
return false;
}
@@ -5090,6 +5096,15 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back
GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
switch (op->op) {
+ case GGML_OP_CONV_TRANSPOSE_1D:
+ {
+ ggml_type src0_type = op->src[0]->type;
+ ggml_type src1_type = op->src[1]->type;
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
+ return true;
+ }
+ return false;
+ } break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_GELU:
@@ -5213,6 +5228,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_LEAKY_RELU:
+ case GGML_OP_TIMESTEP_EMBEDDING:
return true;
default:
return false;
diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp
index 067181de..58dd9c9a 100644
--- a/ggml/src/ggml-sycl/backend.hpp
+++ b/ggml/src/ggml-sycl/backend.hpp
@@ -15,6 +15,7 @@
#include "concat.hpp"
#include "common.hpp"
+#include "conv.hpp"
#include "convert.hpp"
#include "dequantize.hpp"
#include "dmmv.hpp"
@@ -23,5 +24,6 @@
#include "rope.hpp"
#include "norm.hpp"
#include "softmax.hpp"
+#include "tsembd.hpp"
#endif // GGML_SYCL_BACKEND_HPP
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 397bd98d..86d8b40e 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -100,7 +100,7 @@ static void crash() {
const char* msg) {
fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
fprintf(stderr, " in function %s at %s:%d\n", func, file, line);
- GGML_ASSERT(!"SYCL error");
+ GGML_ABORT("SYCL error");
}
#define SYCL_CHECK(err) \
diff --git a/ggml/src/ggml-sycl/conv.cpp b/ggml/src/ggml-sycl/conv.cpp
new file mode 100644
index 00000000..bc4ab1dd
--- /dev/null
+++ b/ggml/src/ggml-sycl/conv.cpp
@@ -0,0 +1,99 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#include "conv.hpp"
+
+static void conv_transpose_1d_kernel(
+ const int s0, const int output_size,
+ const int src0_ne0, const int src0_ne1, const int src0_ne2,
+ const int src1_ne0, const int dst_ne0,
+ const float * src0, const float * src1, float * dst,
+ const sycl::nd_item<3> &item_ct1) {
+ int global_index = item_ct1.get_local_id(2) +
+ item_ct1.get_group(2) * item_ct1.get_local_range(2);
+ if (global_index >= output_size) {
+ return;
+ }
+
+ int out_index = global_index / dst_ne0;
+
+ float accumulator = 0;
+
+ for (int c = 0; c < src0_ne2; c++) {
+ int idx = global_index % dst_ne0;
+
+ int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0);
+ int input_offset = src1_ne0 * c;
+
+ for (int i = 0; i < src1_ne0; i++) {
+ if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) {
+ continue;
+ }
+ int weight_idx = idx - i*s0;
+
+ float kernel_weight = src0[kernel_offset + weight_idx];
+ float input_value = src1[input_offset+i];
+
+ accumulator += kernel_weight * input_value;
+ }
+ }
+ dst[global_index] = accumulator;
+}
+
+static void conv_transpose_1d_f32_f32_sycl(
+ const int s0, const int output_size,
+ const int src0_ne0, const int src0_ne1, const int src0_ne2,
+ const int src1_ne0, const int dst_ne0,
+ const float *src0, const float *src1, float *dst,
+ const queue_ptr& stream) {
+
+ const int num_blocks = (output_size + SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE;
+ const sycl::range<3> block_dims(1, 1, SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE);
+ const sycl::range<3> block_nums(1, 1, num_blocks);
+ stream->parallel_for(
+ sycl::nd_range<3>(
+ block_nums * block_dims, block_dims),
+ [=](sycl::nd_item<3> item_ct1) {
+ conv_transpose_1d_kernel(
+ s0, output_size,
+ src0_ne0, src0_ne1, src0_ne2,
+ src1_ne0, dst_ne0,
+ src0, src1, dst, item_ct1);
+ });
+}
+
+void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+ const ggml_tensor *src1, ggml_tensor *dst) {
+ const float * src0_d = (const float *)src0->data;
+ const float * src1_d = (const float *)src1->data;
+
+ float * dst_d = (float *)dst->data;
+ dpct::queue_ptr stream = ctx.stream();
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+ GGML_ASSERT(ggml_is_contiguous(src0));
+ GGML_ASSERT(ggml_is_contiguous(src1));
+
+ const int32_t * opts = (const int32_t *)dst->op_params;
+
+ const int s0 = opts[0];
+
+ const int64_t output_size = ggml_nelements(dst);
+
+ conv_transpose_1d_f32_f32_sycl(s0, output_size,
+ src0->ne[0], src0->ne[1], src0->ne[2],
+ src1->ne[0], dst->ne[0],
+ src0_d, src1_d, dst_d, stream);
+}
+
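
The kernel assigns one work-item per output element: out_index selects the output channel, idx the output position, and the loops accumulate kernel times input over the input channels and the input positions that land on idx. A plain-C++ reference with the same indexing, assuming the layout implied by the kernel above (src0 contiguous as [K, Cout, Cin], src1 as [L, Cin], dst as [(L-1)*s0 + K, Cout]); the data is made up.

    #include <cstdio>
    #include <vector>

    static void conv_transpose_1d_ref(int s0, int K, int Cout, int Cin, int L,
                                      const float * src0, const float * src1, float * dst) {
        const int dst_ne0 = (L - 1) * s0 + K;           // output length
        for (int oc = 0; oc < Cout; ++oc) {             // out_index in the kernel
            for (int idx = 0; idx < dst_ne0; ++idx) {   // global_index % dst_ne0
                float acc = 0.0f;
                for (int c = 0; c < Cin; ++c) {
                    const float * w = src0 + K * Cout * c + oc * K; // kernel_offset
                    const float * x = src1 + L * c;                 // input_offset
                    for (int i = 0; i < L; ++i) {
                        if (idx >= i * s0 && idx < i * s0 + K) {
                            acc += w[idx - i * s0] * x[i];
                        }
                    }
                }
                dst[oc * dst_ne0 + idx] = acc;
            }
        }
    }

    int main() {
        const int s0 = 2, K = 3, Cout = 1, Cin = 1, L = 4;
        std::vector<float> src0 = {1, 1, 1};    // kernel
        std::vector<float> src1 = {1, 2, 3, 4}; // input
        std::vector<float> dst((L - 1) * s0 + K, 0.0f);
        conv_transpose_1d_ref(s0, K, Cout, Cin, L, src0.data(), src1.data(), dst.data());
        for (float v : dst) std::printf("%.0f ", v);
        std::printf("\n");
        return 0;
    }
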
diff --git a/ggml/src/ggml-sycl/conv.hpp b/ggml/src/ggml-sycl/conv.hpp
new file mode 100644
index 00000000..eb20730f
--- /dev/null
+++ b/ggml/src/ggml-sycl/conv.hpp
@@ -0,0 +1,21 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_CONV_HPP
+#define GGML_SYCL_CONV_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+ const ggml_tensor *src1, ggml_tensor *dst);
+
+#endif // GGML_SYCL_CONV_HPP
diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp
index 70a94fc1..ae45630e 100644
--- a/ggml/src/ggml-sycl/dmmv.cpp
+++ b/ggml/src/ggml-sycl/dmmv.cpp
@@ -1011,7 +1011,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
break;
default:
printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
break;
}
diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index 4aaa76bf..fe4a8f74 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -874,7 +874,7 @@ namespace dpct
inline std::string get_preferred_gpu_platform_name() {
std::string result;
- std::string filter = "level-zero";
+ std::string filter = "";
char* env = getenv("ONEAPI_DEVICE_SELECTOR");
if (env) {
if (std::strstr(env, "level_zero")) {
@@ -892,11 +892,24 @@ namespace dpct
else {
throw std::runtime_error("invalid device filter: " + std::string(env));
}
+ } else {
+ auto default_device = sycl::device(sycl::default_selector_v);
+ auto default_platform_name = default_device.get_platform().get_info<sycl::info::platform::name>();
+
+ if (std::strstr(default_platform_name.c_str(), "Level-Zero") || default_device.is_cpu()) {
+ filter = "level-zero";
+ }
+ else if (std::strstr(default_platform_name.c_str(), "CUDA")) {
+ filter = "cuda";
+ }
+ else if (std::strstr(default_platform_name.c_str(), "HIP")) {
+ filter = "hip";
+ }
}
- auto plaform_list = sycl::platform::get_platforms();
+ auto platform_list = sycl::platform::get_platforms();
- for (const auto& platform : plaform_list) {
+ for (const auto& platform : platform_list) {
auto devices = platform.get_devices();
auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
return d.is_gpu();
@@ -975,7 +988,7 @@ namespace dpct
if (backend == "opencl:cpu") return 4;
if (backend == "opencl:acc") return 5;
printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
static bool compare_backend(std::string &backend1, std::string &backend2) {
return convert_backend_index(backend1) < convert_backend_index(backend2);
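
With the hard-coded "level-zero" default removed, the helper derives the platform filter either from ONEAPI_DEVICE_SELECTOR or from the platform of the default SYCL device. A small standalone sketch of just that string mapping; the env-variable name and the prefixes come from the diff, the rest (including the omission of the opencl selectors the real helper also handles) is illustrative.

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>
    #include <stdexcept>
    #include <string>

    static std::string backend_filter(const char * selector, const std::string & default_platform) {
        if (selector != nullptr) {
            if (std::strstr(selector, "level_zero")) return "level-zero";
            if (std::strstr(selector, "cuda"))       return "cuda";
            if (std::strstr(selector, "hip"))        return "hip";
            throw std::runtime_error("invalid device filter: " + std::string(selector));
        }
        // no env override: fall back to whatever platform the default device reports
        if (default_platform.find("Level-Zero") != std::string::npos) return "level-zero";
        if (default_platform.find("CUDA")       != std::string::npos) return "cuda";
        if (default_platform.find("HIP")        != std::string::npos) return "hip";
        return ""; // no preference: first GPU platform wins
    }

    int main() {
        std::printf("%s\n", backend_filter(std::getenv("ONEAPI_DEVICE_SELECTOR"),
                                           "Intel(R) Level-Zero").c_str());
        return 0;
    }
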
diff --git a/ggml/src/ggml-sycl/mmq.cpp b/ggml/src/ggml-sycl/mmq.cpp
index 3107ba91..e952533d 100644
--- a/ggml/src/ggml-sycl/mmq.cpp
+++ b/ggml/src/ggml-sycl/mmq.cpp
@@ -1799,7 +1799,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q4_0_PASCAL;
nwarps = NWARPS_Q4_0_PASCAL;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -1914,7 +1914,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q4_1_PASCAL;
nwarps = NWARPS_Q4_1_PASCAL;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2029,7 +2029,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q5_0_PASCAL;
nwarps = NWARPS_Q5_0_PASCAL;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2144,7 +2144,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q5_1_PASCAL;
nwarps = NWARPS_Q5_1_PASCAL;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2259,7 +2259,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q8_0_PASCAL;
nwarps = NWARPS_Q8_0_PASCAL;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2374,7 +2374,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q2_K_PASCAL;
nwarps = NWARPS_Q2_K_PASCAL;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2497,7 +2497,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q3_K_PASCAL;
nwarps = NWARPS_Q3_K_PASCAL;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2625,7 +2625,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q4_K_PASCAL;
nwarps = NWARPS_Q4_K_PASCAL;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2746,7 +2746,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q5_K_PASCAL;
nwarps = NWARPS_Q5_K_PASCAL;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2867,7 +2867,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q6_K_PASCAL;
nwarps = NWARPS_Q6_K_PASCAL;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -3016,7 +3016,7 @@ void ggml_sycl_op_mul_mat_q(
ggml_mul_mat_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
break;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
break;
}
diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp
index 3fbc4dd6..1b96925e 100644
--- a/ggml/src/ggml-sycl/mmvq.cpp
+++ b/ggml/src/ggml-sycl/mmvq.cpp
@@ -902,7 +902,7 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
- mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 1>(
+ mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
vx, vy, dst, ncols, nrows, item_ct1);
});
});
@@ -1017,7 +1017,7 @@ void ggml_sycl_op_mul_mat_vec_q(
mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
break;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
break;
}
}
diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp
index cccf87d0..b3159b9d 100644
--- a/ggml/src/ggml-sycl/norm.cpp
+++ b/ggml/src/ggml-sycl/norm.cpp
@@ -225,9 +225,8 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
}
static void group_norm_f32_sycl(const float* x, float* dst,
- const int num_groups, const int group_size,
+ const int num_groups, const float eps, const int group_size,
const int ne_elements, queue_ptr stream, int device) {
- static const float eps = 1e-6f;
if (group_size < 1024) {
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
stream->submit([&](sycl::handler& cgh) {
@@ -343,8 +342,12 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor*
GGML_ASSERT(dst->type == GGML_TYPE_F32);
int num_groups = dst->op_params[0];
+
+ float eps;
+ memcpy(&eps, dst->op_params + 1, sizeof(float));
+
int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
- group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream, ctx.device);
+ group_norm_f32_sycl(src0_dd, dst_dd, num_groups, eps, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream, ctx.device);
(void)src1;
(void)dst;
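
ggml carries small operator parameters in an int32 array on the tensor, so the group-norm epsilon travels as raw float bits in op_params[1] and is recovered with memcpy, exactly as the hunk above does. A tiny standalone round trip of that packing (the array name mirrors op_params; the values are arbitrary):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
        int32_t op_params[2] = {0, 0};

        // producer side: store the group count and the epsilon's bit pattern
        const int32_t num_groups = 32;
        const float   eps        = 1e-6f;
        op_params[0] = num_groups;
        std::memcpy(&op_params[1], &eps, sizeof(float)); // bit copy, no int<->float conversion

        // consumer side (what the SYCL op now does instead of a hard-coded 1e-6f)
        float eps_out;
        std::memcpy(&eps_out, op_params + 1, sizeof(float));

        std::printf("num_groups = %d, eps = %g\n", op_params[0], eps_out);
        return 0;
    }
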
diff --git a/ggml/src/ggml-sycl/presets.hpp b/ggml/src/ggml-sycl/presets.hpp
index 15ddcac1..340ab8e9 100644
--- a/ggml/src/ggml-sycl/presets.hpp
+++ b/ggml/src/ggml-sycl/presets.hpp
@@ -41,6 +41,8 @@
#define SYCL_ACC_BLOCK_SIZE 256
#define SYCL_IM2COL_BLOCK_SIZE 256
#define SYCL_POOL2D_BLOCK_SIZE 256
+#define SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE 256
+#define SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
// dmmv = dequantize_mul_mat_vec
#ifndef GGML_SYCL_DMMV_X
diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp
index 6f507941..c7545bcc 100644
--- a/ggml/src/ggml-sycl/rope.cpp
+++ b/ggml/src/ggml-sycl/rope.cpp
@@ -251,7 +251,7 @@ void ggml_sycl_op_rope(
attn_factor, corr_dims, freq_factors, main_stream
);
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
} else {
if (src0->type == GGML_TYPE_F32) {
@@ -265,7 +265,7 @@ void ggml_sycl_op_rope(
attn_factor, corr_dims, freq_factors, main_stream
);
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
diff --git a/ggml/src/ggml-sycl/tsembd.cpp b/ggml/src/ggml-sycl/tsembd.cpp
new file mode 100644
index 00000000..d5c227cd
--- /dev/null
+++ b/ggml/src/ggml-sycl/tsembd.cpp
@@ -0,0 +1,71 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#include "tsembd.hpp"
+
+static void timestep_embedding_f32(
+ const float * timesteps, float * dst, const int nb1,
+ const int dim, const int max_period, const sycl::nd_item<3> &item_ct1) {
+    // item_ct1.get_group(1) (blockIdx.y): index along timesteps->ne[0]
+    // item_ct1.get_group(2) (blockIdx.x): index along ((dim + 1) / 2) / BLOCK_SIZE
+ int i = item_ct1.get_group(1);
+ int j = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2);
+ float * embed_data = (float *)((char *)dst + i*nb1);
+
+ if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
+ embed_data[dim] = 0.f;
+ }
+
+ int half = dim / 2;
+ if (j >= half) {
+ return;
+ }
+
+ float timestep = timesteps[i];
+ float freq = (float)sycl::native::exp(-(sycl::log((float)max_period)) * j / half);
+ float arg = timestep * freq;
+ embed_data[j] = sycl::cos(arg);
+ embed_data[j + half] = sycl::sin(arg);
+}
+
+static void timestep_embedding_f32_sycl(
+ const float * x, float * dst, const int ne00, const int nb1,
+ const int dim, const int max_period, const queue_ptr& stream) {
+    // The kernel returns early once the thread index reaches dim/2, so half_ceil needs no padding
+ int half_ceil = dim / 2;
+ int num_blocks = (half_ceil + SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE;
+ sycl::range<3> block_dims(1, 1, SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE);
+ sycl::range<3> gridDim(1, ne00, num_blocks);
+ stream->parallel_for(
+ sycl::nd_range<3>(
+ gridDim * block_dims, block_dims),
+ [=](sycl::nd_item<3> item_ct1) {
+ timestep_embedding_f32(
+ x, dst, nb1, dim, max_period, item_ct1
+ );
+ });
+}
+
+void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+ const ggml_tensor *src1, ggml_tensor * dst) {
+ const float * src0_d = (const float *)src0->data;
+ float * dst_d = (float *)dst->data;
+ dpct::queue_ptr stream = ctx.stream();
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+ const int dim = dst->op_params[0];
+ const int max_period = dst->op_params[1];
+
+ timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
+}
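
Each timestep t expands into dim values: cos(t*freq_j) in the first half and sin(t*freq_j) in the second, with freq_j = exp(-ln(max_period) * j / half). A plain CPU reference of the same math for even dim (the kernel above additionally zero-pads the extra slot for odd dim and strides output rows by nb1 bytes); inputs are made up.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    static void timestep_embedding_ref(const float * timesteps, int n, int dim, int max_period, float * dst) {
        const int half = dim / 2;
        for (int i = 0; i < n; ++i) {
            float * embed = dst + i * dim;
            for (int j = 0; j < half; ++j) {
                const float freq = std::exp(-std::log((float)max_period) * j / half);
                const float arg  = timesteps[i] * freq;
                embed[j]        = std::cos(arg);
                embed[j + half] = std::sin(arg);
            }
        }
    }

    int main() {
        const float ts[2] = {0.0f, 10.0f};
        const int dim = 8, max_period = 10000;
        std::vector<float> out(2 * dim);
        timestep_embedding_ref(ts, 2, dim, max_period, out.data());
        for (int j = 0; j < dim; ++j) std::printf("%.4f ", out[dim + j]); // second timestep's row
        std::printf("\n");
        return 0;
    }
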
diff --git a/ggml/src/ggml-sycl/tsembd.hpp b/ggml/src/ggml-sycl/tsembd.hpp
new file mode 100644
index 00000000..ff854c33
--- /dev/null
+++ b/ggml/src/ggml-sycl/tsembd.hpp
@@ -0,0 +1,21 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_TSEMBD_HPP
+#define GGML_SYCL_TSEMBD_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+ const ggml_tensor *src1, ggml_tensor * dst);
+
+#endif // GGML_SYCL_TSEMBD_HPP
diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index 6bcd81a7..86732837 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -177,24 +177,33 @@ struct vk_device_struct {
vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
+ vk_pipeline pipeline_add_f32, pipeline_add_f16_f32_f16;
vk_pipeline pipeline_mul_f32;
vk_pipeline pipeline_div_f32;
- vk_pipeline pipeline_add_f32;
+ vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32;
+ vk_pipeline pipeline_upscale_f32;
vk_pipeline pipeline_scale_f32;
vk_pipeline pipeline_sqr_f32;
vk_pipeline pipeline_clamp_f32;
+ vk_pipeline pipeline_pad_f32;
vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
vk_pipeline pipeline_norm_f32;
+ vk_pipeline pipeline_group_norm_f32;
vk_pipeline pipeline_rms_norm_f32;
vk_pipeline pipeline_gelu_f32;
+ vk_pipeline pipeline_gelu_quick_f32;
vk_pipeline pipeline_silu_f32;
vk_pipeline pipeline_relu_f32;
+ vk_pipeline pipeline_leaky_relu_f32;
+ vk_pipeline pipeline_tanh_f32;
vk_pipeline pipeline_diag_mask_inf_f32;
vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
vk_pipeline pipeline_argsort_f32;
vk_pipeline pipeline_sum_rows_f32;
+ vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
+ vk_pipeline pipeline_timestep_embedding_f32;
std::vector<vk_pipeline_ref> pipelines;
@@ -236,8 +245,8 @@ struct vk_device_struct {
};
struct vk_buffer_struct {
- vk::Buffer buffer;
- vk::DeviceMemory device_memory;
+ vk::Buffer buffer = VK_NULL_HANDLE;
+ vk::DeviceMemory device_memory = VK_NULL_HANDLE;
vk::MemoryPropertyFlags memory_property_flags;
void * ptr;
size_t size = 0;
@@ -259,6 +268,10 @@ struct vk_subbuffer {
vk_buffer buffer;
uint64_t offset;
uint64_t size;
+
+ operator vk::DescriptorBufferInfo() const {
+ return { buffer->buffer, offset, size };
+ }
};
struct vk_semaphore {
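
The new conversion operator lets a braced list of vk_subbuffer values be passed directly where vk::DescriptorBufferInfo is expected, which the reworked ggml_vk_dispatch_pipeline relies on. A dependency-free sketch of the same idiom, using stand-in types so it compiles without vulkan.hpp:

    #include <cstdint>
    #include <cstdio>
    #include <initializer_list>

    // stand-ins for vk::Buffer and vk::DescriptorBufferInfo
    struct Buffer               { uint64_t handle; };
    struct DescriptorBufferInfo { Buffer buffer; uint64_t offset; uint64_t range; };

    struct Subbuffer {
        Buffer   buffer;
        uint64_t offset;
        uint64_t size;

        // the conversion the diff adds: a Subbuffer can be used wherever a
        // DescriptorBufferInfo is expected, e.g. inside an initializer list
        operator DescriptorBufferInfo() const { return { buffer, offset, size }; }
    };

    static void dispatch(std::initializer_list<DescriptorBufferInfo> const & infos) {
        for (const auto & info : infos) {
            std::printf("buffer=%llu offset=%llu range=%llu\n",
                        (unsigned long long) info.buffer.handle,
                        (unsigned long long) info.offset,
                        (unsigned long long) info.range);
        }
    }

    int main() {
        Subbuffer a{{1}, 0, 256}, b{{2}, 64, 128};
        dispatch({ a, b }); // each Subbuffer converts implicitly
        return 0;
    }
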
@@ -320,7 +333,7 @@ struct vk_op_binary_push_constants {
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
uint32_t d_offset;
- float param1; float param2;
+ float param1; float param2; int32_t param3;
};
struct vk_op_diag_mask_push_constants {
@@ -358,6 +371,25 @@ struct vk_op_argsort_push_constants {
int32_t order;
};
+struct vk_op_im2col_push_constants {
+ uint32_t batch_offset; uint32_t offset_delta;
+ uint32_t IC;
+ uint32_t IW; uint32_t IH;
+ uint32_t OW; uint32_t OH;
+ uint32_t KW; uint32_t KH;
+ uint32_t pelements;
+ uint32_t CHW;
+ int32_t s0; int32_t s1;
+ int32_t p0; int32_t p1;
+ int32_t d0; int32_t d1;
+};
+
+struct vk_op_timestep_embedding_push_constants {
+ uint32_t nb1;
+ uint32_t dim;
+ uint32_t max_period;
+};
+
// Allow pre-recording command buffers
struct vk_staging_memcpy {
vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
@@ -367,28 +399,32 @@ struct vk_staging_memcpy {
size_t n;
};
-struct vk_context {
- size_t idx;
+struct vk_op_upscale_push_constants {
+ uint32_t ne; uint32_t d_offset;
+ uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
+ uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
+ float sf0; float sf1; float sf2; float sf3;
+};
+struct vk_context_struct {
vk_submission * s;
std::vector<vk_sequence> seqs;
- ggml_tensor * exit_tensor;
+ int exit_tensor_idx;
std::vector<vk_staging_memcpy> in_memcpys;
std::vector<vk_staging_memcpy> out_memcpys;
vk_queue * q;
};
+typedef std::shared_ptr<vk_context_struct> vk_context;
+typedef std::weak_ptr<vk_context_struct> vk_context_ref;
struct ggml_tensor_extra_gpu {
- size_t ctx_idx;
-
vk_buffer_ref buffer_gpu;
uint64_t offset;
void reset() {
- ctx_idx = 0;
buffer_gpu.reset();
offset = 0;
}
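
The context refactor replaces raw vk_context pointers and the per-tensor ctx_idx with shared_ptr ownership plus weak references (tensor_ctxs) that are locked when a result is needed, so a freed context can never be dereferenced. A small isolated sketch of that ownership pattern (type and field names are illustrative):

    #include <cstdio>
    #include <memory>
    #include <vector>

    struct context {            // stands in for vk_context_struct
        int submitted_work = 0;
    };
    using context_ptr = std::shared_ptr<context>; // vk_context
    using context_ref = std::weak_ptr<context>;   // vk_context_ref

    int main() {
        std::vector<context_ref> tensor_ctxs;     // per-tensor back-references

        {
            context_ptr compute_ctx = std::make_shared<context>();
            compute_ctx->submitted_work = 42;
            tensor_ctxs.push_back(compute_ctx);   // store a weak, non-owning reference
            if (auto ctx = tensor_ctxs[0].lock()) {
                std::printf("alive, work = %d\n", ctx->submitted_work);
            }
        } // last shared_ptr goes out of scope: the context is destroyed

        if (auto ctx = tensor_ctxs[0].lock()) {
            std::printf("unexpected: still alive\n");
        } else {
            std::printf("context already freed; weak reference expired\n");
        }
        return 0;
    }
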
@@ -459,8 +495,10 @@ struct ggml_backend_vk_context {
vk_buffer buffer_pool[MAX_VK_BUFFERS];
- vk_context * compute_ctx;
- vk_context * transfer_ctx;
+ vk_context_ref compute_ctx;
+ vk_context_ref transfer_ctx;
+
+ std::vector<vk_context_ref> tensor_ctxs;
};
#ifdef GGML_VULKAN_MEMORY_DEBUG
@@ -510,12 +548,12 @@ static vk_instance_t vk_instance;
static size_t vk_skip_checks;
static size_t vk_output_tensor;
-static void ggml_vk_print_tensor(ggml_backend * ctx, const ggml_tensor * tensor, const char * name);
-static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * tensor);
-static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * tensor);
+static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name);
+static void ggml_vk_check_results_0(ggml_tensor * tensor);
+static void ggml_vk_check_results_1(ggml_tensor * tensor);
#endif
-typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
@@ -708,11 +746,11 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s
return s;
}
-static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
- VK_LOG_DEBUG("ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")");
+static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
if (ctx->seqs.empty()) {
return;
}
+ VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")");
std::vector<std::vector<uint64_t>> tl_wait_vals;
std::vector<std::vector<uint64_t>> tl_signal_vals;
@@ -844,21 +882,17 @@ static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_
q.stage_flags = stage_flags;
}
-static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
- VK_LOG_DEBUG("ggml_vk_create_context()");
- ctx->gc.contexts.emplace_back();
- vk_context * result = &ctx->gc.contexts[ctx->gc.contexts.size() - 1];
- memset((void *) result, 0, sizeof(vk_context));
- result->idx = ctx->gc.contexts.size() - 1;
+static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
+ vk_context result = std::make_shared<vk_context_struct>();
+ VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")");
+ ctx->gc.contexts.emplace_back(result);
result->q = &q;
return result;
}
-static vk_context * ggml_vk_create_temporary_context(vk_queue& q) {
- VK_LOG_DEBUG("ggml_vk_create_temporary_context()");
- vk_context * result = new vk_context;
- memset((void *) result, 0, sizeof(vk_context));
- result->idx = 0;
+static vk_context ggml_vk_create_temporary_context(vk_queue& q) {
+ vk_context result = std::make_shared<vk_context_struct>();
+ VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")");
result->q = &q;
return result;
}
@@ -915,6 +949,10 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr
static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")");
+ if (size > device->max_memory_allocation_size) {
+ throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit");
+ }
+
std::lock_guard<std::mutex> guard(device->mutex);
vk_buffer buf = std::make_shared<vk_buffer_struct>();
@@ -1027,21 +1065,22 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
return { buf, 0, VK_WHOLE_SIZE };
}
-static void ggml_vk_sync_buffers(vk_context * ctx) {
+static void ggml_vk_sync_buffers(vk_context& ctx) {
VK_LOG_DEBUG("ggml_vk_sync_buffers()");
- const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } };
-
ctx->s->buffer.pipelineBarrier(
ctx->q->stage_flags,
ctx->q->stage_flags,
{},
- mem_barriers,
+ { {
+ {vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite},
+ {vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite}
+ } },
{},
{}
);
}
-static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& events) {
+static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events) {
VK_LOG_DEBUG("ggml_vk_wait_events()");
if (events.empty()) {
return;
@@ -1598,6 +1637,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
@@ -1605,20 +1645,31 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
-
ggml_vk_create_pipeline(device, device->pipeline_div_f32, "div_f32", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_concat_f32, "concat_f32", concat_f32_len, concat_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
+
+ ggml_vk_create_pipeline(device, device->pipeline_upscale_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {}, 1);
+
ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
ggml_vk_create_pipeline(device, device->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_gelu_quick_f32, "gelu_quick_f32", gelu_quick_f32_len, gelu_quick_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_relu_f32, "relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_tanh_f32, "tanh_f32", tanh_f32_len, tanh_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);
@@ -1634,6 +1685,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+
+ ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
+
+ ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
}
static vk_device ggml_vk_get_device(size_t idx) {
@@ -1961,7 +2017,7 @@ void ggml_vk_instance_init() {
// Make sure at least one device exists
if (devices.empty()) {
std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
// Default to using all dedicated GPUs
@@ -2057,9 +2113,9 @@ void ggml_vk_instance_init() {
}
static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
- GGML_ASSERT(idx < vk_instance.device_indices.size());
VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << idx << ")");
ggml_vk_instance_init();
+ GGML_ASSERT(idx < vk_instance.device_indices.size());
ctx->name = GGML_VK_NAME + std::to_string(idx);
@@ -2077,9 +2133,6 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
ctx->staging_size = 0;
ctx->staging_offset = 0;
- ctx->compute_ctx = nullptr;
- ctx->transfer_ctx = nullptr;
-
#ifdef GGML_VULKAN_CHECK_RESULTS
const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks));
@@ -2112,7 +2165,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
}
static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
- VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline()");
+ VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline(" << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")");
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
return ctx->device->pipeline_matmul_f32;
}
@@ -2126,7 +2179,9 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
return ctx->device->pipeline_matmul_f16;
}
- GGML_ASSERT(src1_type == GGML_TYPE_F32);
+ if (src1_type != GGML_TYPE_F32) {
+ return nullptr;
+ }
switch (src0_type) {
case GGML_TYPE_Q4_0:
@@ -2370,28 +2425,23 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo
return s;
}
-static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, std::vector<vk_subbuffer>&& buffers, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
+
+
+static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
- for (auto& buffer : buffers) {
- std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
+ for (auto& buffer : descriptor_buffer_infos) {
+ std::cerr << "(" << buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
}
std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
- std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
- std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
- GGML_ASSERT(buffers.size() == pipeline->parameter_count);
- vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
- for (uint32_t i = 0; i < pipeline->parameter_count; i++) {
- descriptor_buffer_infos.push_back({buffers[i].buffer->buffer, buffers[i].offset, buffers[i].size});
- }
- for (uint32_t i = 0; i < pipeline->parameter_count; i++) {
- write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]});
- }
+ GGML_ASSERT(descriptor_buffer_infos.size() == pipeline->parameter_count);
- ctx->device->device.updateDescriptorSets(write_descriptor_sets, {});
+ vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
+ vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
+ ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});
subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants);
subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
@@ -2410,7 +2460,7 @@ static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> w
s.signal_semaphores = std::move(signal_semaphores);
}
-static void ggml_vk_ctx_end(vk_context * ctx) {
+static void ggml_vk_ctx_end(vk_context& ctx) {
VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")");
if (ctx->s == nullptr) {
return;
@@ -2420,7 +2470,7 @@ static void ggml_vk_ctx_end(vk_context * ctx) {
ctx->s = nullptr;
}
-static void ggml_vk_ctx_begin(vk_device& device, vk_context * subctx) {
+static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) {
VK_LOG_DEBUG("ggml_vk_ctx_begin(" << device->name << ")");
if (subctx->s != nullptr) {
ggml_vk_ctx_end(subctx);
@@ -2453,13 +2503,13 @@ static void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size) {
}
}
-static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
+static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context& subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")");
GGML_ASSERT(!ggml_is_contiguous(tensor));
// Buffer is already mapped
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl;
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
// Check if src is pinned memory
vk_buffer buf;
@@ -2527,7 +2577,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
staging = ctx->device->sync_staging;
staging_offset = 0;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
@@ -2558,12 +2608,12 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
}
}
-static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) {
+static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) {
VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
// Buffer is already mapped
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl;
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
// Check if src is pinned memory
vk_buffer buf = nullptr;
@@ -2602,7 +2652,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s
staging_buffer = dst->device->sync_staging;
staging_offset = 0;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
@@ -2623,7 +2673,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s
}
}
-static void ggml_vk_buffer_write_async(vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) {
+static void ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) {
VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")");
return ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, size, size, 1, staging_buffer, staging_offset, sync_staging);
}
@@ -2638,7 +2688,7 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
}
} else {
- vk_context * subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
+ vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
ggml_vk_ctx_begin(dst->device, subctx);
ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, nullptr, 0, true);
ggml_vk_ctx_end(subctx);
@@ -2650,8 +2700,6 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
ggml_vk_submit(subctx, dst->device->fence);
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
dst->device->device.resetFences({ dst->device->fence });
-
- delete subctx;
}
}
@@ -2660,12 +2708,14 @@ static void ggml_vk_buffer_write(vk_buffer& dst, size_t offset, const void * src
ggml_vk_buffer_write_2d(dst, offset, src, 0, size, 1);
}
-static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) {
+static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) {
VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")");
GGML_ASSERT(width > 0);
GGML_ASSERT(height > 0);
GGML_ASSERT(src != nullptr);
+ // TODO: staging_offset is not used
+
// Check if dst is pinned memory
vk_buffer buf = nullptr;
size_t buf_offset;
@@ -2704,7 +2754,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, si
staging_buffer = src->device->sync_staging;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
@@ -2714,18 +2764,18 @@ static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, si
deferred_memcpy(dst, staging_buffer->ptr, copy_size, &subctx->out_memcpys);
}
-static void ggml_vk_buffer_read_async(vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t size, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) {
+static void ggml_vk_buffer_read_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t size, vk_buffer staging_buffer, size_t staging_offset, bool sync_staging = false) {
return ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, size, size, size, 1, staging_buffer, staging_offset, sync_staging);
}
static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size) {
- VK_LOG_DEBUG("ggml_vk_buffer_read(" << offset << ", " << size << ")");
+ VK_LOG_DEBUG("ggml_vk_buffer_read(" << src->buffer << ", " << offset << ", " << size << ")");
if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
memcpy(dst, (uint8_t *) src->ptr + offset, size);
} else {
- vk_context * subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
+ vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
ggml_vk_ctx_begin(src->device, subctx);
ggml_vk_buffer_read_async(subctx, src, offset, dst, size, nullptr, 0, true);
ggml_vk_ctx_end(subctx);
@@ -2737,12 +2787,10 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
for (auto& cpy : subctx->out_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
-
- delete subctx;
}
}
-static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
+static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")");
// Make sure both buffers are on same device
GGML_ASSERT(src->device == dst->device);
@@ -2756,15 +2804,13 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
if (src->device == dst->device) {
VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
// Copy within the device
- vk_context * subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
+ vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
ggml_vk_ctx_begin(src->device, subctx);
ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, src->device->fence);
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
src->device->device.resetFences({ src->device->fence });
-
- delete subctx;
} else {
VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
// Copy device to device
@@ -2783,7 +2829,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
- vk_context * subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
+ vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
ggml_vk_ctx_begin(dst->device, subctx);
subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
ggml_vk_ctx_end(subctx);
@@ -2791,8 +2837,6 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
ggml_vk_submit(subctx, dst->device->fence);
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences");
dst->device->device.resetFences({ dst->device->fence });
-
- delete subctx;
}
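In the transfer helpers above, vk_context * parameters become vk_context values (or references) and the trailing delete subctx; calls disappear, which points at the context handle now managing its own lifetime. A minimal sketch of that ownership model, assuming vk_context is a std::shared_ptr alias with a std::weak_ptr companion for cached contexts (the real typedefs are outside this diff):

#include <memory>

struct vk_context_struct {
    int placeholder; // stand-in member; the real struct holds queues, sequences, memcpy lists, ...
};

// Assumed aliases: shared ownership for live contexts, weak references for caches.
using vk_context     = std::shared_ptr<vk_context_struct>;
using vk_context_ref = std::weak_ptr<vk_context_struct>;

static vk_context sketch_create_temporary_context() {
    // Destroyed automatically when the last handle goes out of scope,
    // which is why the explicit delete subctx; calls can be removed.
    return std::make_shared<vk_context_struct>();
}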
static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
@@ -2855,7 +2899,7 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct
}
static void ggml_vk_matmul(
- ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline,
+ ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline,
vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer,
uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
@@ -2879,7 +2923,7 @@ static void ggml_vk_matmul(
}
static void ggml_vk_matmul_id(
- ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline,
+ ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline,
vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& ids,
uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
@@ -2913,10 +2957,10 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
}
std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
-static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
+static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
const int tensor_type_size = ggml_type_size(tensor->type);
@@ -2934,7 +2978,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 });
}
-static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
@@ -3079,7 +3123,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
} else if (qx_needs_dequant) {
const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
+ ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
}
if (y_non_contig) {
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
@@ -3107,7 +3151,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
); // NOLINT
}
-static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
@@ -3268,11 +3312,11 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
};
ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
- { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} },
+ { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} },
sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
}
-static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32(" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
@@ -3340,10 +3384,10 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
// compute
const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
+ ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
}
-static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
@@ -3415,10 +3459,11 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
// compute
const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
+ ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
+ { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
}
-static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")");
if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) {
ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst);
@@ -3431,7 +3476,7 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx,
}
}
-static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
+static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
@@ -3499,7 +3544,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig;
if (mmp == nullptr) {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
// Not implemented
@@ -3590,7 +3635,8 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
} else if (qx_needs_dequant) {
const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
+ ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0,
+ { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
}
if (y_non_contig) {
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
@@ -3618,7 +3664,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
); // NOLINT
}
-static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
+static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
@@ -3790,11 +3836,12 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
};
ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
- { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23}, { d_ids, ids_buf_offset, ids_sz } },
+ { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 },
+ vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } },
sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z });
}
-static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
+static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")");
if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst);
@@ -3803,8 +3850,8 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subct
}
}
-static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- // guaranteed to be an integer due to the check in ggml_can_repeat
+static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ VK_LOG_DEBUG("ggml_vk_op_repeat(" << src0 << ", " << src1 << ", " << dst << ")");
const uint64_t ne0 = dst->ne[0];
const uint64_t ne1 = dst->ne[1];
const uint64_t ne2 = dst->ne[2];
@@ -3825,6 +3872,7 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
const uint64_t nb02 = src0->nb[2];
const uint64_t nb03 = src0->nb[3];
+ // guaranteed to be an integer due to the check in ggml_can_repeat
const uint64_t nr0 = ne0/ne00;
const uint64_t nr1 = ne1/ne01;
const uint64_t nr2 = ne2/ne02;
@@ -3852,8 +3900,8 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
for (uint64_t k1 = 0; k1 < ne01; k1++) {
for (uint64_t i0 = 0; i0 < nr0; i0++) {
copies.push_back({
- src_offset + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
- dst_offset + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
+ src_offset + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
+ dst_offset + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
ne00*nb0,
});
}
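The swap above is the substance of the repeat fix: the repeat indices (i0, i1, i2, i3) now scale the destination offset while the source offset only walks the original src0 block, so the small source is read repeatedly and tiled into dst rather than the other way around. A compact CPU sketch of the corrected indexing for the two innermost dimensions, with hypothetical names and contiguous float data:

#include <cstring>

// Tile src (ne00 x ne01 rows) into dst, repeating nr0 times along dim 0
// and nr1 times along dim 1. Illustration of the index order only.
static void repeat_rows_sketch(float * dst, const float * src,
                               int ne00, int ne01, int nr0, int nr1) {
    for (int i1 = 0; i1 < nr1; i1++) {
        for (int k1 = 0; k1 < ne01; k1++) {
            for (int i0 = 0; i0 < nr0; i0++) {
                const float * s = src + k1 * ne00;                            // source stays within src0
                float       * d = dst + ((i1 * ne01 + k1) * nr0 + i0) * ne00; // destination is tiled
                std::memcpy(d, s, ne00 * sizeof(float));
            }
        }
    }
}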
@@ -3874,11 +3922,6 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
switch (op) {
- case GGML_OP_ADD:
- if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
- return ctx->device->pipeline_add_f32;
- }
- return nullptr;
case GGML_OP_GET_ROWS:
GGML_ASSERT(src1->type == GGML_TYPE_I32);
if (dst->type == GGML_TYPE_F16) {
@@ -3888,6 +3931,14 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_get_rows_f32[src0->type];
}
return nullptr;
+ case GGML_OP_ADD:
+ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_add_f32;
+ }
+ if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
+ return ctx->device->pipeline_add_f16_f32_f16;
+ }
+ return nullptr;
case GGML_OP_MUL:
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_mul_f32;
@@ -3898,6 +3949,22 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_div_f32;
}
return nullptr;
+ case GGML_OP_CONCAT:
+ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_concat_f32;
+ }
+ if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+ return ctx->device->pipeline_concat_f16;
+ }
+ if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
+ return ctx->device->pipeline_concat_i32;
+ }
+ return nullptr;
+ case GGML_OP_UPSCALE:
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_upscale_f32;
+ }
+ return nullptr;
case GGML_OP_SCALE:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_scale_f32;
@@ -3913,6 +3980,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_clamp_f32;
}
return nullptr;
+ case GGML_OP_PAD:
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_pad_f32;
+ }
+ return nullptr;
case GGML_OP_CPY:
case GGML_OP_CONT:
case GGML_OP_DUP:
@@ -3922,6 +3994,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_norm_f32;
}
return nullptr;
+ case GGML_OP_GROUP_NORM:
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_group_norm_f32;
+ }
+ return nullptr;
case GGML_OP_RMS_NORM:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_rms_norm_f32;
@@ -3939,11 +4016,21 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_gelu_f32;
}
break;
+ case GGML_UNARY_OP_GELU_QUICK:
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_gelu_quick_f32;
+ }
+ break;
case GGML_UNARY_OP_RELU:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_relu_f32;
}
break;
+ case GGML_UNARY_OP_TANH:
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_tanh_f32;
+ }
+ break;
default:
break;
}
@@ -3995,6 +4082,24 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_sum_rows_f32;
}
return nullptr;
+ case GGML_OP_IM2COL:
+ if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_im2col_f32;
+ }
+ if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
+ return ctx->device->pipeline_im2col_f32_f16;
+ }
+ return nullptr;
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_timestep_embedding_f32;
+ }
+ return nullptr;
+ case GGML_OP_LEAKY_RELU:
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ return ctx->device->pipeline_leaky_relu_f32;
+ }
+ return nullptr;
default:
return nullptr;
}
@@ -4018,9 +4123,12 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
case GGML_OP_ADD:
case GGML_OP_MUL:
case GGML_OP_DIV:
+ case GGML_OP_CONCAT:
+ case GGML_OP_UPSCALE:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_CLAMP:
+ case GGML_OP_PAD:
return true;
default:
return false;
@@ -4028,7 +4136,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
}
template<typename PC>
-static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
+static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
if (src1 != nullptr) {
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
@@ -4078,7 +4186,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
std::cerr << " and " << ggml_type_name(src1->type);
}
std::cerr << " to " << ggml_type_name(dst->type) << std::endl;
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
op_func(ctx, subctx, src0, src1, dst);
@@ -4124,7 +4232,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
vk_buffer d_D = extra->buffer_gpu.lock();
// Workaround for tiny tensor inputs on ROPE
- if (use_src1 && y_sz > d_D->size) {
+ if (op == GGML_OP_ROPE && use_src1 && y_sz > d_D->size) {
y_sz = VK_WHOLE_SIZE;
}
@@ -4173,13 +4281,26 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
if (op_supports_incontiguous || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
ggml_pipeline_allocate_descriptor_sets(ctx->device, pipeline, 1);
- switch (dst->op) {
+ switch (op) {
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_SOFT_MAX:
case GGML_OP_SUM_ROWS:
- elements = { (uint32_t)ggml_nrows(src0), 1, 1 };
- break;
+ {
+ const uint32_t nr = ggml_nrows(src0);
+ if (nr > 262144) {
+ elements = { 512, 512, CEIL_DIV(nr, 262144) };
+ } else if (nr > 512) {
+ elements = { 512, CEIL_DIV(nr, 512), 1 };
+ } else {
+ elements = { nr, 1, 1 };
+ }
+ } break;
+ case GGML_OP_GROUP_NORM:
+ {
+ const uint32_t num_groups = dst->op_params[0];
+ elements = { num_groups * (uint32_t)src0->ne[3], 1, 1 };
+ } break;
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_ROPE:
elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
@@ -4190,6 +4311,49 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
case GGML_OP_ARGSORT:
elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
break;
+ case GGML_OP_IM2COL:
+ {
+ const bool is_2D = dst->op_params[6] == 1;
+
+ const uint32_t IC = src1->ne[is_2D ? 2 : 1];
+
+ const uint32_t KH = is_2D ? src0->ne[1] : 1;
+ const uint32_t KW = src0->ne[0];
+
+ const uint32_t OH = is_2D ? dst->ne[2] : 1;
+ const uint32_t OW = dst->ne[1];
+
+ const uint32_t batch = src1->ne[3];
+
+ elements = { OW * KW * KH, OH, batch * IC };
+ } break;
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ {
+ const uint32_t dim = dst->op_params[0];
+ uint32_t half_ceil = (dim + 1) / 2;
+ elements = { half_ceil, (uint32_t)src0->ne[0], 1 };
+ } break;
+ case GGML_OP_ADD:
+ case GGML_OP_DIV:
+ case GGML_OP_MUL:
+ case GGML_OP_SCALE:
+ case GGML_OP_SQR:
+ case GGML_OP_CLAMP:
+ case GGML_OP_PAD:
+ case GGML_OP_CPY:
+ case GGML_OP_CONCAT:
+ case GGML_OP_UPSCALE:
+ case GGML_OP_UNARY:
+ {
+ const uint32_t ne = ggml_nelements(dst);
+ if (ne > 262144) {
+ elements = { 512, 512, CEIL_DIV(ne, 262144) };
+ } else if (ne > 512) {
+ elements = { 512, CEIL_DIV(ne, 512), 1 };
+ } else {
+ elements = { ne, 1, 1 };
+ }
+ } break;
default:
elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
break;
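The element counts computed above split large dispatches over all three grid dimensions, presumably to stay within the device's per-dimension maxComputeWorkGroupCount limit: up to 512 * 512 = 262144 rows fit in the first two dimensions, and anything beyond that spills into the third. A sketch of the same CEIL_DIV tiling, with the helper named locally for illustration:

#include <array>
#include <cstdint>

static constexpr uint32_t ceil_div_sketch(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

// Mirrors the dispatch-splitting logic: keep every grid dimension small by
// tiling the flat count over a 512 x 512 base grid.
static std::array<uint32_t, 3> split_elements_sketch(uint32_t ne) {
    if (ne > 262144) {                       // 512 * 512
        return { 512, 512, ceil_div_sketch(ne, 262144) };
    } else if (ne > 512) {
        return { 512, ceil_div_sketch(ne, 512), 1 };
    }
    return { ne, 1, 1 };
}
// e.g. ne = 1'000'000 -> { 512, 512, 4 }, since 512 * 512 * 4 = 1'048'576 >= 1'000'000.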
@@ -4216,31 +4380,35 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
if (use_src1) {
subbuf_y = { d_Y, y_buf_offset, y_sz };
} else {
- subbuf_y = { d_X, 0, d_X->size };
+ subbuf_y = { d_X, 0, x_sz };
}
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
} else if (op == GGML_OP_ROPE) {
// Empty src2 is possible in rope, but the shader needs a buffer
vk_subbuffer subbuf_z;
if (use_src2) {
subbuf_z = { d_Z, z_buf_offset, z_sz };
} else {
- subbuf_z = { d_X, 0, d_X->size };
+ subbuf_z = { d_X, 0, x_sz };
}
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+ } else if (op == GGML_OP_IM2COL) {
+ // im2col uses only src1 and dst buffers
+ ggml_vk_sync_buffers(subctx);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
} else if (use_src2) {
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
} else if (use_src1) {
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
} else {
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
}
} else {
GGML_ASSERT(op != GGML_OP_SOFT_MAX);
@@ -4249,8 +4417,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
ggml_pipeline_allocate_descriptor_sets(ctx->device, pipeline, ne02 * ne03);
- switch (dst->op) {
+ switch (op) {
case GGML_OP_NORM:
+ case GGML_OP_GROUP_NORM:
case GGML_OP_RMS_NORM:
elements = { (uint32_t)ne01, 1, 1 };
break;
@@ -4276,21 +4445,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
if (use_src1) {
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_Y, y_buf_offset + y_offset, y_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset + x_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset + y_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
} else {
ggml_vk_sync_buffers(subctx);
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset + x_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
}
}
}
}
}
-static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
+static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, {});
}
-static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4301,11 +4470,11 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx,
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
- 0.0f, 0.0f,
+ 0.0f, 0.0f, 0,
});
}
-static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4316,11 +4485,11 @@ static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, cons
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
- 0.0f, 0.0f,
+ 0.0f, 0.0f, 0,
});
}
-static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4331,11 +4500,11 @@ static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, cons
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
- 0.0f, 0.0f,
+ 0.0f, 0.0f, 0,
});
}
-static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4346,11 +4515,44 @@ static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context * subctx, cons
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
- 0.0f, 0.0f,
+ 0.0f, 0.0f, 0,
});
}
-static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ int * op_params = (int *)dst->op_params;
+
+ const uint32_t src0_type_size = ggml_type_size(src0->type);
+ const uint32_t src1_type_size = ggml_type_size(src1->type);
+ const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+ ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONCAT, {
+ (uint32_t)ggml_nelements(dst),
+ (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+ (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+ (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+ 0,
+ 0.0f, 0.0f, op_params[0],
+ });
+}
+
+static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+ const uint32_t src0_type_size = ggml_type_size(src0->type);
+
+ const float sf0 = (float)dst->ne[0] / src0->ne[0];
+ const float sf1 = (float)dst->ne[1] / src0->ne[1];
+ const float sf2 = (float)dst->ne[2] / src0->ne[2];
+ const float sf3 = (float)dst->ne[3] / src0->ne[3];
+
+ ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
+ (uint32_t)ggml_nelements(dst), 0,
+ (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+ (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
+ sf0, sf1, sf2, sf3,
+ });
+}
+
+static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
float * op_params = (float *)dst->op_params;
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4364,7 +4566,7 @@ static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, co
});
}
-static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4377,7 +4579,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, cons
});
}
-static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
float * op_params = (float *)dst->op_params;
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4391,7 +4593,20 @@ static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, co
});
}
-static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+ const uint32_t src0_type_size = ggml_type_size(src0->type);
+ const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+ ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, {
+ (uint32_t)ggml_nelements(dst),
+ (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+ (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+ 0,
+ 0.0f, 0.0f,
+ });
+}
+
+static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@@ -4406,27 +4621,37 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
});
}
-static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
float * op_params = (float *)dst->op_params;
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
}
-static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+ int * op_params = (int *)dst->op_params;
+
+ uint32_t num_groups = op_params[0];
+ uint32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
+ static const float eps = 1e-6f;
+
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f });
+}
+
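+ // group_size above is the number of elements one group covers: the full ne0 x ne1 plane times
+ // the channels assigned to the group, with the channel count rounded up. A worked example with
+ // hypothetical shapes: ne = {64, 64, 320, 1} and num_groups = 32 gives ceil(320 / 32) = 10 channels
+ // per group and group_size = 64 * 64 * 10 = 40960. Note also that eps is fixed at 1e-6f here rather
+ // than read from op_params. Same computation, isolated as an illustrative helper:
+ //
+ //   static uint32_t group_size_sketch(uint32_t ne0, uint32_t ne1, uint32_t ne2, uint32_t num_groups) {
+ //       return ne0 * ne1 * ((ne2 + num_groups - 1) / num_groups);
+ //   }
+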
+static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
float * op_params = (float *)dst->op_params;
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
}
-static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
}
-static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
int32_t * op_params = (int32_t *)dst->op_params;
ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
}
-static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
float * op_params = (float *)dst->op_params;
float scale = op_params[0];
@@ -4451,7 +4676,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
});
}
-static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
+static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
const int n_dims = ((int32_t *) dst->op_params)[1];
// const int mode = ((int32_t *) dst->op_params)[2];
// const int n_ctx = ((int32_t *) dst->op_params)[3];
@@ -4475,7 +4700,7 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
});
}
-static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
int32_t * op_params = (int32_t *)dst->op_params;
uint32_t ncols = src0->ne[0];
@@ -4494,10 +4719,59 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx,
});
}
-static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, { (uint32_t)src0->ne[0], 0, 0.0f, 0.0f });
}
+static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ const int32_t s0 = dst->op_params[0];
+ const int32_t s1 = dst->op_params[1];
+ const int32_t p0 = dst->op_params[2];
+ const int32_t p1 = dst->op_params[3];
+ const int32_t d0 = dst->op_params[4];
+ const int32_t d1 = dst->op_params[5];
+
+ const bool is_2D = dst->op_params[6] == 1;
+
+ const uint32_t IC = src1->ne[is_2D ? 2 : 1];
+ const uint32_t IH = is_2D ? src1->ne[1] : 1;
+ const uint32_t IW = src1->ne[0];
+
+ const uint32_t KH = is_2D ? src0->ne[1] : 1;
+ const uint32_t KW = src0->ne[0];
+
+ const uint32_t OH = is_2D ? dst->ne[2] : 1;
+ const uint32_t OW = dst->ne[1];
+
+ const uint32_t offset_delta = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+ const uint32_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
+
+ const uint32_t pelements = OW * KW * KH;
+
+ ggml_vk_op_f32<vk_op_im2col_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_IM2COL, {
+ batch_offset, offset_delta,
+ IC, IW, IH, OW, OH, KW, KH,
+ pelements,
+ IC * KH * KW,
+ s0, s1, p0, p1, d0, d1,
+ });
+}
+
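+ // All im2col push constants above come from the tensor shapes and the seven integers packed into
+ // dst->op_params; the byte strides are divided by 4 because src1 is expected to hold float32.
+ // A worked example with hypothetical sizes: a 2D case with a 3 x 3 kernel (KW = KH = 3), a 64 x 64
+ // input with IC = 16 channels, stride 1, padding 1 and dilation 1 gives OW = OH = 64, so
+ //
+ //   const uint32_t pelements = OW * KW * KH;   // 64 * 3 * 3 = 576 work items per output row
+ //   const uint32_t col_len   = IC * KH * KW;   // 16 * 3 * 3 = 144 values per im2col column
+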
+static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+ const uint32_t dim = dst->op_params[0];
+ const uint32_t max_period = dst->op_params[1];
+ const uint32_t nb1 = dst->nb[1] / ggml_type_size(dst->type);
+
+ ggml_vk_op_f32<vk_op_timestep_embedding_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_TIMESTEP_EMBEDDING, {
+ nb1, dim, max_period,
+ });
+}
+
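+ // half_ceil above is the number of frequency slots per timestep: the embedding is built from
+ // cos/sin pairs, so a dimension of dim needs (dim + 1) / 2 of them, and the grid is
+ // half_ceil x ne0 invocations (ne0 being the number of timesteps). For example, dim = 256 gives
+ // half_ceil = 128 and dim = 11 gives half_ceil = 6; how an odd dim pads its last slot is handled
+ // in the shader, not shown here. Illustrative helper:
+ //
+ //   static uint32_t half_ceil_sketch(uint32_t dim) { return (dim + 1) / 2; }  // 11 -> 6, 256 -> 128
+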
+static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+ const float * op_params = (const float *)dst->op_params;
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f });
+}
+
#ifdef GGML_VULKAN_RUN_TESTS
static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) {
if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) {
@@ -4521,7 +4795,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
} else if (type == GGML_TYPE_F16) {
val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
fprintf(stderr, "% 7.2f ", val);
} else {
@@ -4555,7 +4829,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
p = ctx->device->pipeline_matmul_f16->a_s;
shname = "F16_ALIGNED_S";
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
} else if (shader_size == 1) {
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
@@ -4571,7 +4845,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
p = ctx->device->pipeline_matmul_f16->a_m;
shname = "F16_ALIGNED_M";
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
} else if (shader_size == 2) {
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
@@ -4587,7 +4861,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
p = ctx->device->pipeline_matmul_f16->a_l;
shname = "F16_ALIGNED_L";
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
} else {
GGML_ASSERT(0);
@@ -4668,7 +4942,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
} else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
for (size_t i = 0; i < y_ne; i++) {
@@ -4679,14 +4953,14 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
// y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f);
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
- vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+ vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
for (size_t i = 0; i < num_it; i++) {
ggml_vk_ctx_begin(ctx->device, subctx);
ggml_vk_matmul(
@@ -4727,14 +5001,14 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
} else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
src0_type = GGML_TYPE_F16;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
if (std::is_same<float, Y_TYPE>()) {
src1_type = GGML_TYPE_F32;
} else if (std::is_same<ggml_fp16_t, Y_TYPE>()) {
src1_type = GGML_TYPE_F16;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, src0_type, k, m, batch);
@@ -4841,7 +5115,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
} else if (tensor->type == GGML_TYPE_F16) {
val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
fprintf(stderr, "% 7.2f ", val);
} else {
@@ -4894,7 +5168,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
- vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+ vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
ggml_vk_ctx_begin(ctx->device, subctx);
const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
@@ -5027,7 +5301,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
ggml_vk_buffer_write(y_buf, 0, y, y_sz);
- vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+ vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
for (size_t i = 0; i < num_it; i++) {
ggml_vk_ctx_begin(ctx->device, subctx);
ggml_vk_matmul(
@@ -5175,7 +5449,7 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
const bool y_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32 && !y_non_contig;
- bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
+ bool mmp = (use_src0 && use_src1 && (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID)) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
const bool qx_needs_dequant = use_src0 && (!mmp || x_non_contig);
const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);
@@ -5211,24 +5485,33 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_CLAMP:
+ case GGML_OP_PAD:
case GGML_OP_CPY:
case GGML_OP_CONT:
case GGML_OP_DUP:
case GGML_OP_MUL:
case GGML_OP_DIV:
+ case GGML_OP_CONCAT:
+ case GGML_OP_UPSCALE:
case GGML_OP_NORM:
+ case GGML_OP_GROUP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_ROPE:
case GGML_OP_ARGSORT:
case GGML_OP_SUM_ROWS:
+ case GGML_OP_IM2COL:
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ case GGML_OP_LEAKY_RELU:
break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(node)) {
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_GELU:
+ case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_RELU:
+ case GGML_UNARY_OP_TANH:
break;
default:
return;
@@ -5236,6 +5519,13 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
break;
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
+ if (
+ x_sz > ctx->device->max_memory_allocation_size ||
+ y_sz > ctx->device->max_memory_allocation_size ||
+ d_sz > ctx->device->max_memory_allocation_size ||
+ split_k_size > ctx->device->max_memory_allocation_size) {
+ GGML_ABORT("Requested preallocation size is too large");
+ }
if (ctx->prealloc_size_x < x_sz) {
ctx->prealloc_size_x = x_sz;
}
@@ -5391,7 +5681,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
std::cerr << std::endl;
}
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
#endif
if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
@@ -5430,7 +5720,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
}
}
-static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
+static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node){
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
if (ggml_is_empty(node) || extra == nullptr) {
@@ -5457,7 +5747,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
switch (ggml_get_unary_op(node)) {
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_GELU:
+ case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_RELU:
+ case GGML_UNARY_OP_TANH:
break;
default:
return;
@@ -5468,13 +5760,17 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_ADD:
case GGML_OP_MUL:
case GGML_OP_DIV:
+ case GGML_OP_CONCAT:
+ case GGML_OP_UPSCALE:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_CLAMP:
+ case GGML_OP_PAD:
case GGML_OP_CPY:
case GGML_OP_CONT:
case GGML_OP_DUP:
case GGML_OP_NORM:
+ case GGML_OP_GROUP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
@@ -5483,109 +5779,147 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_MUL_MAT_ID:
case GGML_OP_ARGSORT:
case GGML_OP_SUM_ROWS:
+ case GGML_OP_IM2COL:
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ case GGML_OP_LEAKY_RELU:
break;
default:
std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
return;
}
- if (ctx->compute_ctx == nullptr) {
- ctx->compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
- ggml_vk_ctx_begin(ctx->device, ctx->compute_ctx);
+ vk_context compute_ctx;
+
+ if (ctx->compute_ctx.expired()) {
+ compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+ ctx->compute_ctx = compute_ctx;
+ ggml_vk_ctx_begin(ctx->device, compute_ctx);
+ } else {
+ compute_ctx = ctx->compute_ctx.lock();
}
switch (node->op) {
case GGML_OP_REPEAT:
- ggml_vk_repeat(ctx, ctx->compute_ctx, src0, src1, node);
+ ggml_vk_repeat(ctx, compute_ctx, src0, node);
break;
case GGML_OP_GET_ROWS:
- ggml_vk_get_rows(ctx, ctx->compute_ctx, src0, src1, node);
+ ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_ADD:
- ggml_vk_add(ctx, ctx->compute_ctx, src0, src1, node);
+ ggml_vk_add(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_MUL:
- ggml_vk_mul(ctx, ctx->compute_ctx, src0, src1, node);
+ ggml_vk_mul(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_DIV:
- ggml_vk_div(ctx, ctx->compute_ctx, src0, src1, node);
+ ggml_vk_div(ctx, compute_ctx, src0, src1, node);
+
+ break;
+ case GGML_OP_CONCAT:
+ ggml_vk_concat(ctx, compute_ctx, src0, src1, node);
+
+ break;
+ case GGML_OP_UPSCALE:
+ ggml_vk_upscale(ctx, compute_ctx, src0, node);
break;
case GGML_OP_SCALE:
- ggml_vk_scale(ctx, ctx->compute_ctx, src0, node);
+ ggml_vk_scale(ctx, compute_ctx, src0, node);
break;
case GGML_OP_SQR:
- ggml_vk_sqr(ctx, ctx->compute_ctx, src0, node);
+ ggml_vk_sqr(ctx, compute_ctx, src0, node);
break;
case GGML_OP_CLAMP:
- ggml_vk_clamp(ctx, ctx->compute_ctx, src0, node);
+ ggml_vk_clamp(ctx, compute_ctx, src0, node);
+
+ break;
+ case GGML_OP_PAD:
+ ggml_vk_pad(ctx, compute_ctx, src0, node);
break;
case GGML_OP_CPY:
case GGML_OP_CONT:
case GGML_OP_DUP:
- ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node);
+ ggml_vk_cpy(ctx, compute_ctx, src0, node);
break;
case GGML_OP_NORM:
- ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
+ ggml_vk_norm(ctx, compute_ctx, src0, node);
+
+ break;
+ case GGML_OP_GROUP_NORM:
+ ggml_vk_group_norm(ctx, compute_ctx, src0, node);
break;
case GGML_OP_RMS_NORM:
- ggml_vk_rms_norm(ctx, ctx->compute_ctx, src0, node);
+ ggml_vk_rms_norm(ctx, compute_ctx, src0, node);
break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(node)) {
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_GELU:
+ case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_RELU:
- ggml_vk_unary(ctx, ctx->compute_ctx, src0, node);
+ case GGML_UNARY_OP_TANH:
+ ggml_vk_unary(ctx, compute_ctx, src0, node);
break;
default:
return;
}
break;
case GGML_OP_DIAG_MASK_INF:
- ggml_vk_diag_mask_inf(ctx, ctx->compute_ctx, src0, node);
+ ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node);
break;
case GGML_OP_SOFT_MAX:
- ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, node);
+ ggml_vk_soft_max(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_ROPE:
- ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, src2, node);
+ ggml_vk_rope(ctx, compute_ctx, src0, src1, src2, node);
break;
case GGML_OP_ARGSORT:
- ggml_vk_argsort(ctx, ctx->compute_ctx, src0, node);
+ ggml_vk_argsort(ctx, compute_ctx, src0, node);
break;
case GGML_OP_SUM_ROWS:
- ggml_vk_sum_rows(ctx, ctx->compute_ctx, src0, node);
+ ggml_vk_sum_rows(ctx, compute_ctx, src0, node);
+
+ break;
+ case GGML_OP_IM2COL:
+ ggml_vk_im2col(ctx, compute_ctx, src0, src1, node);
+
+ break;
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node);
+
+ break;
+ case GGML_OP_LEAKY_RELU:
+ ggml_vk_leaky_relu(ctx, compute_ctx, src0, node);
break;
case GGML_OP_MUL_MAT:
- ggml_vk_mul_mat(ctx, ctx->compute_ctx, src0, src1, node);
+ ggml_vk_mul_mat(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_MUL_MAT_ID:
- ggml_vk_mul_mat_id(ctx, ctx->compute_ctx, src0, src1, src2, node);
+ ggml_vk_mul_mat_id(ctx, compute_ctx, src0, src1, src2, node);
break;
default:
return;
}
- extra->ctx_idx = ctx->compute_ctx->idx;
+ ctx->tensor_ctxs[node_idx] = compute_ctx;
#ifdef GGML_VULKAN_CHECK_RESULTS
// Force context reset on each node so that each tensor ends up in its own context
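The context plumbing above (and in the transfer paths further down) replaces raw `vk_context *` members with handles that support `.expired()`, `.lock()` and `.reset()`, i.e. a shared_ptr/weak_ptr ownership scheme. A minimal standalone sketch of that pattern, using demo names rather than the actual ggml-vulkan types:

    #include <memory>

    struct demo_context_struct {
        int exit_tensor_idx = -1;          // which node closes this context
    };
    using demo_context = std::shared_ptr<demo_context_struct>;

    struct demo_backend {
        std::weak_ptr<demo_context_struct> compute_ctx;  // non-owning slot
    };

    // Reuse the live context if one exists, otherwise create a new one and
    // remember it through a weak reference only (the caller owns it).
    static demo_context demo_get_compute_ctx(demo_backend & b) {
        demo_context h;
        if (b.compute_ctx.expired()) {
            h = std::make_shared<demo_context_struct>();
            b.compute_ctx = h;             // like ctx->compute_ctx = compute_ctx
        } else {
            h = b.compute_ctx.lock();      // promote weak_ptr to shared_ptr
        }
        return h;
    }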
@@ -5594,13 +5928,13 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
#endif
if (last_node) {
- ggml_vk_ctx_end(ctx->compute_ctx);
- ctx->compute_ctx->exit_tensor = node;
- ctx->compute_ctx = nullptr;
+ ggml_vk_ctx_end(compute_ctx);
+ compute_ctx->exit_tensor_idx = node_idx;
+ ctx->compute_ctx.reset();
}
}
-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){
ggml_tensor_extra_gpu * extra = nullptr;
switch (tensor->op) {
@@ -5608,13 +5942,17 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
case GGML_OP_GET_ROWS:
case GGML_OP_MUL:
case GGML_OP_DIV:
+ case GGML_OP_CONCAT:
+ case GGML_OP_UPSCALE:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_CLAMP:
+ case GGML_OP_PAD:
case GGML_OP_CPY:
case GGML_OP_CONT:
case GGML_OP_DUP:
case GGML_OP_NORM:
+ case GGML_OP_GROUP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
@@ -5626,6 +5964,10 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
case GGML_OP_NONE:
case GGML_OP_ARGSORT:
case GGML_OP_SUM_ROWS:
+ case GGML_OP_IM2COL:
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ case GGML_OP_LEAKY_RELU:
+ case GGML_OP_REPEAT:
extra = (ggml_tensor_extra_gpu *) tensor->extra;
break;
@@ -5633,7 +5975,9 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
switch (ggml_get_unary_op(tensor)) {
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_GELU:
+ case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_RELU:
+ case GGML_UNARY_OP_TANH:
extra = (ggml_tensor_extra_gpu *) tensor->extra;
break;
default:
@@ -5656,31 +6000,31 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
#ifdef GGML_VULKAN_CHECK_RESULTS
- ggml_vk_check_results_0(ctx, tensor);
+ ggml_vk_check_results_0(tensor);
#endif
- vk_context& subctx = ctx->gc.contexts[extra->ctx_idx];
+ vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();
// Only run if ctx hasn't been submitted yet
- if (!subctx.seqs.empty()) {
+ if (!subctx->seqs.empty()) {
// Do staging buffer copies
- for (auto& cpy : subctx.in_memcpys) {
+ for (auto& cpy : subctx->in_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
- ggml_vk_submit(&subctx, ctx->fence);
+ ggml_vk_submit(subctx, ctx->fence);
}
- if (tensor == subctx.exit_tensor) {
+ if (tensor_idx == subctx->exit_tensor_idx) {
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
ctx->device->device.resetFences({ ctx->fence });
// Do staging buffer copies
- for (auto& cpy : subctx.out_memcpys) {
+ for (auto& cpy : subctx->out_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
- subctx.in_memcpys.clear();
- subctx.out_memcpys.clear();
+ subctx->in_memcpys.clear();
+ subctx->out_memcpys.clear();
}
return true;
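For reference, the per-tensor execution above follows a submit-once, drain-at-exit scheme: input staging copies and the queue submit happen the first time a context is seen, and only the node recorded as `exit_tensor_idx` waits on the fence and performs the output copies. A reduced, CPU-only sketch of that control flow (fences and Vulkan calls replaced by placeholders):

    #include <cstddef>
    #include <cstring>
    #include <vector>

    struct demo_memcpy { void * dst; const void * src; size_t n; };

    struct demo_subctx {
        std::vector<demo_memcpy> in_memcpys;   // host -> staging copies
        std::vector<demo_memcpy> out_memcpys;  // staging -> host copies
        int  exit_tensor_idx = -1;
        bool submitted       = false;          // stands in for !seqs.empty()
    };

    static void demo_forward(demo_subctx & sc, int tensor_idx) {
        if (!sc.submitted) {
            for (auto & cpy : sc.in_memcpys) memcpy(cpy.dst, cpy.src, cpy.n);
            sc.submitted = true;               // placeholder for ggml_vk_submit()
        }
        if (tensor_idx == sc.exit_tensor_idx) {
            // real code: waitForFences + resetFences before reading results back
            for (auto & cpy : sc.out_memcpys) memcpy(cpy.dst, cpy.src, cpy.n);
            sc.in_memcpys.clear();
            sc.out_memcpys.clear();
        }
    }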
@@ -5725,8 +6069,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
ctx->staging_offset = 0;
- ctx->compute_ctx = nullptr;
- ctx->transfer_ctx = nullptr;
+ ctx->tensor_ctxs.clear();
ctx->gc.contexts.clear();
}
@@ -6063,15 +6406,20 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
- if (ctx->transfer_ctx == nullptr) {
+ vk_context transfer_ctx;
+
+ if (ctx->transfer_ctx.expired()) {
// Initialize new transfer context
- ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
- ggml_vk_ctx_begin(ctx->device, ctx->transfer_ctx);
+ transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
+ ctx->transfer_ctx = transfer_ctx;
+ ggml_vk_ctx_begin(ctx->device, transfer_ctx);
+ } else {
+ transfer_ctx = ctx->transfer_ctx.lock();
}
vk_buffer buf = extra->buffer_gpu.lock();
- ggml_vk_buffer_write_async(ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size, ctx->staging, ctx->staging_offset);
+ ggml_vk_buffer_write_async(transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size, ctx->staging, ctx->staging_offset);
}
GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6081,15 +6429,20 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
- if (ctx->transfer_ctx == nullptr) {
+ vk_context transfer_ctx;
+
+ if (ctx->transfer_ctx.expired()) {
// Initialize new transfer context
- ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
- ggml_vk_ctx_begin(ctx->device, ctx->transfer_ctx);
+ transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
+ ctx->transfer_ctx = transfer_ctx;
+ ggml_vk_ctx_begin(ctx->device, transfer_ctx);
+ } else {
+ transfer_ctx = ctx->transfer_ctx.lock();
}
vk_buffer buf = extra->buffer_gpu.lock();
- ggml_vk_buffer_read_async(ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size, ctx->staging, ctx->staging_offset);
+ ggml_vk_buffer_read_async(transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size, ctx->staging, ctx->staging_offset);
}
GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6099,16 +6452,21 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
- if (ctx->transfer_ctx == nullptr) {
+ vk_context transfer_ctx;
+
+ if (ctx->transfer_ctx.expired()) {
// Initialize new transfer context
- ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
- ggml_vk_ctx_begin(ctx->device, ctx->transfer_ctx);
+ transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
+ ctx->transfer_ctx = transfer_ctx;
+ ggml_vk_ctx_begin(ctx->device, transfer_ctx);
+ } else {
+ transfer_ctx = ctx->transfer_ctx.lock();
}
vk_buffer src_buf = src_extra->buffer_gpu.lock();
vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
- ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
+ ggml_vk_buffer_copy_async(transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
return true;
}
@@ -6118,25 +6476,27 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
- if(ctx->transfer_ctx == nullptr) {
+ if(ctx->transfer_ctx.expired()) {
return;
}
- ggml_vk_ctx_end(ctx->transfer_ctx);
+ vk_context transfer_ctx = ctx->transfer_ctx.lock();
+
+ ggml_vk_ctx_end(transfer_ctx);
- for (auto& cpy : ctx->transfer_ctx->in_memcpys) {
+ for (auto& cpy : transfer_ctx->in_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
- ggml_vk_submit(ctx->transfer_ctx, ctx->fence);
+ ggml_vk_submit(transfer_ctx, ctx->fence);
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences");
ctx->device->device.resetFences({ ctx->fence });
- for (auto& cpy : ctx->transfer_ctx->out_memcpys) {
+ for (auto& cpy : transfer_ctx->out_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
- ctx->transfer_ctx = nullptr;
+ ctx->transfer_ctx.reset();
}
static bool ggml_vk_is_empty(ggml_tensor * node) {
@@ -6159,8 +6519,11 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
last_node -= 1;
}
+ // Reserve tensor context space for all nodes
+ ctx->tensor_ctxs.resize(cgraph->n_nodes);
+
for (int i = 0; i < cgraph->n_nodes; i++) {
- ggml_vk_build_graph(ctx,cgraph->nodes[i], i == last_node);
+ ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node);
}
for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -6170,13 +6533,17 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
continue;
}
- bool ok = ggml_vk_compute_forward(ctx, node);
+ bool ok = ggml_vk_compute_forward(ctx, node, i);
if (!ok) {
- fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+ if (node->op == GGML_OP_UNARY) {
+ std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
+ } else {
+ std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+ }
}
#ifdef GGML_VULKAN_CHECK_RESULTS
else {
- ggml_vk_check_results_1(ctx, node);
+ ggml_vk_check_results_1(node);
}
#endif
GGML_ASSERT(ok);
@@ -6196,8 +6563,10 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_GELU:
+ case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_RELU:
+ case GGML_UNARY_OP_TANH:
return ggml_is_contiguous(op->src[0]);
default:
return false;
@@ -6270,11 +6639,11 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
}
return false;
} break;
- // case GGML_OP_REPEAT:
- // {
- // ggml_type src0_type = op->src[0]->type;
- // return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
- // } break;
+ case GGML_OP_REPEAT:
+ {
+ ggml_type src0_type = op->src[0]->type;
+ return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
+ } break;
case GGML_OP_ROPE:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_NONE:
@@ -6283,18 +6652,25 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_NORM:
+ case GGML_OP_GROUP_NORM:
+ case GGML_OP_RMS_NORM:
case GGML_OP_ADD:
case GGML_OP_MUL:
case GGML_OP_DIV:
- case GGML_OP_RMS_NORM:
+ case GGML_OP_CONCAT:
+ case GGML_OP_UPSCALE:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_CLAMP:
+ case GGML_OP_PAD:
case GGML_OP_CONT:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_ARGSORT:
case GGML_OP_SUM_ROWS:
+ case GGML_OP_IM2COL:
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ case GGML_OP_LEAKY_RELU:
return true;
default:
return false;
@@ -6498,7 +6874,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
} else if (tensor->type == GGML_TYPE_I32) {
val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
fprintf(stderr, "% 7.2f ", val);
} else {
@@ -6509,10 +6885,12 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
}
}
-static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
+static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) {
void * tensor_data = tensor->data;
- if (ggml_backend_buffer_is_vk(tensor->buffer)) {
+ const bool is_gpu = tensor->buffer != nullptr && ggml_backend_buffer_is_vk(tensor->buffer);
+
+ if (is_gpu) {
const size_t tensor_size = ggml_nbytes(tensor);
tensor_data = malloc(tensor_size);
@@ -6533,13 +6911,10 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
std::cerr << std::endl << "Result:" << std::endl;
ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
std::cerr << std::endl;
- std::cerr << std::endl << "Result:" << std::endl;
- ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0);
- std::cerr << std::endl;
std::vector<const ggml_tensor *> done;
ggml_vk_print_graph_origin(tensor, done);
- if (ggml_backend_buffer_is_vk(tensor->buffer)) {
+ if (is_gpu) {
free(tensor_data);
}
}
@@ -6548,8 +6923,8 @@ void * comp_result;
size_t comp_size;
size_t comp_nb[GGML_MAX_DIMS];
size_t check_counter = 0;
-static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * tensor) {
- if (tensor->op == GGML_OP_TRANSPOSE) {
+static void ggml_vk_check_results_0(ggml_tensor * tensor) {
+ if (tensor->op == GGML_OP_TRANSPOSE) {
return;
}
@@ -6565,7 +6940,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
ggml_tensor * src2 = tensor->src[2];
struct ggml_init_params iparams = {
- /*.mem_size =*/ 1024*1024*1024,
+ /*.mem_size =*/ 2ul*1024ul*1024ul*1024ul,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
};
@@ -6620,11 +6995,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
}
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
- ggml_vk_print_tensor(ctx, src0, "src0");
+ ggml_vk_print_tensor(src0, "src0");
}
}
if (src1 != nullptr) {
@@ -6662,27 +7037,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
}
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
- ggml_vk_print_tensor(ctx, src1, "src1");
- std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
- std::cerr << "src1_clone=" << tensor << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
- if (src1->src[0] != nullptr) {
- std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
- }
- if (src1->src[1] != nullptr) {
- std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
- }
- std::cerr << std::endl << "Result:" << std::endl;
- ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
- std::cerr << std::endl;
- std::cerr << std::endl << "Result:" << std::endl;
- ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 1, 0);
- std::cerr << std::endl;
- std::vector<const ggml_tensor *> done;
- ggml_vk_print_graph_origin(src1_clone, done);
+ ggml_vk_print_tensor(src1, "src1");
}
}
if (src2 != nullptr) {
@@ -6720,27 +7079,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
}
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
- ggml_vk_print_tensor(ctx, src2, "src2");
- std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl;
- std::cerr << "src2_clone=" << tensor << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
- if (src2->src[0] != nullptr) {
- std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
- }
- if (src2->src[1] != nullptr) {
- std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
- }
- std::cerr << std::endl << "Result:" << std::endl;
- ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0);
- std::cerr << std::endl;
- std::cerr << std::endl << "Result:" << std::endl;
- ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 1, 0);
- std::cerr << std::endl;
- std::vector<const ggml_tensor *> done;
- ggml_vk_print_graph_origin(src2_clone, done);
+ ggml_vk_print_tensor(src2, "src2");
}
}
@@ -6752,16 +7095,24 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
tensor_clone = ggml_mul(ggml_ctx, src0_clone, src1_clone);
} else if (tensor->op == GGML_OP_DIV) {
tensor_clone = ggml_div(ggml_ctx, src0_clone, src1_clone);
+ } else if (tensor->op == GGML_OP_CONCAT) {
+ tensor_clone = ggml_concat(ggml_ctx, src0_clone, src1_clone, *(int *)tensor->op_params);
+ } else if (tensor->op == GGML_OP_UPSCALE) {
+ tensor_clone = ggml_upscale_ext(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
} else if (tensor->op == GGML_OP_SCALE) {
tensor_clone = ggml_scale(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0]);
} else if (tensor->op == GGML_OP_SQR) {
tensor_clone = ggml_sqr(ggml_ctx, src0_clone);
} else if (tensor->op == GGML_OP_CLAMP) {
tensor_clone = ggml_clamp(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
+ } else if (tensor->op == GGML_OP_PAD) {
+ tensor_clone = ggml_pad(ggml_ctx, src0_clone, tensor->ne[0] - src0_clone->ne[0], tensor->ne[1] - src0_clone->ne[1], tensor->ne[2] - src0_clone->ne[2], tensor->ne[3] - src0_clone->ne[3]);
} else if (tensor->op == GGML_OP_ADD) {
tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone);
} else if (tensor->op == GGML_OP_NORM) {
tensor_clone = ggml_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
+ } else if (tensor->op == GGML_OP_GROUP_NORM) {
+ tensor_clone = ggml_group_norm(ggml_ctx, src0_clone, *(int *)tensor->op_params);
} else if (tensor->op == GGML_OP_RMS_NORM) {
tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
} else if (tensor->op == GGML_OP_SOFT_MAX) {
@@ -6777,12 +7128,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
const int mode = ((int32_t *) tensor->op_params)[2];
//const int n_ctx_ggml = ((int32_t *) tensor->op_params)[3];
const int n_ctx_orig_ggml = ((int32_t *) tensor->op_params)[4];
- float freq_base = ((float *) tensor->op_params)[5];
- float freq_scale = ((float *) tensor->op_params)[6];
- float ext_factor = ((float *) tensor->op_params)[7];
- float attn_factor = ((float *) tensor->op_params)[8];
- float beta_fast = ((float *) tensor->op_params)[9];
- float beta_slow = ((float *) tensor->op_params)[10];
+ const float freq_base = ((float *) tensor->op_params)[5];
+ const float freq_scale = ((float *) tensor->op_params)[6];
+ const float ext_factor = ((float *) tensor->op_params)[7];
+ const float attn_factor = ((float *) tensor->op_params)[8];
+ const float beta_fast = ((float *) tensor->op_params)[9];
+ const float beta_slow = ((float *) tensor->op_params)[10];
tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
} else if (tensor->op == GGML_OP_UNARY) {
switch (ggml_get_unary_op(tensor)) {
@@ -6792,12 +7143,18 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
case GGML_UNARY_OP_GELU:
tensor_clone = ggml_gelu(ggml_ctx, src0_clone);
break;
+ case GGML_UNARY_OP_GELU_QUICK:
+ tensor_clone = ggml_gelu_quick(ggml_ctx, src0_clone);
+ break;
case GGML_UNARY_OP_RELU:
tensor_clone = ggml_relu(ggml_ctx, src0_clone);
break;
+ case GGML_UNARY_OP_TANH:
+ tensor_clone = ggml_tanh(ggml_ctx, src0_clone);
+ break;
default:
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
} else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
if (src1 == nullptr) {
@@ -6823,9 +7180,26 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
tensor_clone = ggml_argsort(ggml_ctx, src0_clone, (ggml_sort_order) *(int *)tensor->op_params);
} else if (tensor->op == GGML_OP_SUM_ROWS) {
tensor_clone = ggml_sum_rows(ggml_ctx, src0_clone);
+ } else if (tensor->op == GGML_OP_IM2COL) {
+ const int32_t s0 = tensor->op_params[0];
+ const int32_t s1 = tensor->op_params[1];
+ const int32_t p0 = tensor->op_params[2];
+ const int32_t p1 = tensor->op_params[3];
+ const int32_t d0 = tensor->op_params[4];
+ const int32_t d1 = tensor->op_params[5];
+
+ const bool is_2D = tensor->op_params[6] == 1;
+ tensor_clone = ggml_im2col(ggml_ctx, src0_clone, src1_clone, s0, s1, p0, p1, d0, d1, is_2D, tensor->type);
+ } else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) {
+ const int32_t dim = tensor->op_params[0];
+ const int32_t max_period = tensor->op_params[1];
+ tensor_clone = ggml_timestep_embedding(ggml_ctx, src0_clone, dim, max_period);
+ } else if (tensor->op == GGML_OP_LEAKY_RELU) {
+ const float * op_params = (const float *)tensor->op_params;
+ tensor_clone = ggml_leaky_relu(ggml_ctx, src0_clone, op_params[0], false);
} else {
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
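The new reference clones for IM2COL, TIMESTEP_EMBEDDING and LEAKY_RELU all recover their parameters positionally from the tensor's `op_params` array, which stores int32 slots that are reinterpreted as float where needed. A reduced sketch of that decoding, using a demo struct rather than `ggml_tensor`:

    #include <cstdint>
    #include <cstring>

    struct demo_tensor { int32_t op_params[16]; };

    struct demo_im2col_params { int32_t s0, s1, p0, p1, d0, d1; bool is_2D; };

    // Integer parameters are read back by slot index, in the order they were
    // stored when the op was created (s0, s1, p0, p1, d0, d1, is_2D).
    static demo_im2col_params demo_read_im2col(const demo_tensor & t) {
        return { t.op_params[0], t.op_params[1], t.op_params[2],
                 t.op_params[3], t.op_params[4], t.op_params[5],
                 t.op_params[6] == 1 };
    }

    // Float parameters (e.g. the leaky_relu negative slope) share the same
    // int32 storage and are reinterpreted, hence the float casts above.
    static float demo_read_f32_param(const demo_tensor & t, int slot) {
        float v;
        memcpy(&v, &t.op_params[slot], sizeof(float));
        return v;
    }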
@@ -6834,7 +7208,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
- ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
+ ggml_vk_print_tensor(tensor_clone, "tensor_clone");
}
comp_size = ggml_nbytes(tensor_clone);
@@ -6851,9 +7225,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
}
ggml_free(ggml_ctx);
+
+ VK_LOG_DEBUG("END ggml_vk_check_results_0(" << tensor->name << ")");
}
-static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * tensor) {
+static void ggml_vk_check_results_1(ggml_tensor * tensor) {
if (tensor->op == GGML_OP_TRANSPOSE) {
return;
}
@@ -6912,7 +7288,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
}
} else {
std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl;
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
@@ -6935,7 +7311,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
std::cerr << std::endl;
std::vector<const ggml_tensor *> done;
ggml_vk_print_graph_origin(tensor, done);
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
if (first_error[0] == -1 && std::fabs(correct - result) > 0.1f) {
first_error[0] = i0;
@@ -6977,11 +7353,6 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
std::cerr << std::endl << "Correct:" << std::endl;
ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 0, 0);
std::cerr << std::endl;
- std::cerr << std::endl << "Result:" << std::endl;
- ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0);
- std::cerr << std::endl << "Correct:" << std::endl;
- ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 1, 0);
- std::cerr << std::endl;
std::vector<const ggml_tensor *> done;
ggml_vk_print_graph_origin(tensor, done);
}
@@ -7006,7 +7377,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
std::cerr << std::endl;
std::vector<const ggml_tensor *> done;
ggml_vk_print_graph_origin(tensor, done);
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
} else {
std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
}
@@ -7018,5 +7389,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
if (ggml_backend_buffer_is_vk(tensor->buffer)) {
free(tensor_data);
}
+
+ VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")");
}
#endif
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index b5fdb96d..73054bfe 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -47,6 +47,9 @@
#include <unistd.h>
#endif
+#if defined(__ARM_FEATURE_SVE)
+int ggml_sve_cnt_b = 0;
+#endif
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
#undef GGML_USE_LLAMAFILE
#endif
@@ -63,6 +66,9 @@
// disable POSIX deprecation warnings
// these functions are never going away, anyway
#pragma warning(disable: 4996)
+
+// unreachable code because of multiple instances of code after GGML_ABORT
+#pragma warning(disable: 4702)
#endif
#if defined(_WIN32)
@@ -151,23 +157,69 @@ typedef pthread_t ggml_thread_t;
#include <sys/wait.h>
-void ggml_print_backtrace(void) {
- /*
- #include <execinfo.h>
- #include <dlfcn.h>
+#if defined(__ANDROID__)
+#include <unwind.h>
+#include <dlfcn.h>
+#include <stdio.h>
- void * trace[100];
+struct backtrace_state {
+ void ** current;
+ void ** end;
+};
- int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
+static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) {
+ struct backtrace_state * state = (struct backtrace_state *)arg;
+ uintptr_t pc = _Unwind_GetIP(context);
+ if (pc) {
+ if (state->current == state->end) {
+ return _URC_END_OF_STACK;
+ } else {
+ *state->current++ = (void*)pc;
+ }
+ }
+ return _URC_NO_REASON;
+}
+
+static void ggml_print_backtrace_symbols(void) {
+ const int max = 100;
+ void* buffer[max];
+ struct backtrace_state state = {buffer, buffer + max};
+ _Unwind_Backtrace(unwind_callback, &state);
+
+ int count = state.current - buffer;
+
+ for (int idx = 0; idx < count; ++idx) {
+ const void * addr = buffer[idx];
+ const char * symbol = "";
+
+ Dl_info info;
+ if (dladdr(addr, &info) && info.dli_sname) {
+ symbol = info.dli_sname;
+ }
+
+ fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
+ }
+}
+#elif defined(__linux__) && defined(__GLIBC__)
+#include <execinfo.h>
+static void ggml_print_backtrace_symbols(void) {
+ void * trace[100];
+ int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
- */
+}
+#else
+static void ggml_print_backtrace_symbols(void) {
+ // platform not supported
+}
+#endif
- // backtrack_symbols does not show line numbers, use gdb instead
+static void ggml_print_backtrace(void) {
char attach[32];
snprintf(attach, sizeof(attach), "attach %d", getpid());
int pid = fork();
if (pid == 0) {
+ // try gdb
execlp("gdb", "gdb", "--batch",
"-ex", "set style enabled on",
"-ex", attach,
@@ -175,16 +227,46 @@ void ggml_print_backtrace(void) {
"-ex", "detach",
"-ex", "quit",
(char *) NULL);
+ // try lldb
+ execlp("lldb", "lldb", "--batch",
+ "-o", "bt",
+ "-o", "quit",
+ "-p", attach,
+ (char *) NULL);
+ exit(EXIT_FAILURE);
} else {
- waitpid(pid, NULL, 0);
+ int wstatus;
+ waitpid(pid, &wstatus, 0);
+ if (WIFEXITED(wstatus)) {
+ if (WEXITSTATUS(wstatus) == EXIT_FAILURE) {
+ // gdb failed, fallback to backtrace_symbols
+ ggml_print_backtrace_symbols();
+ }
+ }
}
}
#else
-void ggml_print_backtrace(void) {
+static void ggml_print_backtrace(void) {
// platform not supported
}
#endif
+void ggml_abort(const char * file, int line, const char * fmt, ...) {
+ fflush(stdout);
+
+ fprintf(stderr, "%s:%d: ", file, line);
+
+ va_list args;
+ va_start(args, fmt);
+ vfprintf(stderr, fmt, args);
+ va_end(args);
+
+ fprintf(stderr, "\n");
+
+ ggml_print_backtrace();
+ abort();
+}
+
#define GGML_DEBUG 0
#define GGML_GELU_FP16
#define GGML_GELU_QUICK_FP16
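ggml_abort() is the runtime half of the GGML_ABORT macro used throughout this diff; the macro itself lives in the public header and is assumed here to forward __FILE__/__LINE__ plus a printf-style message. A standalone sketch of that pairing (demo names, not the project's actual definitions):

    #include <cstdarg>
    #include <cstdio>
    #include <cstdlib>

    [[noreturn]] static void demo_abort(const char * file, int line, const char * fmt, ...) {
        fflush(stdout);
        fprintf(stderr, "%s:%d: ", file, line);
        va_list args;
        va_start(args, fmt);
        vfprintf(stderr, fmt, args);
        va_end(args);
        fprintf(stderr, "\n");
        abort();                     // the real helper also prints a backtrace first
    }

    // Call sites then only pass the message, e.g. DEMO_ABORT("fatal error");
    #define DEMO_ABORT(...) demo_abort(__FILE__, __LINE__, __VA_ARGS__)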
@@ -256,7 +338,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
break;
}
GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
return NULL;
}
return aligned_memory;
@@ -277,7 +359,7 @@ inline static void * ggml_malloc(size_t size) {
void * result = malloc(size);
if (result == NULL) {
GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
return result;
}
@@ -291,7 +373,7 @@ inline static void * ggml_calloc(size_t num, size_t size) {
void * result = calloc(num, size);
if (result == NULL) {
GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
return result;
}
@@ -414,9 +496,16 @@ void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
}
}
+void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) {
+ for (int i = 0; i < n; i++) {
+ y[i] = ggml_compute_fp32_to_bf16(x[i]);
+ }
+}
+
void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) {
int i = 0;
#if defined(__AVX512BF16__)
+ // subnormals are flushed to zero on this platform
for (; i + 32 <= n; i += 32) {
_mm512_storeu_si512(
(__m512i *)(y + i),
@@ -939,7 +1028,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.is_quantized = false,
.to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
.from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
- .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+ .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
.vec_dot_type = GGML_TYPE_BF16,
.nrows = 1,
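ggml_fp32_to_bf16_row_ref converts element by element and becomes the `from_float_ref` entry registered above, while the vectorized row routine keeps its AVX512-BF16 fast path that flushes subnormals. For orientation, a generic scalar fp32-to-bf16 conversion with round-to-nearest-even looks roughly like the following; this is an illustration, not ggml_compute_fp32_to_bf16 itself:

    #include <cstdint>
    #include <cstring>

    static uint16_t demo_fp32_to_bf16(float f) {
        uint32_t bits;
        memcpy(&bits, &f, sizeof(bits));
        if ((bits & 0x7fffffffu) > 0x7f800000u) {
            return (uint16_t)((bits >> 16) | 0x0040u);   // NaN: keep it a quiet NaN
        }
        bits += 0x7fffu + ((bits >> 16) & 1u);           // round to nearest, ties to even
        return (uint16_t)(bits >> 16);                   // keep sign, exponent, top 7 mantissa bits
    }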
@@ -2339,7 +2428,7 @@ inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) {
inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
//inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
-inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
+inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
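The ELU change swaps expf(x) - 1 for expm1f(x), which avoids catastrophic cancellation for inputs close to zero. A tiny, self-contained illustration of the difference:

    #include <cmath>
    #include <cstdio>

    int main() {
        const float x = -1e-7f;
        printf("expf(x) - 1 = %.9g\n", expf(x) - 1.0f);  // most significant bits lost to cancellation
        printf("expm1f(x)   = %.9g\n", expm1f(x));       // close to -1e-07, as expected
        return 0;
    }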
@@ -3544,7 +3633,7 @@ static inline int ggml_up(int n, int m) {
}
// assert that pointer is aligned to GGML_MEM_ALIGN
-#define ggml_assert_aligned(ptr) \
+#define GGML_ASSERT_ALIGNED(ptr) \
GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
////////////////////////////////////////////////////////////////////////////////
@@ -3645,7 +3734,13 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
GGML_ASSERT(ctx->mem_buffer != NULL);
- ggml_assert_aligned(ctx->mem_buffer);
+ GGML_ASSERT_ALIGNED(ctx->mem_buffer);
+
+#if defined(__ARM_FEATURE_SVE)
+ if (!ggml_sve_cnt_b) {
+ ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+ }
+#endif
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
@@ -3777,7 +3872,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
.type = type,
};
- ggml_assert_aligned(mem_buffer + obj_new->offs);
+ GGML_ASSERT_ALIGNED(mem_buffer + obj_new->offs);
if (obj_cur != NULL) {
obj_cur->next = obj_new;
@@ -3801,7 +3896,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
struct ggml_tensor * view_src,
size_t view_offs) {
- assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
+ GGML_ASSERT(type >= 0 && type < GGML_TYPE_COUNT);
+ GGML_ASSERT(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
// find the base tensor and absolute offset
if (view_src != NULL && view_src->view_src != NULL) {
@@ -3878,7 +3974,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
#endif
// TODO: this should not be needed as long as we don't rely on aligned SIMD loads
- //ggml_assert_aligned(result->data);
+ //GGML_ASSERT_ALIGNED(result->data);
for (int i = 0; i < n_dims; i++) {
result->ne[i] = ne[i];
@@ -4051,8 +4147,8 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
return tensor;
@@ -4110,8 +4206,8 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
return tensor;
@@ -4180,11 +4276,9 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
}
default:
{
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
-
- return 0.0f;
}
void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
@@ -4227,8 +4321,8 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -4248,10 +4342,8 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i
case GGML_TYPE_F32:
return ((float *) data)[0];
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
-
- return 0.0f;
}
void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) {
@@ -4283,8 +4375,8 @@ void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -4321,11 +4413,9 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
}
default:
{
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
-
- return 0.0f;
}
void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
@@ -4362,8 +4452,8 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -4383,10 +4473,8 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
case GGML_TYPE_F32:
return ((float *) data)[0];
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
-
- return 0.0f;
}
void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) {
@@ -4418,8 +4506,8 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -4442,8 +4530,11 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) {
}
struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
- strncpy(tensor->name, name, sizeof(tensor->name) - 1);
- tensor->name[sizeof(tensor->name) - 1] = '\0';
+ size_t i;
+ for (i = 0; i < sizeof(tensor->name) - 1 && name[i] != '\0'; i++) {
+ tensor->name[i] = name[i];
+ }
+ tensor->name[i] = '\0';
return tensor;
}
@@ -5014,7 +5105,7 @@ struct ggml_tensor * ggml_mean(
bool is_node = false;
if (a->grad) {
- GGML_ASSERT(false); // TODO: implement
+ GGML_ABORT("fatal error"); // TODO: implement
is_node = true;
}
@@ -5037,7 +5128,7 @@ struct ggml_tensor * ggml_argmax(
bool is_node = false;
if (a->grad) {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
is_node = true;
}
@@ -5360,7 +5451,7 @@ static struct ggml_tensor * ggml_norm_impl(
bool is_node = false;
if (!inplace && (a->grad)) {
- GGML_ASSERT(false); // TODO: implement backward
+ GGML_ABORT("fatal error"); // TODO: implement backward
is_node = true;
}
@@ -5459,17 +5550,19 @@ static struct ggml_tensor * ggml_group_norm_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_groups,
+ float eps,
bool inplace) {
bool is_node = false;
if (!inplace && (a->grad)) {
- GGML_ASSERT(false); // TODO: implement backward
+ GGML_ABORT("fatal error"); // TODO: implement backward
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
- result->op_params[0] = n_groups;
+ ggml_set_op_params_i32(result, 0, n_groups);
+ ggml_set_op_params_f32(result, 1, eps);
result->op = GGML_OP_GROUP_NORM;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5481,15 +5574,17 @@ static struct ggml_tensor * ggml_group_norm_impl(
struct ggml_tensor * ggml_group_norm(
struct ggml_context * ctx,
struct ggml_tensor * a,
- int n_groups) {
- return ggml_group_norm_impl(ctx, a, n_groups, false);
+ int n_groups,
+ float eps) {
+ return ggml_group_norm_impl(ctx, a, n_groups, eps, false);
}
struct ggml_tensor * ggml_group_norm_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
- int n_groups) {
- return ggml_group_norm_impl(ctx, a, n_groups, true);
+ int n_groups,
+ float eps) {
+ return ggml_group_norm_impl(ctx, a, n_groups, eps, true);
}
// ggml_mul_mat
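ggml_group_norm now takes eps explicitly instead of the forward pass hard-coding 1e-6f, and stores it next to the group count in op_params: slot 0 holds an int32, slot 1 a float written through ggml_set_op_params_f32. A reduced sketch of that paired packing/unpacking (demo struct, not ggml_tensor):

    #include <cstdint>
    #include <cstring>

    struct demo_op_params { int32_t raw[8]; };

    static void demo_set_group_norm_params(demo_op_params & p, int32_t n_groups, float eps) {
        p.raw[0] = n_groups;                       // like ggml_set_op_params_i32(result, 0, ...)
        memcpy(&p.raw[1], &eps, sizeof(float));    // like ggml_set_op_params_f32(result, 1, ...)
    }

    static void demo_get_group_norm_params(const demo_op_params & p, int32_t & n_groups, float & eps) {
        n_groups = p.raw[0];
        memcpy(&eps, &p.raw[1], sizeof(float));    // mirrors the memcpy in the CPU forward pass
    }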
@@ -5877,7 +5972,7 @@ struct ggml_tensor * ggml_reshape(
if (b->grad) {
// gradient propagation is not supported
- //GGML_ASSERT(false);
+ //GGML_ABORT("fatal error");
}
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
@@ -6660,7 +6755,7 @@ struct ggml_tensor * ggml_clamp(
bool is_node = false;
if (a->grad) {
- GGML_ASSERT(false); // TODO: implement backward
+ GGML_ABORT("fatal error"); // TODO: implement backward
is_node = true;
}
@@ -6736,7 +6831,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
bool is_node = false;
if (a->grad || b->grad) {
- GGML_ASSERT(false); // TODO: implement backward
+ GGML_ABORT("fatal error"); // TODO: implement backward
is_node = true;
}
@@ -6808,7 +6903,7 @@ struct ggml_tensor * ggml_im2col(
bool is_node = false;
if (a->grad || b->grad) {
- GGML_ASSERT(false); // TODO: implement backward
+ GGML_ABORT("fatal error"); // TODO: implement backward
is_node = true;
}
@@ -6894,7 +6989,7 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
bool is_node = false;
if (a->grad || b->grad) {
- GGML_ASSERT(false); // TODO: implement backward
+ GGML_ABORT("fatal error"); // TODO: implement backward
is_node = true;
}
@@ -6935,7 +7030,7 @@ struct ggml_tensor * ggml_pool_1d(
bool is_node = false;
if (a->grad) {
- GGML_ASSERT(false); // TODO: implement backward
+ GGML_ABORT("fatal error"); // TODO: implement backward
is_node = true;
}
@@ -6973,7 +7068,7 @@ struct ggml_tensor * ggml_pool_2d(
bool is_node = false;
if (a->grad) {
- GGML_ASSERT(false); // TODO: implement backward
+ GGML_ABORT("fatal error"); // TODO: implement backward
is_node = true;
}
@@ -7006,7 +7101,7 @@ static struct ggml_tensor * ggml_upscale_impl(
bool is_node = false;
if (a->grad) {
- GGML_ASSERT(false); // TODO: implement backward
+ GGML_ABORT("fatal error"); // TODO: implement backward
is_node = true;
}
@@ -7056,7 +7151,7 @@ struct ggml_tensor * ggml_pad(
bool is_node = false;
if (a->grad) {
- GGML_ASSERT(false); // TODO: implement backward
+ GGML_ABORT("fatal error"); // TODO: implement backward
is_node = true;
}
@@ -7105,7 +7200,7 @@ struct ggml_tensor * ggml_timestep_embedding(
bool is_node = false;
if (timesteps->grad) {
- GGML_ASSERT(false); // TODO: implement backward
+ GGML_ABORT("fatal error"); // TODO: implement backward
is_node = true;
}
@@ -7231,7 +7326,7 @@ struct ggml_tensor * ggml_flash_attn_back(
struct ggml_tensor * v,
struct ggml_tensor * d,
bool masked) {
- GGML_ASSERT(false && "TODO: adapt to ggml_flash_attn_ext() changes");
+ GGML_ABORT("TODO: adapt to ggml_flash_attn_ext() changes");
GGML_ASSERT(ggml_can_mul_mat(k, q));
// TODO: check if vT can be multiplied by (k*qT)
@@ -7330,7 +7425,7 @@ struct ggml_tensor * ggml_ssm_conv(
bool is_node = false;
if (s->grad || x->grad || c->grad || sq->grad) {
- GGML_ASSERT(false); // TODO: implement
+ GGML_ABORT("fatal error"); // TODO: implement
is_node = true;
}
@@ -7384,7 +7479,7 @@ struct ggml_tensor * ggml_ssm_scan(
bool is_node = false;
if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad || sq->grad) {
- GGML_ASSERT(false); // TODO: implement
+ GGML_ABORT("fatal error"); // TODO: implement
is_node = true;
}
@@ -7416,7 +7511,7 @@ struct ggml_tensor * ggml_win_part(
bool is_node = false;
if (a->grad) {
- GGML_ASSERT(false); // TODO: implement backward
+ GGML_ABORT("fatal error"); // TODO: implement backward
is_node = true;
}
@@ -7454,7 +7549,7 @@ struct ggml_tensor * ggml_win_unpart(
bool is_node = false;
if (a->grad) {
- GGML_ASSERT(false); // TODO: implement backward
+ GGML_ABORT("fatal error"); // TODO: implement backward
is_node = true;
}
@@ -7484,7 +7579,7 @@ struct ggml_tensor * ggml_get_rel_pos(
bool is_node = false;
if (a->grad) {
- GGML_ASSERT(false); // TODO: implement backward
+ GGML_ABORT("fatal error"); // TODO: implement backward
is_node = true;
}
@@ -8174,7 +8269,7 @@ static void ggml_compute_forward_dup_f16(
}
}
} else {
- GGML_ASSERT(false); // TODO: implement
+ GGML_ABORT("fatal error"); // TODO: implement
}
} else {
//printf("%s: this is not optimal - fix me\n", __func__);
@@ -8216,7 +8311,7 @@ static void ggml_compute_forward_dup_f16(
}
}
} else {
- GGML_ASSERT(false); // TODO: implement
+ GGML_ABORT("fatal error"); // TODO: implement
}
}
return;
@@ -8333,7 +8428,7 @@ static void ggml_compute_forward_dup_f16(
}
}
} else {
- GGML_ASSERT(false); // TODO: implement
+ GGML_ABORT("fatal error"); // TODO: implement
}
}
@@ -8460,7 +8555,7 @@ static void ggml_compute_forward_dup_bf16(
}
}
} else {
- GGML_ASSERT(false); // TODO: implement
+ GGML_ABORT("fatal error"); // TODO: implement
}
} else {
//printf("%s: this is not optimal - fix me\n", __func__);
@@ -8520,7 +8615,7 @@ static void ggml_compute_forward_dup_bf16(
}
}
} else {
- GGML_ASSERT(false); // TODO: implement
+ GGML_ABORT("fatal error"); // TODO: implement
}
}
return;
@@ -8689,7 +8784,7 @@ static void ggml_compute_forward_dup_bf16(
}
}
} else {
- GGML_ASSERT(false); // TODO: implement
+ GGML_ABORT("fatal error"); // TODO: implement
}
}
@@ -8775,7 +8870,7 @@ static void ggml_compute_forward_dup_f32(
}
}
} else {
- GGML_ASSERT(false); // TODO: implement
+ GGML_ABORT("fatal error"); // TODO: implement
}
} else {
//printf("%s: this is not optimal - fix me\n", __func__);
@@ -8835,7 +8930,7 @@ static void ggml_compute_forward_dup_f32(
}
}
} else {
- GGML_ASSERT(false); // TODO: implement
+ GGML_ABORT("fatal error"); // TODO: implement
}
}
@@ -9006,7 +9101,7 @@ static void ggml_compute_forward_dup_f32(
}
}
} else {
- GGML_ASSERT(false); // TODO: implement
+ GGML_ABORT("fatal error"); // TODO: implement
}
}
@@ -9184,8 +9279,8 @@ static void ggml_compute_forward_dup(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -9337,7 +9432,7 @@ static void ggml_compute_forward_add_f16_f32(
}
else {
// src1 is not contiguous
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
@@ -9412,7 +9507,7 @@ static void ggml_compute_forward_add_bf16_f32(
}
else {
// src1 is not contiguous
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
@@ -9464,7 +9559,7 @@ static void ggml_compute_forward_add_f16_f16(
}
else {
// src1 is not contiguous
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
@@ -9516,7 +9611,7 @@ static void ggml_compute_forward_add_bf16_bf16(
}
else {
// src1 is not contiguous
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
}
@@ -9610,7 +9705,7 @@ static void ggml_compute_forward_add(
ggml_compute_forward_add_f32(params, dst);
}
else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
} break;
case GGML_TYPE_F16:
@@ -9622,7 +9717,7 @@ static void ggml_compute_forward_add(
ggml_compute_forward_add_f16_f32(params, dst);
}
else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
} break;
case GGML_TYPE_BF16:
@@ -9634,7 +9729,7 @@ static void ggml_compute_forward_add(
ggml_compute_forward_add_bf16_f32(params, dst);
}
else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
} break;
case GGML_TYPE_Q4_0:
@@ -9672,8 +9767,8 @@ static void ggml_compute_forward_add(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -10007,7 +10102,7 @@ static void ggml_compute_forward_add1(
ggml_compute_forward_add1_f16_f32(params, dst);
}
else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
} break;
case GGML_TYPE_BF16:
@@ -10019,7 +10114,7 @@ static void ggml_compute_forward_add1(
ggml_compute_forward_add1_bf16_f32(params, dst);
}
else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
} break;
case GGML_TYPE_Q4_0:
@@ -10058,8 +10153,8 @@ static void ggml_compute_forward_add1(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -10191,8 +10286,8 @@ static void ggml_compute_forward_acc(
case GGML_TYPE_Q4_0_8_8:
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -10272,8 +10367,8 @@ static void ggml_compute_forward_sub(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -10383,8 +10478,8 @@ static void ggml_compute_forward_mul(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -10474,8 +10569,8 @@ static void ggml_compute_forward_div(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -10519,8 +10614,8 @@ static void ggml_compute_forward_sqr(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -10564,8 +10659,8 @@ static void ggml_compute_forward_sqrt(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -10609,8 +10704,8 @@ static void ggml_compute_forward_log(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -10738,8 +10833,8 @@ static void ggml_compute_forward_sum(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -10791,8 +10886,8 @@ static void ggml_compute_forward_sum_rows(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -10848,8 +10943,8 @@ static void ggml_compute_forward_mean(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -10896,8 +10991,8 @@ static void ggml_compute_forward_argmax(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -11014,8 +11109,8 @@ static void ggml_compute_forward_repeat(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -11092,8 +11187,8 @@ static void ggml_compute_forward_repeat_back(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -11161,8 +11256,8 @@ static void ggml_compute_forward_concat(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -11205,8 +11300,8 @@ static void ggml_compute_forward_abs(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -11249,8 +11344,8 @@ static void ggml_compute_forward_sgn(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -11293,8 +11388,8 @@ static void ggml_compute_forward_neg(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -11337,8 +11432,8 @@ static void ggml_compute_forward_step(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -11380,8 +11475,8 @@ static void ggml_compute_forward_tanh(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -11424,8 +11519,8 @@ static void ggml_compute_forward_elu(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -11468,8 +11563,8 @@ static void ggml_compute_forward_relu(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -11512,8 +11607,8 @@ static void ggml_compute_forward_sigmoid(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -11571,8 +11666,8 @@ static void ggml_compute_forward_gelu(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -11630,8 +11725,8 @@ static void ggml_compute_forward_gelu_quick(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -11689,8 +11784,8 @@ static void ggml_compute_forward_silu(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
// ggml_compute_forward_leaky_relu
@@ -11738,8 +11833,8 @@ static void ggml_compute_forward_leaky_relu(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -11801,8 +11896,8 @@ static void ggml_compute_forward_silu_back(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -11843,8 +11938,8 @@ static void ggml_compute_forward_hardswish(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -11885,8 +11980,8 @@ static void ggml_compute_forward_hardsigmoid(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -11957,8 +12052,8 @@ static void ggml_compute_forward_norm(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -12025,8 +12120,8 @@ static void ggml_compute_forward_rms_norm(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -12198,8 +12293,8 @@ static void ggml_compute_forward_rms_norm_back(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -12220,10 +12315,11 @@ static void ggml_compute_forward_group_norm_f32(
GGML_TENSOR_UNARY_OP_LOCALS
- const float eps = 1e-6f; // TODO: make this a parameter
-
// TODO: optimize
+ float eps;
+ memcpy(&eps, dst->op_params + 1, sizeof(float));
+
int n_channels = src0->ne[2];
int n_groups = dst->op_params[0];
int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
@@ -12292,8 +12388,8 @@ static void ggml_compute_forward_group_norm(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -12544,6 +12640,8 @@ UseGgmlGemm1:;
IQK_MulMat_Not_Available2:;
#endif
+ ggml_barrier(params->shared);
+
#if GGML_USE_LLAMAFILE
if (src1->type != vec_dot_type) {
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -12802,6 +12900,34 @@ IQK_MulMat_Not_Available:;
continue;
}
+ if (((ggml_n_dims(src0) - 1) == 2) && gemv) {
+ int64_t src0_cur_start = (ith * ne01) / nth;
+ int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
+ src0_cur_start = (src0_cur_start % matmul_num_cols) ? src0_cur_start + matmul_num_cols - (src0_cur_start % matmul_num_cols): src0_cur_start;
+ src0_cur_end = (src0_cur_end % matmul_num_cols) ? src0_cur_end + matmul_num_cols - (src0_cur_end % matmul_num_cols): src0_cur_end;
+ if (src0_cur_start >= src0_cur_end) return;
+
+ for (int ir1 = 0; ir1 < nr1; ir1++) {
+ struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
+ const int id = row_mapping.i1; // selected expert index
+
+ const int64_t i11 = id % ne11;
+ const int64_t i12 = row_mapping.i2; // row index in src1
+
+ const int64_t i1 = id; // selected expert index
+ const int64_t i2 = i12; // row
+
+ const char * src1_col = (const char *) wdata +
+ (src1_cont || src1->type != vec_dot_type
+ ? (i11 + i12 * ne11) * row_size
+ : (i11 * nb11 + i12 * nb12));
+
+ gemv(ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
+ (const char *) src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start);
+ }
+ continue;
+ }
+
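The expert-routed gemv path added above splits src0's rows across threads and then snaps each thread's [start, end) range up to a multiple of matmul_num_cols, so every gemv call covers whole interleaved column blocks; a small sketch of that rounding (example values are illustrative only):

    #include <stdint.h>

    // round x up to the next multiple of m (m corresponds to matmul_num_cols above)
    static int64_t round_up_to_multiple(int64_t x, int64_t m) {
        return (x % m) ? x + m - (x % m) : x;
    }
    // e.g. m = 8: round_up_to_multiple(3, 8) == 8, round_up_to_multiple(8, 8) == 8;
    // a thread whose rounded start meets or passes its rounded end simply returns,
    // exactly as the early-return in the hunk does.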
// distribute the thread work across the inner or outer loop based on which one is larger
const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
@@ -13119,17 +13245,17 @@ static void ggml_compute_forward_out_prod(
} break;
case GGML_TYPE_F16:
{
- GGML_ASSERT(false); // todo
+ GGML_ABORT("fatal error"); // todo
// ggml_compute_forward_out_prod_f16_f32(params, dst);
- } break;
+ }
case GGML_TYPE_F32:
{
ggml_compute_forward_out_prod_f32(params, dst);
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -13188,8 +13314,8 @@ static void ggml_compute_forward_scale(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -13312,8 +13438,8 @@ static void ggml_compute_forward_set(
case GGML_TYPE_Q4_0_8_8:
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -13598,8 +13724,8 @@ static void ggml_compute_forward_get_rows(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
//static bool first = true;
@@ -13706,8 +13832,8 @@ static void ggml_compute_forward_get_rows_back(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
//static bool first = true;
@@ -13784,8 +13910,8 @@ static void ggml_compute_forward_diag(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -13854,8 +13980,8 @@ static void ggml_compute_forward_diag_mask_inf(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -13872,8 +13998,8 @@ static void ggml_compute_forward_diag_mask_zero(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -13990,8 +14116,8 @@ static void ggml_compute_forward_soft_max(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -14086,8 +14212,8 @@ static void ggml_compute_forward_soft_max_back(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -14186,8 +14312,8 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_F64:
case GGML_TYPE_COUNT:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -14516,8 +14642,8 @@ static void ggml_compute_forward_rope(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -14540,8 +14666,8 @@ static void ggml_compute_forward_rope_back(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -14740,8 +14866,8 @@ static void ggml_compute_forward_conv_transpose_1d(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -14912,8 +15038,8 @@ static void ggml_compute_forward_im2col(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -15024,7 +15150,7 @@ static void ggml_compute_forward_pool_1d_sk_p0(
const struct ggml_tensor * src = dst->src[0];
- assert(src->type == GGML_TYPE_F32);
+ assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16);
if (params->ith != 0) {
return;
@@ -15037,28 +15163,27 @@ static void ggml_compute_forward_pool_1d_sk_p0(
const int64_t rs = dst->ne[0];
while (cdata < data_end) {
- const float * const srow = (const float *)cdata;
-
+ const void * srow = (const void *)cdata;
int j = 0;
-
for (int64_t i = 0; i < rs; ++i) {
switch (op) {
case GGML_OP_POOL_AVG: drow[i] = 0; break;
case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break;
- case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
+ case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
}
for (int ki = 0; ki < k; ++ki) {
+ const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
switch (op) {
- case GGML_OP_POOL_AVG: drow[i] += srow[j]; break;
- case GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break;
- case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
+ case GGML_OP_POOL_AVG: drow[i] += srow_j; break;
+ case GGML_OP_POOL_MAX: if (srow_j > drow[i]) drow[i] = srow_j; break;
+ case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
}
++j;
}
switch (op) {
case GGML_OP_POOL_AVG: drow[i] /= k; break;
case GGML_OP_POOL_MAX: break;
- case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
+ case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
}
}
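The 1D pooling loop above (and the 2D one below) now accepts F16 as well as F32 input by converting each element to float before accumulating; the per-element load reduces to a small helper like this (a sketch of the expression used inline in the hunk, assuming ggml.h for ggml_fp16_t, GGML_FP16_TO_FP32 and enum ggml_type):

    // read element j of a source row that is either F32 or F16
    static inline float pool_read(const void * row, int64_t j, enum ggml_type type) {
        return type == GGML_TYPE_F32
            ? ((const float *) row)[j]
            : GGML_FP16_TO_FP32(((const ggml_fp16_t *) row)[j]);
    }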
@@ -15092,7 +15217,7 @@ static void ggml_compute_forward_pool_2d(
const struct ggml_tensor * src = dst->src[0];
- GGML_ASSERT(src->type == GGML_TYPE_F32);
+ assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16);
if (params->ith != 0) {
return;
@@ -15127,7 +15252,7 @@ static void ggml_compute_forward_pool_2d(
switch (op) {
case GGML_OP_POOL_AVG: *out = 0; break;
case GGML_OP_POOL_MAX: *out = -FLT_MAX; break;
- case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
+ case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
}
const int ix = offset0 + ox * s0;
@@ -15135,21 +15260,22 @@ static void ggml_compute_forward_pool_2d(
for (int ky = 0; ky < k1; ++ky) {
if (iy + ky < 0 || iy + ky >= src->ne[1]) continue;
- const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky));
+ const void * srow = (const void *)(cdata + src->nb[1] * (iy + ky));
for (int kx = 0; kx < k0; ++kx) {
int j = ix + kx;
if (j < 0 || j >= src->ne[0]) continue;
+ const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
switch (op) {
- case GGML_OP_POOL_AVG: *out += srow[j]; break;
- case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break;
- case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
+ case GGML_OP_POOL_AVG: *out += srow_j; break;
+ case GGML_OP_POOL_MAX: if (srow_j > *out) *out = srow_j; break;
+ case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
}
}
}
switch (op) {
case GGML_OP_POOL_AVG: *out /= ka; break;
case GGML_OP_POOL_MAX: break;
- case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
+ case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
}
}
}
@@ -15213,8 +15339,8 @@ static void ggml_compute_forward_upscale(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -15271,8 +15397,8 @@ static void ggml_compute_forward_pad(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -15312,8 +15438,8 @@ static void ggml_compute_forward_arange(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -15363,8 +15489,8 @@ static void ggml_compute_forward_timestep_embedding(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -15422,8 +15548,8 @@ static void ggml_compute_forward_argsort(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -15645,8 +15771,8 @@ static void ggml_compute_forward_flash_attn_ext(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -15981,8 +16107,8 @@ static void ggml_compute_forward_flash_attn_back(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -16103,8 +16229,8 @@ static void ggml_compute_forward_ssm_conv(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -16224,8 +16350,8 @@ static void ggml_compute_forward_ssm_scan(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -16287,8 +16413,8 @@ static void ggml_compute_forward_win_part(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -16348,8 +16474,8 @@ static void ggml_compute_forward_win_unpart(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -16416,8 +16542,8 @@ static void ggml_compute_forward_unary(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -16463,8 +16589,8 @@ static void ggml_compute_forward_get_rel_pos(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -16544,8 +16670,8 @@ static void ggml_compute_forward_add_rel_pos(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -16590,8 +16716,8 @@ static void ggml_compute_forward_map_unary(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -16639,8 +16765,8 @@ static void ggml_compute_forward_map_binary(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -16838,8 +16964,8 @@ static void ggml_compute_forward_cross_entropy_loss(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -16925,8 +17051,8 @@ static void ggml_compute_forward_cross_entropy_loss_back(
} break;
default:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
@@ -17261,14 +17387,32 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
} break;
case GGML_OP_COUNT:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
}
////////////////////////////////////////////////////////////////////////////////
-static size_t ggml_hash_size(size_t min_sz) {
+struct ggml_hash_set ggml_hash_set_new(size_t size) {
+ size = ggml_hash_size(size);
+ struct ggml_hash_set result;
+ result.size = size;
+ result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size);
+ result.used = GGML_CALLOC(ggml_bitset_size(size), sizeof(ggml_bitset_t));
+ return result;
+}
+
+void ggml_hash_set_reset(struct ggml_hash_set * hash_set) {
+ memset(hash_set->used, 0, sizeof(ggml_bitset_t) * ggml_bitset_size(hash_set->size));
+}
+
+void ggml_hash_set_free(struct ggml_hash_set * hash_set) {
+ GGML_FREE(hash_set->used);
+ GGML_FREE(hash_set->keys);
+}
+
+size_t ggml_hash_size(size_t min_sz) {
// next primes after powers of two
static const size_t primes[] = {
2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
@@ -17279,7 +17423,7 @@ static size_t ggml_hash_size(size_t min_sz) {
};
static const size_t n_primes = sizeof(primes)/sizeof(primes[0]);
- // find the smallest prime that is larger or equal to min_sz
+ // find the smallest prime that is larger than or equal to min_sz
size_t l = 0;
size_t r = n_primes;
while (l < r) {
@@ -17294,67 +17438,6 @@ static size_t ggml_hash_size(size_t min_sz) {
return sz;
}
-static size_t ggml_hash(const void * p) {
- return (size_t)p;
-}
-
-size_t ggml_hash_find(const struct ggml_hash_set hash_set, struct ggml_tensor * key) {
- size_t h = ggml_hash(key) % hash_set.size;
-
- // linear probing
- size_t i = h;
- while (hash_set.keys[i] != NULL && hash_set.keys[i] != key) {
- i = (i + 1) % hash_set.size;
- if (i == h) {
- // visited all hash table entries -> not found
- return GGML_HASHTABLE_FULL;
- }
- }
- return i;
-}
-
-bool ggml_hash_contains(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
- size_t i = ggml_hash_find(hash_set, key);
- return i != GGML_HASHTABLE_FULL && hash_set.keys[i] == key;
-}
-
-size_t ggml_hash_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
- size_t i = ggml_hash_find(hash_set, key);
-
- GGML_ASSERT(i != GGML_HASHTABLE_FULL);
-
- if (hash_set.keys[i] == key) {
- return GGML_HASHTABLE_ALREADY_EXISTS;
- }
-
- // insert
- GGML_ASSERT(hash_set.keys[i] == NULL);
- hash_set.keys[i] = key;
- return i;
-}
-
-size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
- size_t i = ggml_hash_find(hash_set, key);
-
- GGML_ASSERT(i != GGML_HASHTABLE_FULL);
-
- hash_set.keys[i] = key;
- return i;
-}
-
-struct ggml_hash_set ggml_hash_set_new(size_t size) {
- size = ggml_hash_size(size);
- struct ggml_hash_set result;
- result.size = size;
- result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size);
- memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
- return result;
-}
-
-static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
- GGML_FREE(hash_set.keys);
-}
-
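The probing helpers removed above now live in ggml-impl.h together with the new bitset-backed ggml_hash_set (size, used, keys). Purely as an illustration of how a lookup works against that layout, assuming the ggml_bitset_get accessor that pairs with ggml_bitset_size:

    // linear probing over `keys`, with slot occupancy tracked in the `used` bitset
    // (illustrative sketch, not the ggml-impl.h implementation)
    static size_t hash_find_sketch(const struct ggml_hash_set * hs, struct ggml_tensor * key) {
        size_t h = (size_t) key % hs->size;                 // pointer value as the hash
        for (size_t i = h; ; i = (i + 1) % hs->size) {
            if (!ggml_bitset_get(hs->used, i) || hs->keys[i] == key) {
                return i;                                   // free slot or existing key
            }
            if ((i + 1) % hs->size == h) {
                return GGML_HASHSET_FULL;                   // visited every slot
            }
        }
    }

With occupancy in the bitset, clearing the set is just a memset of the bitset (ggml_hash_set_reset above) rather than zeroing the whole keys array.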
struct hash_map {
struct ggml_hash_set set;
struct ggml_tensor ** vals;
@@ -17363,13 +17446,12 @@ struct hash_map {
static struct hash_map * ggml_new_hash_map(size_t size) {
struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map));
result->set = ggml_hash_set_new(size);
- result->vals = GGML_MALLOC(sizeof(struct ggml_tensor *) * result->set.size);
- memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
+ result->vals = GGML_CALLOC(result->set.size, sizeof(struct ggml_tensor *));
return result;
}
static void ggml_hash_map_free(struct hash_map * map) {
- ggml_hash_set_free(map->set);
+ ggml_hash_set_free(&map->set);
GGML_FREE(map->vals);
GGML_FREE(map);
}
@@ -17390,7 +17472,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
return node;
}
- if (!ggml_hash_contains(graph->visited_hash_table, node)) {
+ if (!ggml_hash_contains(&graph->visited_hash_set, node)) {
return node;
}
@@ -17405,8 +17487,8 @@ static struct ggml_tensor * ggml_recompute_graph_node(
return node;
}
- size_t i = ggml_hash_find(replacements->set, node);
- GGML_ASSERT(i != GGML_HASHTABLE_FULL); // assert that not full
+ size_t i = ggml_hash_find(&replacements->set, node);
+ GGML_ASSERT(i != GGML_HASHSET_FULL); // assert that not full
if (replacements->set.keys[i] == node) {
return replacements->vals[i];
}
@@ -17464,8 +17546,8 @@ void ggml_build_backward_gradient_checkpointing(
// insert checkpoints in replacements
for (int i = 0; i < n_checkpoints; ++i) {
- size_t k = ggml_hash_find(replacements->set, checkpoints[i]);
- GGML_ASSERT(k != GGML_HASHTABLE_FULL); // assert that not full
+ size_t k = ggml_hash_find(&replacements->set, checkpoints[i]);
+ GGML_ASSERT(k != GGML_HASHSET_FULL); // assert that not full
GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite
replacements->set.keys[k] = checkpoints[i];
replacements->vals[k] = checkpoints[i];
@@ -17493,7 +17575,7 @@ void ggml_build_backward_gradient_checkpointing(
// functions to change gradients considering the case that input a might be initial gradient with zero value
-static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
+static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) {
if (ggml_hash_contains(zero_table, a)) {
return b;
} else {
@@ -17501,7 +17583,7 @@ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct gg
}
}
-static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
+static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set * zero_table) {
if (ggml_hash_contains(zero_table, a)) {
struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f);
return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
@@ -17510,7 +17592,7 @@ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct gg
}
}
-static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
+static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) {
if (ggml_hash_contains(zero_table, a)) {
return ggml_repeat(ctx, b, a);
} else {
@@ -17518,7 +17600,7 @@ static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct g
}
}
-static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
+static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) {
if (ggml_hash_contains(zero_table, a)) {
return ggml_neg(ctx, b);
} else {
@@ -17526,7 +17608,7 @@ static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct gg
}
}
-static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
+static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set * zero_table) {
struct ggml_tensor * src0 = tensor->src[0];
struct ggml_tensor * src1 = tensor->src[1];
struct ggml_tensor * src2 = tensor->src[2];
@@ -17695,8 +17777,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
case GGML_OP_MEAN:
case GGML_OP_ARGMAX:
{
- GGML_ASSERT(false); // TODO: implement
- } break;
+ GGML_ABORT("fatal error"); // TODO: implement
+ }
case GGML_OP_REPEAT:
{
// necessary for llama
@@ -17719,16 +17801,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
} break;
case GGML_OP_CONCAT:
{
- GGML_ASSERT(false); // TODO: implement
- } break;
+ GGML_ABORT("fatal error"); // TODO: implement
+ }
case GGML_OP_SILU_BACK:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_NORM:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_RMS_NORM:
{
// necessary for llama
@@ -17744,12 +17826,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
} break;
case GGML_OP_RMS_NORM_BACK:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_GROUP_NORM:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_MUL_MAT:
{
// https://cs231n.github.io/optimization-2/#staged
@@ -17810,12 +17892,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
} break;
case GGML_OP_MUL_MAT_ID:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_OUT_PROD:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_SCALE:
{
// necessary for llama
@@ -17991,12 +18073,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
} break;
case GGML_OP_GET_ROWS_BACK:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_DIAG:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_DIAG_MASK_INF:
{
// necessary for llama
@@ -18034,8 +18116,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
} break;
case GGML_OP_SOFT_MAX_BACK:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_ROPE:
{
// necessary for llama
@@ -18110,52 +18192,52 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
} break;
case GGML_OP_CLAMP:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_CONV_TRANSPOSE_1D:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_IM2COL:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_CONV_TRANSPOSE_2D:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_POOL_1D:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_POOL_2D:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_UPSCALE:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_PAD:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_ARANGE:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_TIMESTEP_EMBEDDING:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_ARGSORT:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_LEAKY_RELU:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_FLASH_ATTN_EXT:
{
struct ggml_tensor * flash_grad = NULL;
@@ -18211,13 +18293,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
} break;
case GGML_OP_FLASH_ATTN_BACK:
{
- GGML_ASSERT(false); // not supported
- } break;
+ GGML_ABORT("fatal error"); // not supported
+ }
case GGML_OP_SSM_CONV:
case GGML_OP_SSM_SCAN:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_WIN_PART:
case GGML_OP_WIN_UNPART:
case GGML_OP_UNARY:
@@ -18255,12 +18337,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
} break;
case GGML_UNARY_OP_TANH:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_UNARY_OP_ELU:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_UNARY_OP_RELU:
{
if (src0->grad) {
@@ -18274,16 +18356,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
} break;
case GGML_UNARY_OP_SIGMOID:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_UNARY_OP_GELU:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_UNARY_OP_GELU_QUICK:
{
- GGML_ASSERT(false); // TODO: not implemented
- } break;
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_UNARY_OP_SILU:
{
// necessary for llama
@@ -18295,7 +18377,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
}
} break;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
} break;
case GGML_OP_GET_REL_POS:
@@ -18309,8 +18391,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
case GGML_OP_MAP_CUSTOM2:
case GGML_OP_MAP_CUSTOM3:
{
- GGML_ASSERT(false); // not supported
- } break;
+ GGML_ABORT("fatal error"); // not supported
+ }
case GGML_OP_CROSS_ENTROPY_LOSS:
{
if (src0->grad) {
@@ -18325,16 +18407,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
} break;
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
{
- GGML_ASSERT(false); // not supported
- } break;
+ GGML_ABORT("fatal error"); // not supported
+ }
case GGML_OP_NONE:
{
// nop
} break;
case GGML_OP_COUNT:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
for (int i = 0; i < GGML_MAX_SRC; ++i) {
@@ -18354,7 +18436,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
}
// check if already visited
- if (ggml_hash_insert(cgraph->visited_hash_table, node) == GGML_HASHTABLE_ALREADY_EXISTS) {
+ if (ggml_hash_insert(&cgraph->visited_hash_set, node) == GGML_HASHSET_ALREADY_EXISTS) {
return;
}
@@ -18400,7 +18482,6 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten
}
const int n0 = cgraph->n_nodes;
- UNUSED(n0);
ggml_visit_parents(cgraph, tensor);
@@ -18436,7 +18517,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size);
for (int i = 0; i < gf->n_nodes; i++) {
if (gf->grads[i]) {
- ggml_hash_insert(zero_table, gf->grads[i]);
+ ggml_hash_insert(&zero_table, gf->grads[i]);
}
}
@@ -18446,7 +18527,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
// inplace operations to add gradients are not created by ggml_compute_backward
// use allocator to automatically make inplace operations
if (node->grad) {
- ggml_compute_backward(ctx, node, zero_table);
+ ggml_compute_backward(ctx, node, &zero_table);
}
}
@@ -18459,16 +18540,29 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
}
}
- ggml_hash_set_free(zero_table);
+ ggml_hash_set_free(&zero_table);
+}
+
+static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {
+ void * ptr = *p;
+ ptr = (void *) GGML_PAD((uintptr_t) ptr, align);
+ *p = (void *) ((char *) ptr + size);
+ return ptr;
}
static size_t ggml_graph_nbytes(size_t size, bool grads) {
- size_t nbytes = sizeof(struct ggml_cgraph);
- nbytes += size * sizeof(struct ggml_tensor *) * 2; // leafs + nodes
+ size_t hash_size = ggml_hash_size(size * 2);
+ void * p = 0;
+ incr_ptr_aligned(&p, sizeof(struct ggml_cgraph), 1);
+ incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // nodes
+ incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // leafs
+ incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // hash keys
if (grads) {
- nbytes += size * sizeof(struct ggml_tensor *); // grads
+ incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grads
}
- nbytes += ggml_hash_size(size * 2) * sizeof(struct ggml_tensor *); // hash set
+ incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t));
+
+ size_t nbytes = (size_t) p;
return nbytes;
}
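ggml_graph_nbytes above measures the graph object layout by running the same aligned-bump helper over a null base pointer: after all incr_ptr_aligned calls, the pointer value itself is the total padded size. GGML_PAD is assumed to be the usual power-of-two round-up from ggml.h:

    // assumed definition: round x up to a multiple of n, where n is a power of two
    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    // e.g. GGML_PAD(13, 8) == 16, so two 13-byte regions aligned to 8 bytes
    // occupy 16 + 13 = 29 bytes when laid out with incr_ptr_aligned starting at 0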
@@ -18485,19 +18579,19 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_GRAPH, obj_size);
struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
- struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1);
-
+ // the size of the hash table is doubled since it needs to hold both nodes and leafs
size_t hash_size = ggml_hash_size(size * 2);
- struct ggml_tensor ** nodes_ptr = data_start;
- struct ggml_tensor ** leafs_ptr = nodes_ptr + size;
- struct ggml_tensor ** hash_keys_ptr = leafs_ptr + size;
- struct ggml_tensor ** grads_ptr = grads ? hash_keys_ptr + hash_size : NULL;
- // check that we allocated the correct amount of memory
- assert(obj_size == (size_t) (
- (grads ? (char *)(grads_ptr + size) : (char *)(hash_keys_ptr + hash_size)) - (char *)cgraph));
+ void * p = cgraph + 1;
- memset(hash_keys_ptr, 0, hash_size * sizeof(struct ggml_tensor *));
+ struct ggml_tensor ** nodes_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
+ struct ggml_tensor ** leafs_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
+ struct ggml_tensor ** hash_keys_ptr = incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
+ struct ggml_tensor ** grads_ptr = grads ? incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
+ ggml_bitset_t * hash_used = incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t));
+
+ // check that we allocated the correct amount of memory
+ assert(obj_size == (size_t)((char *)p - (char *)cgraph));
*cgraph = (struct ggml_cgraph) {
/*.size =*/ size,
@@ -18506,10 +18600,12 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
/*.nodes =*/ nodes_ptr,
/*.grads =*/ grads_ptr,
/*.leafs =*/ leafs_ptr,
- /*.hash_table =*/ { hash_size, hash_keys_ptr },
+ /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
/*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
};
+ ggml_hash_set_reset(&cgraph->visited_hash_set);
+
return cgraph;
}
@@ -18525,7 +18621,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
/*.nodes =*/ cgraph0->nodes + i0,
/*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
/*.leafs =*/ NULL,
- /*.hash_table =*/ { 0, NULL },
+ /*.hash_table =*/ { 0, NULL, NULL },
/*.order =*/ cgraph0->order,
};
@@ -18535,7 +18631,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
GGML_ASSERT(dst->size >= src->n_leafs);
GGML_ASSERT(dst->size >= src->n_nodes);
- GGML_ASSERT(dst->visited_hash_table.size >= src->visited_hash_table.size);
+ GGML_ASSERT(dst->visited_hash_set.size >= src->visited_hash_set.size);
dst->n_leafs = src->n_leafs;
dst->n_nodes = src->n_nodes;
@@ -18556,9 +18652,9 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
}
}
- for (size_t i = 0; i < src->visited_hash_table.size; ++i) {
- if (src->visited_hash_table.keys[i]) {
- ggml_hash_insert(dst->visited_hash_table, src->visited_hash_table.keys[i]);
+ for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
+ if (src->visited_hash_set.keys[i]) {
+ ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
}
}
}
@@ -18584,7 +18680,7 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
void ggml_graph_clear(struct ggml_cgraph * cgraph) {
cgraph->n_leafs = 0;
cgraph->n_nodes = 0;
- memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *));
+ ggml_hash_set_reset(&cgraph->visited_hash_set);
}
//
@@ -18779,7 +18875,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
n_tasks = MIN(ggml_nrows(node), n_threads);
} break;
default:
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
break;
case GGML_OP_SILU_BACK:
@@ -18906,8 +19002,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_COUNT:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
default:
{
fprintf(stderr, "%s: op not implemented: ", __func__);
@@ -18916,8 +19012,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} else {
fprintf(stderr, "%d\n", node->op);
}
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
}
assert(n_tasks > 0);
@@ -19027,7 +19123,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
cur += sizeof(float)*ne00*ne01*ne02;
cur += sizeof(float)*ne10*ne11;
} else {
- GGML_ASSERT(false);
+ GGML_ABORT("fatal error");
}
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
@@ -19073,8 +19169,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
} break;
case GGML_OP_COUNT:
{
- GGML_ASSERT(false);
- } break;
+ GGML_ABORT("fatal error");
+ }
default:
break;
}
@@ -20308,9 +20404,9 @@ static enum ggml_opt_result linesearch_backtracking(
(*step) *= width;
}
- GGML_ASSERT(false && "line search failed");
+ GGML_ABORT("line search failed");
- return GGML_LINESEARCH_FAIL;
+ //return GGML_LINESEARCH_FAIL;
}
static enum ggml_opt_result ggml_opt_lbfgs(
@@ -20578,9 +20674,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
step[0] = 1.0;
}
- GGML_ASSERT(false && "lbfgs failed");
+ GGML_ABORT("lbfgs failed");
- return GGML_OPT_RESULT_DID_NOT_CONVERGE;
+ //return GGML_OPT_RESULT_DID_NOT_CONVERGE;
}
struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
@@ -20925,7 +21021,7 @@ size_t ggml_quantize_chunk(
case GGML_TYPE_BF16:
{
size_t elemsize = sizeof(ggml_bf16_t);
- ggml_fp32_to_bf16_row(src + start, (ggml_bf16_t *)dst + start, n);
+ ggml_fp32_to_bf16_row_ref(src + start, (ggml_bf16_t *)dst + start, n);
result = n * elemsize;
} break;
case GGML_TYPE_F32:
@@ -21283,10 +21379,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
}
} break;
case GGUF_TYPE_ARRAY:
- default: GGML_ASSERT(false && "invalid type"); break;
+ default: GGML_ABORT("invalid type");
}
} break;
- default: GGML_ASSERT(false && "invalid type");
+ default: GGML_ABORT("invalid type");
}
if (!ok) {
@@ -21453,7 +21549,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
ctx->infos[i].ne[3],
};
- struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
+ int n_dims = ctx->infos[i].n_dims;
+ if (n_dims == 0 || n_dims > 4) {
+ n_dims = 4;
+ for (; n_dims > 1; --n_dims) if (ne[n_dims-1] > 1) break;
+ }
+ struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, n_dims, ne);
ok = ok && cur != NULL;
@@ -21867,12 +21968,12 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
GGML_FREE((void *)data);
} else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
- GGML_ASSERT(false && "nested arrays not supported");
+ GGML_ABORT("nested arrays not supported");
} else {
gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
}
} break;
- default: GGML_ASSERT(false && "invalid type"); break;
+ default: GGML_ABORT("invalid type");
}
}
}
@@ -21881,7 +21982,7 @@ void gguf_add_tensor(
struct gguf_context * ctx,
const struct ggml_tensor * tensor) {
if (gguf_find_tensor(ctx, tensor->name) != -1) {
- GGML_ASSERT(false && "duplicated tensor name");
+ GGML_ABORT("duplicated tensor name");
}
const int idx = ctx->header.n_tensors;
@@ -21914,7 +22015,7 @@ void gguf_add_tensor(
void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
const int idx = gguf_find_tensor(ctx, name);
if (idx < 0) {
- GGML_ASSERT(false && "tensor not found");
+ GGML_ABORT("tensor not found");
}
ctx->infos[idx].type = type;
@@ -21923,7 +22024,7 @@ void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggm
void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
const int idx = gguf_find_tensor(ctx, name);
if (idx < 0) {
- GGML_ASSERT(false && "tensor not found");
+ GGML_ABORT("tensor not found");
}
ctx->infos[idx].data = data;
@@ -22052,10 +22153,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
}
} break;
case GGUF_TYPE_ARRAY:
- default: GGML_ASSERT(false && "invalid type"); break;
+ default: GGML_ABORT("invalid type");
}
} break;
- default: GGML_ASSERT(false && "invalid type");
+ default: GGML_ABORT("invalid type");
}
}
@@ -22116,7 +22217,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
FILE * file = ggml_fopen(fname, "wb");
if (!file) {
- GGML_ASSERT(false && "failed to open file for writing");
+ GGML_ABORT("failed to open file for writing");
}
struct gguf_buf buf = gguf_buf_init(16*1024);
diff --git a/ggml/src/vulkan-shaders/CMakeLists.txt b/ggml/src/vulkan-shaders/CMakeLists.txt
index 41551e00..10075db3 100644
--- a/ggml/src/vulkan-shaders/CMakeLists.txt
+++ b/ggml/src/vulkan-shaders/CMakeLists.txt
@@ -1,5 +1,7 @@
+find_package (Threads REQUIRED)
set(TARGET vulkan-shaders-gen)
add_executable(${TARGET} vulkan-shaders-gen.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)
diff --git a/ggml/src/vulkan-shaders/add.comp b/ggml/src/vulkan-shaders/add.comp
index 8475b011..3974845d 100644
--- a/ggml/src/vulkan-shaders/add.comp
+++ b/ggml/src/vulkan-shaders/add.comp
@@ -4,9 +4,11 @@
#include "generic_binary_head.comp"
void main() {
- if (gl_GlobalInvocationID.x >= p.ne) {
+ const uint idx = get_idx();
+
+ if (idx >= p.ne) {
return;
}
- data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) + FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
+ data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[src1_idx(idx)]));
}
diff --git a/ggml/src/vulkan-shaders/clamp.comp b/ggml/src/vulkan-shaders/clamp.comp
index ca272e22..7071302a 100644
--- a/ggml/src/vulkan-shaders/clamp.comp
+++ b/ggml/src/vulkan-shaders/clamp.comp
@@ -4,10 +4,12 @@
#include "generic_unary_head.comp"
void main() {
- if (gl_GlobalInvocationID.x >= p.ne) {
+ const uint idx = get_idx();
+
+ if (idx >= p.ne) {
return;
}
- const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
- data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
+ const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
+ data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
}
diff --git a/ggml/src/vulkan-shaders/concat.comp b/ggml/src/vulkan-shaders/concat.comp
new file mode 100644
index 00000000..08ab5514
--- /dev/null
+++ b/ggml/src/vulkan-shaders/concat.comp
@@ -0,0 +1,35 @@
+#version 450
+
+#include "types.comp"
+#include "generic_binary_head.comp"
+
+void main() {
+ const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+ const int dim = p.param3;
+
+ if (idx >= p.ne) {
+ return;
+ }
+
+ const uint i3 = idx / (p.ne22*p.ne21*p.ne20);
+ const uint i3_offset = i3 * p.ne22*p.ne21*p.ne20;
+ const uint i2 = (idx - i3_offset) / (p.ne21*p.ne20);
+ const uint i2_offset = i2*p.ne21*p.ne20;
+ const uint i1 = (idx - i3_offset - i2_offset) / p.ne20;
+ const uint i0 = idx - i3_offset - i2_offset - i1*p.ne20;
+
+ uint o[4] = {0, 0, 0, 0};
+ o[dim] = dim == 0 ? p.ne00 : (dim == 1 ? p.ne01 : (dim == 2 ? p.ne02 : p.ne03));
+
+ const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
+ const uint src1_idx = (i3 - o[3])*p.nb13 + (i2 - o[2])*p.nb12 + (i1 - o[1])*p.nb11 + (i0 - o[0])*p.nb10;
+ const uint dst_idx = i3*p.nb23 + i2*p.nb22 + i1*p.nb21 + i0*p.nb20;
+
+ const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
+
+#ifndef OPTIMIZATION_ERROR_WORKAROUND
+ data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]);
+#else
+ data_d[p.d_offset + dst_idx] = is_src0 ? data_a[src0_idx] : data_b[src1_idx];
+#endif
+}
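The concat shader decomposes the flat destination index into 4D coordinates, then reads from src0 when the coordinate lies inside src0's extent on every axis and otherwise from src1 shifted back along the concat dimension; the index math in C terms (a sketch mirroring the shader above):

    // split a flat index into (i0, i1, i2, i3) for a tensor with extents ne[0..3]
    static void unflatten4(unsigned idx, const unsigned ne[4], unsigned i[4]) {
        i[3] =  idx / (ne[2] * ne[1] * ne[0]);
        i[2] = (idx / (ne[1] * ne[0])) % ne[2];
        i[1] = (idx /  ne[0])          % ne[1];
        i[0] =  idx                    % ne[0];
    }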
diff --git a/ggml/src/vulkan-shaders/copy.comp b/ggml/src/vulkan-shaders/copy.comp
index efb55876..c26917c0 100644
--- a/ggml/src/vulkan-shaders/copy.comp
+++ b/ggml/src/vulkan-shaders/copy.comp
@@ -4,13 +4,15 @@
#include "generic_unary_head.comp"
void main() {
- if (gl_GlobalInvocationID.x >= p.ne) {
+ const uint idx = get_idx();
+
+ if (idx >= p.ne) {
return;
}
#ifndef OPTIMIZATION_ERROR_WORKAROUND
- data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
+ data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]);
#else
- data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = data_a[src0_idx(gl_GlobalInvocationID.x)];
+ data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)];
#endif
}
diff --git a/ggml/src/vulkan-shaders/div.comp b/ggml/src/vulkan-shaders/div.comp
index 8ee4bfc7..8cfce58b 100644
--- a/ggml/src/vulkan-shaders/div.comp
+++ b/ggml/src/vulkan-shaders/div.comp
@@ -4,9 +4,11 @@
#include "generic_binary_head.comp"
void main() {
- if (gl_GlobalInvocationID.x >= p.ne) {
+ const uint idx = get_idx();
+
+ if (idx >= p.ne) {
return;
}
- data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) / FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
+ data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) / FLOAT_TYPE(data_b[src1_idx(idx)]));
}
diff --git a/ggml/src/vulkan-shaders/gelu.comp b/ggml/src/vulkan-shaders/gelu.comp
index 9fe807cc..4cc7a68c 100644
--- a/ggml/src/vulkan-shaders/gelu.comp
+++ b/ggml/src/vulkan-shaders/gelu.comp
@@ -13,7 +13,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const float GELU_COEF_A = 0.044715f;
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
- const uint i = gl_GlobalInvocationID.x;
+ const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (i >= p.KX) {
return;
diff --git a/ggml/src/vulkan-shaders/gelu_quick.comp b/ggml/src/vulkan-shaders/gelu_quick.comp
new file mode 100644
index 00000000..e6e6fcfd
--- /dev/null
+++ b/ggml/src/vulkan-shaders/gelu_quick.comp
@@ -0,0 +1,23 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+ const float GELU_QUICK_COEF = -1.702f;
+ const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+ if (i >= p.KX) {
+ return;
+ }
+
+ const float x = float(data_a[i]);
+ data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x))));
+}
diff --git a/ggml/src/vulkan-shaders/generic_binary_head.comp b/ggml/src/vulkan-shaders/generic_binary_head.comp
index ab45d256..b6beaff1 100644
--- a/ggml/src/vulkan-shaders/generic_binary_head.comp
+++ b/ggml/src/vulkan-shaders/generic_binary_head.comp
@@ -7,7 +7,7 @@ layout (push_constant) uniform parameter
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
uint d_offset;
- float param1; float param2;
+ float param1; float param2; int param3;
} p;
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
@@ -16,6 +16,10 @@ layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
+uint get_idx() {
+ return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+}
+
uint src0_idx(uint idx) {
const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
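The new get_idx() above, and the matching z*262144 + y*512 + x pattern in the other shaders in this patch, flattens a 3D dispatch back into one element index: with 512 threads per workgroup in x, the host side (ggml-vulkan.cpp, whose changes are not shown in this excerpt) is assumed to clamp x to a single 512-wide workgroup, use up to 512 workgroups in y, and put the remainder in z. A C sketch of both directions under that assumption:

    // flatten: what the shaders compute (x < 512, y < 512)
    static unsigned flat_idx(unsigned x, unsigned y, unsigned z) {
        return z * 262144u + y * 512u + x;   // 262144 == 512 * 512
    }

    // split: what the dispatcher is assumed to do for ne total elements
    static void split_elements(unsigned ne, unsigned dims[3]) {
        if (ne > 262144u)   { dims[0] = 512; dims[1] = 512; dims[2] = (ne + 262143u) / 262144u; }
        else if (ne > 512u) { dims[0] = 512; dims[1] = (ne + 511u) / 512u; dims[2] = 1; }
        else                { dims[0] = ne;  dims[1] = 1;   dims[2] = 1; }
    }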
diff --git a/ggml/src/vulkan-shaders/generic_unary_head.comp b/ggml/src/vulkan-shaders/generic_unary_head.comp
index de08de7c..eacdefc7 100644
--- a/ggml/src/vulkan-shaders/generic_unary_head.comp
+++ b/ggml/src/vulkan-shaders/generic_unary_head.comp
@@ -14,6 +14,10 @@ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+uint get_idx() {
+ return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+}
+
uint src0_idx(uint idx) {
const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
diff --git a/ggml/src/vulkan-shaders/group_norm.comp b/ggml/src/vulkan-shaders/group_norm.comp
new file mode 100644
index 00000000..5ad9b28d
--- /dev/null
+++ b/ggml/src/vulkan-shaders/group_norm.comp
@@ -0,0 +1,66 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+#define BLOCK_SIZE 512
+
+layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+shared float tmp[BLOCK_SIZE];
+
+void main() {
+ const uint group_size = p.KX;
+ const float eps = p.param1;
+
+ const uint tid = gl_LocalInvocationID.x;
+ const uint start = gl_WorkGroupID.x * group_size + tid;
+ const uint end = start + group_size;
+
+ tmp[tid] = 0.0f;
+
+ // Calculate mean
+ [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
+ tmp[tid] += float(data_a[col]);
+ }
+
+ // sum up partial sums and write back result
+ barrier();
+ [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+ if (tid < s) {
+ tmp[tid] += tmp[tid + s];
+ }
+ barrier();
+ }
+
+ const float mean = tmp[0] / group_size;
+ barrier();
+ tmp[tid] = 0.0f;
+
+ // Calculate variance
+ [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
+ const float xi = float(data_a[col]) - mean;
+ data_d[col] = D_TYPE(xi);
+ tmp[tid] += xi * xi;
+ }
+
+ // sum up partial sums and write back result
+ barrier();
+ [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+ if (tid < s) {
+ tmp[tid] += tmp[tid + s];
+ }
+ barrier();
+ }
+
+ const float variance = tmp[0] / group_size;
+ const float scale = inversesqrt(variance + eps);
+
+ [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
+ data_d[col] *= D_TYPE(scale);
+ }
+}
diff --git a/ggml/src/vulkan-shaders/im2col.comp b/ggml/src/vulkan-shaders/im2col.comp
new file mode 100644
index 00000000..4d48610a
--- /dev/null
+++ b/ggml/src/vulkan-shaders/im2col.comp
@@ -0,0 +1,57 @@
+#version 450
+
+#extension GL_EXT_shader_16bit_storage : require
+
+layout (push_constant) uniform parameter
+{
+ uint batch_offset; uint offset_delta;
+ uint IC;
+ uint IW; uint IH;
+ uint OW; uint OH;
+ uint KW; uint KH;
+ uint pelements;
+ uint CHW;
+ int s0; int s1;
+ int p0; int p1;
+ int d0; int d1;
+} p;
+
+#include "types.comp"
+
+#define BLOCK_SIZE 256
+
+layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+ const uint i = gl_GlobalInvocationID.x;
+ if (i >= p.pelements) {
+ return;
+ }
+
+ const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
+ const uint kx = i / ksize;
+ const uint kd = kx * ksize;
+ const uint ky = (i - kd) / p.OW;
+ const uint ix = i % p.OW;
+
+ const uint oh = gl_GlobalInvocationID.y;
+ const uint batch = gl_GlobalInvocationID.z / p.IC;
+ const uint ic = gl_GlobalInvocationID.z % p.IC;
+
+ const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
+ const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
+
+ const uint offset_dst =
+ ((batch * p.OH + oh) * p.OW + ix) * p.CHW +
+ (ic * (p.KW * p.KH) + ky * p.KW + kx);
+
+ if (iih < 0 || iih >= p.IH || iiw < 0 || iiw >= p.IW) {
+ data_d[offset_dst] = D_TYPE(0.0f);
+ } else {
+ const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
+ data_d[offset_dst] = D_TYPE(data_a[offset_src + iih * p.IW + iiw]);
+ }
+}
diff --git a/ggml/src/vulkan-shaders/leaky_relu.comp b/ggml/src/vulkan-shaders/leaky_relu.comp
new file mode 100644
index 00000000..d90a99ae
--- /dev/null
+++ b/ggml/src/vulkan-shaders/leaky_relu.comp
@@ -0,0 +1,22 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+ const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+ if (i >= p.KX) {
+ return;
+ }
+
+ const float val = float(data_a[i]);
+ data_d[i] = D_TYPE(max(val, 0.0f) + min(val, 0.0f) * p.param1);
+}
diff --git a/ggml/src/vulkan-shaders/mul.comp b/ggml/src/vulkan-shaders/mul.comp
index bbb0aa1d..bfb61c92 100644
--- a/ggml/src/vulkan-shaders/mul.comp
+++ b/ggml/src/vulkan-shaders/mul.comp
@@ -4,9 +4,11 @@
#include "generic_binary_head.comp"
void main() {
- if (gl_GlobalInvocationID.x >= p.ne) {
+ const uint idx = get_idx();
+
+ if (idx >= p.ne) {
return;
}
- data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
+ data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(data_b[src1_idx(idx)]));
}
diff --git a/ggml/src/vulkan-shaders/mul_mat_vec.comp b/ggml/src/vulkan-shaders/mul_mat_vec.comp
index 15d2a806..46a6369b 100644
--- a/ggml/src/vulkan-shaders/mul_mat_vec.comp
+++ b/ggml/src/vulkan-shaders/mul_mat_vec.comp
@@ -16,6 +16,13 @@ void main() {
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
const uint tid = gl_LocalInvocationID.x;
+ // There are not enough cols to use all threads
+ if (tid >= p.ncols) {
+ return;
+ }
+
+ const uint block_size = min(p.ncols, BLOCK_SIZE);
+
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
@@ -23,8 +30,8 @@ void main() {
tmp[tid] = FLOAT_TYPE(0.0f);
- [[unroll]] for (uint i = 0; i < p.ncols/BLOCK_SIZE; i += 2) {
- const uint col = i*BLOCK_SIZE + 2*tid;
+ [[unroll]] for (uint i = 0; i < p.ncols/block_size; i += 2) {
+ const uint col = i*block_size + 2*tid;
const uint ib = (row*p.ncols + col)/QUANT_K; // block index
const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
const uint iybs = col - col%QUANT_K; // y block start index
@@ -38,7 +45,7 @@ void main() {
// sum up partial sums and write back result
barrier();
- [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
+ [[unroll]] for (uint s = block_size/2; s > 0; s >>= 1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
diff --git a/ggml/src/vulkan-shaders/norm.comp b/ggml/src/vulkan-shaders/norm.comp
index 803dbdcb..6627a50b 100644
--- a/ggml/src/vulkan-shaders/norm.comp
+++ b/ggml/src/vulkan-shaders/norm.comp
@@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
shared vec2 sum[BLOCK_SIZE];
void main() {
- const uint row = gl_WorkGroupID.x;
+ const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint tid = gl_LocalInvocationID.x;
sum[tid] = vec2(0.0f, 0.0f);
diff --git a/ggml/src/vulkan-shaders/pad.comp b/ggml/src/vulkan-shaders/pad.comp
new file mode 100644
index 00000000..a465cd52
--- /dev/null
+++ b/ggml/src/vulkan-shaders/pad.comp
@@ -0,0 +1,26 @@
+#version 450
+
+#include "types.comp"
+#include "generic_unary_head.comp"
+
+void main() {
+ const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+ if (idx >= p.ne) {
+ return;
+ }
+
+ const uint i3 = idx / (p.ne12*p.ne11*p.ne10);
+ const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10;
+ const uint i2 = (idx - i3_offset) / (p.ne11*p.ne10);
+ const uint i2_offset = i2*p.ne11*p.ne10;
+ const uint i1 = (idx - i3_offset - i2_offset) / p.ne10;
+ const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10;
+
+ const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
+ const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10;
+
+ const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
+
+ data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f);
+}
diff --git a/ggml/src/vulkan-shaders/relu.comp b/ggml/src/vulkan-shaders/relu.comp
index 7e5baa5b..52a19b62 100644
--- a/ggml/src/vulkan-shaders/relu.comp
+++ b/ggml/src/vulkan-shaders/relu.comp
@@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
- const uint i = gl_GlobalInvocationID.x;
+ const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (i >= p.KX) {
return;
diff --git a/ggml/src/vulkan-shaders/rms_norm.comp b/ggml/src/vulkan-shaders/rms_norm.comp
index cfd08d34..b554400b 100644
--- a/ggml/src/vulkan-shaders/rms_norm.comp
+++ b/ggml/src/vulkan-shaders/rms_norm.comp
@@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
shared FLOAT_TYPE sum[BLOCK_SIZE];
void main() {
- const uint row = gl_WorkGroupID.x;
+ const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint tid = gl_LocalInvocationID.x;
sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp
diff --git a/ggml/src/vulkan-shaders/scale.comp b/ggml/src/vulkan-shaders/scale.comp
index 510cb723..5cd2f668 100644
--- a/ggml/src/vulkan-shaders/scale.comp
+++ b/ggml/src/vulkan-shaders/scale.comp
@@ -4,9 +4,11 @@
#include "generic_unary_head.comp"
void main() {
- if (gl_GlobalInvocationID.x >= p.ne) {
+ const uint idx = get_idx();
+
+ if (idx >= p.ne) {
return;
}
- data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(p.param1));
+ data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1));
}
diff --git a/ggml/src/vulkan-shaders/silu.comp b/ggml/src/vulkan-shaders/silu.comp
index 15920f06..4d36f88e 100644
--- a/ggml/src/vulkan-shaders/silu.comp
+++ b/ggml/src/vulkan-shaders/silu.comp
@@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
- const uint i = gl_GlobalInvocationID.x;
+ const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (i >= p.KX) {
return;
diff --git a/ggml/src/vulkan-shaders/soft_max.comp b/ggml/src/vulkan-shaders/soft_max.comp
index 1b8419c7..0bd51eca 100644
--- a/ggml/src/vulkan-shaders/soft_max.comp
+++ b/ggml/src/vulkan-shaders/soft_max.comp
@@ -28,7 +28,7 @@ shared FLOAT_TYPE vals[BLOCK_SIZE];
void main() {
const uint tid = gl_LocalInvocationID.x;
- const uint rowx = gl_WorkGroupID.x;
+ const uint rowx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint rowy = rowx % p.KY;
float slope = 1.0f;
diff --git a/ggml/src/vulkan-shaders/square.comp b/ggml/src/vulkan-shaders/square.comp
index 8dd19333..1fa118c9 100644
--- a/ggml/src/vulkan-shaders/square.comp
+++ b/ggml/src/vulkan-shaders/square.comp
@@ -4,10 +4,12 @@
#include "generic_unary_head.comp"
void main() {
- if (gl_GlobalInvocationID.x >= p.ne) {
+ const uint idx = get_idx();
+
+ if (idx >= p.ne) {
return;
}
- const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
- data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val * val);
+ const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
+ data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val);
}
diff --git a/ggml/src/vulkan-shaders/sum_rows.comp b/ggml/src/vulkan-shaders/sum_rows.comp
index ce2f1e2f..961e5ffa 100644
--- a/ggml/src/vulkan-shaders/sum_rows.comp
+++ b/ggml/src/vulkan-shaders/sum_rows.comp
@@ -14,7 +14,7 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32;
shared FLOAT_TYPE tmp[BLOCK_SIZE];
void main() {
- const uint row = gl_WorkGroupID.x;
+ const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint col = gl_LocalInvocationID.x;
tmp[col] = FLOAT_TYPE(0.0f);
diff --git a/ggml/src/vulkan-shaders/tanh.comp b/ggml/src/vulkan-shaders/tanh.comp
new file mode 100644
index 00000000..74630dc7
--- /dev/null
+++ b/ggml/src/vulkan-shaders/tanh.comp
@@ -0,0 +1,21 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+ const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+ if (i >= p.KX) {
+ return;
+ }
+
+ data_d[i] = D_TYPE(tanh(data_a[i]));
+}
diff --git a/ggml/src/vulkan-shaders/timestep_embedding.comp b/ggml/src/vulkan-shaders/timestep_embedding.comp
new file mode 100644
index 00000000..79e065a9
--- /dev/null
+++ b/ggml/src/vulkan-shaders/timestep_embedding.comp
@@ -0,0 +1,41 @@
+#version 450
+
+#extension GL_EXT_shader_16bit_storage : require
+
+layout (push_constant) uniform parameter
+{
+ uint nb1;
+ uint dim;
+ uint max_period;
+} p;
+
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+#define BLOCK_SIZE 256
+
+layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+ const uint i = gl_WorkGroupID.y;
+ const uint j = gl_GlobalInvocationID.x;
+ const uint d_offset = i * p.nb1;
+
+ if (p.dim % 2 != 0 && j == ((p.dim + 1) / 2)) {
+ data_d[d_offset + p.dim] = 0.f;
+ }
+
+ const uint half_dim = p.dim / 2;
+ if (j >= half_dim) {
+ return;
+ }
+
+ const float timestep = float(data_a[i]);
+ const float freq = float(exp(-log(p.max_period) * j / half_dim));
+ const float arg = timestep * freq;
+ data_d[d_offset + j] = D_TYPE(cos(arg));
+ data_d[d_offset + j + half_dim] = D_TYPE(sin(arg));
+}
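
Note: the new timestep_embedding.comp computes the usual sinusoidal timestep embedding. Workgroup row i handles one timestep, thread j < dim/2 writes a cos/sin pair at offsets j and j + dim/2, and when dim is odd the extra trailing slot of the padded row is zeroed. Restated as a reference snippet (an illustration, not part of the diff; max_period is typically 10000):

// freq(j) = max_period^(-j / half_dim)  ==  exp(-log(max_period) * j / half_dim)
vec2 timestep_embed_pair(float timestep, uint j, uint half_dim, float max_period) {
    const float freq = pow(max_period, -float(j) / float(half_dim));
    return vec2(cos(timestep * freq), sin(timestep * freq)); // written to [j] and [j + half_dim]
}

For example, with dim = 4 and max_period = 10000: freq(0) = 1 and freq(1) = 10000^(-1/2) = 0.01.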
diff --git a/ggml/src/vulkan-shaders/types.comp b/ggml/src/vulkan-shaders/types.comp
index d24c172c..21dce72f 100644
--- a/ggml/src/vulkan-shaders/types.comp
+++ b/ggml/src/vulkan-shaders/types.comp
@@ -6,7 +6,7 @@
#define QUANT_K 1
#define QUANT_R 1
-#ifndef LOAD_VEC_A
+#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
#define A_TYPE float
#elif LOAD_VEC_A == 4
#define A_TYPE vec4
@@ -19,7 +19,7 @@
#define QUANT_K 1
#define QUANT_R 1
-#ifndef LOAD_VEC_A
+#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
#define A_TYPE float16_t
#elif LOAD_VEC_A == 4
#define A_TYPE f16vec4
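
Note: the types.comp change makes an explicit LOAD_VEC_A=1 behave the same as the define being absent. This pairs with the generator change further down, which now passes LOAD_VEC_A=1 for the unaligned f32/f16 matmul variants instead of leaving it unset; with the old #ifndef guard those builds would presumably have skipped the scalar branch and ended up without a usable scalar A_TYPE. The two cases the relaxed guard covers, spelled out for illustration:

#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
#define A_TYPE float        // scalar loads: no define, or an explicit -DLOAD_VEC_A=1
#elif LOAD_VEC_A == 4
#define A_TYPE vec4         // vectorized loads, four elements per fetch
#endif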
diff --git a/ggml/src/vulkan-shaders/upscale.comp b/ggml/src/vulkan-shaders/upscale.comp
new file mode 100644
index 00000000..511a086e
--- /dev/null
+++ b/ggml/src/vulkan-shaders/upscale.comp
@@ -0,0 +1,36 @@
+#version 450
+
+layout (push_constant) uniform parameter
+{
+ uint ne; uint d_offset;
+ uint nb00; uint nb01; uint nb02; uint nb03;
+ uint ne10; uint ne11; uint ne12; uint ne13;
+ float sf0; float sf1; float sf2; float sf3;
+} p;
+
+#include "types.comp"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+ const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+ if (idx >= p.ne) {
+ return;
+ }
+
+ const uint i10 = idx % p.ne10;
+ const uint i11 = (idx / p.ne10) % p.ne11;
+ const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12;
+ const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13;
+
+ const uint i00 = uint(i10 / p.sf0);
+ const uint i01 = uint(i11 / p.sf1);
+ const uint i02 = uint(i12 / p.sf2);
+ const uint i03 = uint(i13 / p.sf3);
+
+ data_d[p.d_offset + idx] = D_TYPE(data_a[i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
+}
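
Note: upscale.comp is a plain nearest-neighbor resize. It unflattens the destination index into (i10, i11, i12, i13), divides each coordinate by its scale factor, truncates, and gathers from the source through the nb0x strides (presumably element strides prepared on the host, with sf set to dst_extent / src_extent, though that setup is not shown here). A worked sketch of the per-axis mapping under those assumptions:

// Hypothetical helper: destination coordinate -> source coordinate for one axis.
uint src_coord(uint dst_coord, float sf) {
    return uint(dst_coord / sf); // floor(dst / scale) repeats each source element sf times
}
// e.g. a 2x upscale along dim 0 (sf0 = 2.0): i10 = 0..7 maps to i00 = 0,0,1,1,2,2,3,3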
diff --git a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
index c5be3754..a792e203 100644
--- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -22,6 +22,7 @@
#ifdef _WIN32
#include <windows.h>
#include <direct.h> // For _mkdir on Windows
+ #include <algorithm> // For std::replace on w64devkit
#else
#include <unistd.h>
#include <sys/wait.h>
@@ -179,11 +180,7 @@ bool string_ends_with(const std::string& str, const std::string& suffix) {
return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
}
-#ifdef _WIN32
- static const char path_separator = '\\';
-#else
- static const char path_separator = '/';
-#endif
+static const char path_separator = '/';
std::string join_paths(const std::string& path1, const std::string& path2) {
return path1 + path_separator + path2;
@@ -198,7 +195,11 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const
std::string out_fname = join_paths(output_dir, name + ".spv");
std::string in_path = join_paths(input_dir, in_fname);
- std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname};
+ #ifdef _WIN32
+ std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""};
+ #else
+ std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname};
+ #endif
for (const auto& define : defines) {
cmd.push_back("-D" + define.first + "=" + define.second);
}
@@ -269,9 +270,12 @@ void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmu
for (const auto& tname : type_names) {
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
+ // For unaligned, load one at a time for f32/f16, or two at a time for quants
+ std::string load_vec_a_unaligned = (tname == "f32" || tname == "f16") ? "1" : "2";
+ // For aligned matmul loads
std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2";
tasks.push_back(std::async(std::launch::async, [=] {
- string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
+ string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
}));
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16);
@@ -341,6 +345,9 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
tasks.push_back(std::async(std::launch::async, [=] {
+ string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+ }));
+ tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
@@ -357,6 +364,9 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}));
+ tasks.push_back(std::async(std::launch::async, [] {
+ string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
+ }));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
@@ -383,14 +393,41 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
}));
tasks.push_back(std::async(std::launch::async, [] {
+ string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+ }));
+
+ tasks.push_back(std::async(std::launch::async, [] {
+ string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+ }));
+ tasks.push_back(std::async(std::launch::async, [] {
+ string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
+ }));
+ tasks.push_back(std::async(std::launch::async, [] {
+ string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}});
+ }));
+
+ tasks.push_back(std::async(std::launch::async, [] {
+ string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+ }));
+
+ tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
+ string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+ }));
+ tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
+ tasks.push_back(std::async(std::launch::async, [] {
+ string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+ }));
+ tasks.push_back(std::async(std::launch::async, [] {
+ string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+ }));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
@@ -424,6 +461,17 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
+
+ tasks.push_back(std::async(std::launch::async, [=] {
+ string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+ }));
+ tasks.push_back(std::async(std::launch::async, [=] {
+ string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
+ }));
+
+ tasks.push_back(std::async(std::launch::async, [=] {
+ string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+ }));
}
void write_output_files() {
@@ -435,10 +483,16 @@ void write_output_files() {
for (const auto& pair : shader_fnames) {
const std::string& name = pair.first;
- const std::string& path = pair.second;
+ #ifdef _WIN32
+ std::string path = pair.second;
+ std::replace(path.begin(), path.end(), '/', '\\' );
+ #else
+ const std::string& path = pair.second;
+ #endif
+
FILE* spv = fopen(path.c_str(), "rb");
if (!spv) {
- std::cerr << "Error opening SPIR-V file: " << path << "\n";
+ std::cerr << "Error opening SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
continue;
}
@@ -450,7 +504,7 @@ void write_output_files() {
size_t read_size = fread(data.data(), 1, size, spv);
fclose(spv);
if (read_size != size) {
- std::cerr << "Error reading SPIR-V file: " << path << "\n";
+ std::cerr << "Error reading SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
continue;
}