diff options
author | Kawrakow <48489457+ikawrakow@users.noreply.github.com> | 2024-08-12 15:14:32 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-08-12 15:14:32 +0200 |
commit | 8f43e551038af2547b5c01d0e9edd641c0e4bd29 (patch) | |
tree | 07a4373620a9381d0b5c7189a475990a6feb48a5 /ggml/src/ggml-cann/aclnn_ops.cpp | |
parent | f5d1af61d79fb53ccfbac2e665e43208c07b083d (diff) |
Merge mainline - Aug 12 2024 (#17)
* Merge mainline
* Fix after merge
* Remove CI check
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src/ggml-cann/aclnn_ops.cpp')
-rw-r--r-- | ggml/src/ggml-cann/aclnn_ops.cpp | 244 |
1 files changed, 191 insertions, 53 deletions
diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index a02efc82..8c4132f5 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -464,9 +464,11 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclTensor* acl_src = ggml_cann_create_tensor(src); aclTensor* acl_dst = ggml_cann_create_tensor(dst); - const float eps = 1e-6f; // TODO: make this a parameter int n_groups = dst->op_params[0]; + float eps; + memcpy(&eps, dst->op_params + 1, sizeof(float)); + uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -844,7 +846,7 @@ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_max_pool2d(ctx, dst); break; case GGML_OP_POOL_COUNT: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -910,6 +912,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->ne); return; } + if (dst->type == GGML_TYPE_Q4_0) { + aclrtlaunch_ascendc_quantize_f16_to_q4_0( + 24, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne); + return; + } if (dst->type == GGML_TYPE_F16) { if (ggml_are_same_shape(src, dst)) { cann_copy(ctx, acl_src, acl_dst); @@ -931,9 +940,9 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); return; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (dst->type == GGML_TYPE_F32) { if (ggml_are_same_shape(src, dst)) { @@ -955,12 +964,12 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); return; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } // TODO - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else if (src->type == GGML_TYPE_F32) { // TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size // && nb0 == type_size) @@ -971,6 +980,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->ne); return; } + if (dst->type == GGML_TYPE_Q4_0) { + aclrtlaunch_ascendc_quantize_f32_to_q4_0( + 24, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne); + return; + } if (dst->type == GGML_TYPE_F32) { if (ggml_are_same_shape(src, dst)) { cann_copy(ctx, acl_src, acl_dst); @@ -991,10 +1007,10 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); return; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else { // TODO: dst not contiguous - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } if (dst->type == GGML_TYPE_F16) { @@ -1017,11 +1033,11 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); return; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } // TODO - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else { if (ggml_are_same_shape(src, dst)) { cann_copy(ctx, acl_src, acl_dst); @@ -1029,7 +1045,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclDestroyTensor(acl_dst)); return; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -1312,6 +1328,111 @@ aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize, #ifdef __cplusplus } #endif + +static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx, + ggml_tensor* dst, + ggml_tensor* src1, + aclTensor* tmp_cast_tensor, + aclTensor* tmp_im2col_tensor) { + // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW] + int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]}; + size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]}; + aclTensor* acl_dst = + ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1); + + int64_t permute_dim[] = {0, 2, 1}; + if (src1->type != dst->type) { + aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3); + } else { + aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3); + } + + // release + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +static void ggml_cann_im2col_1d_post_process( + ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_tensor* src1, + aclTensor* tmp_cast_tensor, aclTensor* tmp_im2col_tensor, + const std::vector<int64_t>& im2col_op_params) { + // get params + const int64_t KH = im2col_op_params[0]; + const int64_t KW = im2col_op_params[1]; + const int64_t IW = im2col_op_params[2]; + const int64_t IC = im2col_op_params[3]; + const int64_t N = im2col_op_params[4]; + const int64_t OH = im2col_op_params[5]; + const int64_t OW = im2col_op_params[6]; + const int64_t s0 = im2col_op_params[7]; + const int64_t p0 = im2col_op_params[8]; + const int64_t d0 = im2col_op_params[9]; + const int64_t n_bytes_factor = im2col_op_params[10]; + + // Permute: [N, IC * KH * KW, OW * OH] -> + // [N, OW * OH * n_bytes_factor, IC * KH * KW] + aclTensor* tmp_permute_tensor = nullptr; + ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool()); + tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor); + void* tmp_permute_buffer = tmp_permute_allocator.get(); + + int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N}; + size_t tmp_permute_nb[GGML_MAX_DIMS - 1]; + tmp_permute_nb[0] = ggml_type_size(dst->type); + for (int i = 1; i < GGML_MAX_DIMS - 1; i++) { + tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1]; + } + + tmp_permute_tensor = ggml_cann_create_tensor( + tmp_permute_buffer, ggml_cann_type_mapping(dst->type), + ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb, + GGML_MAX_DIMS - 1, ACL_FORMAT_ND); + + int64_t permute_dim[] = {0, 2, 1}; + if (src1->type != dst->type) { + aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3); + } else { + aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim, + 3); + } + + // number of times the kernel moves in W dimension + const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1; + size_t offset; + void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer; + + // memory copy with offset to restore 1D im2col from 2d + if (IC > 1) { + offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type); + size_t size_cpy = KH * KW * ggml_type_size(dst->type); + + for (int c = 0; c < IC; c++) { + cur_permute_buffer = (char*)tmp_permute_buffer + offset + + KH * KW * c * ggml_type_size(dst->type); + cur_dst_buffer = (char*)dst->data + + c * KH * KW * n_step_w * ggml_type_size(dst->type); + + for (int i = 0; i < n_step_w; i++) { + ACL_CHECK(aclrtMemcpyAsync( + cur_dst_buffer, size_cpy, cur_permute_buffer, size_cpy, + ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); + cur_dst_buffer = + (char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type); + cur_permute_buffer = (char*)cur_permute_buffer + + KH * KW * IC * ggml_type_size(dst->type); + } + } + } else { + offset = KH * KW * n_step_w * + ggml_type_size(dst->type); // equal to ggml_nbytes(dst) + ACL_CHECK(aclrtMemcpyAsync(dst->data, offset, + (char*)tmp_permute_buffer + offset, offset, + ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); + } + + // release + ACL_CHECK(aclDestroyTensor(tmp_permute_tensor)); +} + void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src0 = dst->src[0]; // kernel ggml_tensor* src1 = dst->src[1]; // input @@ -1320,21 +1441,23 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); + GGML_TENSOR_BINARY_OP_LOCALS; + + // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D + // im2col and do post-processing to restore it to 1D. + const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t s1 = is_2D ? ((const int32_t*)(dst->op_params))[1] : 1; const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; - const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t p1 = is_2D ? ((const int32_t*)(dst->op_params))[3] : 1; const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; - const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; - const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; + const int32_t d1 = is_2D ? ((const int32_t*)(dst->op_params))[5] : 1; - GGML_TENSOR_BINARY_OP_LOCALS; - - const int64_t N = is_2D ? ne13 : ne12; - const int64_t IC = is_2D ? ne12 : ne11; - - const int64_t KH = is_2D ? ne01 : 1; + const int64_t N = ne13; + const int64_t IC = ne12; + const int64_t KH = ne01; const int64_t KW = ne00; + const int64_t IW = ne10; const int64_t OH = is_2D ? ne2 : 1; const int64_t OW = ne1; @@ -1342,9 +1465,12 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); - // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH] + // memory allocated increased to 3x when is_2D == false + const int64_t n_bytes_factor = is_2D ? 1 : 3; + + // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor] aclTensor* acl_src1 = ggml_cann_create_tensor(src1); - int64_t tmp_im2col_ne[] = {OW * OH, IC * KH * KW, N}; + int64_t tmp_im2col_ne[] = {OW * OH * n_bytes_factor, IC * KH * KW, N}; size_t tmp_im2col_nb[GGML_MAX_DIMS - 1]; tmp_im2col_nb[0] = ggml_type_size(src1->type); @@ -1356,8 +1482,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { // If dst is f16, tmp_buffer is f32, we need alloc src.typesize * // dst.elemcount. ggml_cann_pool_alloc im2col_allocator( - ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1)); + ctx.pool(), + ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor); void* tmp_im2col_buffer = im2col_allocator.get(); + aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor( tmp_im2col_buffer, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb, @@ -1380,8 +1508,9 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { paddings, strides, tmp_im2col_tensor, &workspaceSize, &executor)); + ggml_cann_pool_alloc workspace_allocator(ctx.pool()); if (workspaceSize > 0) { - ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspace_allocator.alloc(workspaceSize); workspaceAddr = workspace_allocator.get(); } @@ -1391,9 +1520,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { // Cast if dst is f16. aclTensor* tmp_cast_tensor = nullptr; ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool()); + void* tmp_cast_buffer = nullptr; if (src1->type != dst->type) { - tmp_cast_allocator.alloc(ggml_nbytes(dst)); - void* tmp_cast_buffer = tmp_cast_allocator.get(); + tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor); + tmp_cast_buffer = tmp_cast_allocator.get(); size_t temp_cast_nb[GGML_MAX_DIMS - 1]; temp_cast_nb[0] = ggml_type_size(dst->type); for (int i = 1; i < GGML_MAX_DIMS - 1; i++) { @@ -1408,24 +1538,21 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_type_mapping(dst->type)); } - // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW] - int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]}; - size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]}; - aclTensor* acl_dst = - ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1); - - int64_t permute_dim[] = {0, 2, 1}; - if (src1->type != dst->type) { - aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3); + // post-processing + if (is_2D) { + ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor, + tmp_im2col_tensor); } else { - aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3); + std::vector<int64_t> im2col_op_params = { + KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor}; + ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor, + tmp_im2col_tensor, im2col_op_params); } // release ACL_CHECK(aclDestroyTensor(acl_src1)); ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor)); ACL_CHECK(aclDestroyTensor(tmp_cast_tensor)); - ACL_CHECK(aclDestroyTensor(acl_dst)); ACL_CHECK(aclDestroyIntArray(kernel_size)); ACL_CHECK(aclDestroyIntArray(dilations)); ACL_CHECK(aclDestroyIntArray(paddings)); @@ -2219,7 +2346,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -2352,21 +2479,33 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, * @param dst The destination tensor where the result of the matrix * multiplication will be stored. */ -static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, - ggml_tensor* dst) { +static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, + ggml_tensor* dst, + const enum ggml_type type) { ggml_tensor* src0 = dst->src[0]; // weight ggml_tensor* src1 = dst->src[1]; // input // The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC // is regarded as batch. weight need transpose. int64_t weight_ne[] = {src0->ne[1], src0->ne[0]}; - size_t weight_elem_size = sizeof(uint8_t); - size_t weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size}; + float weight_elem_size; + if (type == GGML_TYPE_Q4_0) { + weight_elem_size = float(sizeof(uint8_t)) / 2; + } + else if (type == GGML_TYPE_Q8_0) { + weight_elem_size = float(sizeof(uint8_t)); + } + else { + GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT"); + } + float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size}; + // size of one matrix is element_size * height * width. size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1]; size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3]; // scale stored at the end of weight. Also need transpose. + GGML_ASSERT(QK4_0 == QK8_0); int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0}; size_t scale_elem_size = sizeof(uint16_t); size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, @@ -2381,10 +2520,10 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]}; size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1]; + ggml_cann_pool_alloc input_alloctor(ctx.pool()); if (src1->type != GGML_TYPE_F16) { aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1); - ggml_cann_pool_alloc input_alloctor( - ctx.pool(), ggml_nelements(src1) * input_elem_size); + input_alloctor.alloc(ggml_nelements(src1) * input_elem_size); input_buffer = input_alloctor.get(); int64_t* input_cast_ne = src1->ne; @@ -2430,8 +2569,9 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16, input_elem_size, input_ne, input_nb, 2); aclTensor* acl_weight_tensor = ggml_cann_create_tensor( - (char*)src0->data + batch0 * weight_stride, ACL_INT8, - weight_elem_size, weight_ne, weight_nb, 2); + (char*)src0->data + batch0 * weight_stride, + ggml_cann_type_mapping(type), weight_elem_size, weight_ne, + weight_nb, 2); aclTensor* acl_scale_tensor = ggml_cann_create_tensor( scale_offset + batch0 * scale_stride, ACL_FLOAT16, scale_elem_size, scale_ne, scale_nb, 2); @@ -2485,14 +2625,12 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { case GGML_TYPE_F16: ggml_cann_mat_mul_fp(ctx, dst); break; - // case GGML_TYPE_Q4_0: - // ggml_cann_mul_mat_q4_0(ctx, dst); - // break; + case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: - ggml_cann_mul_mat_q8_0(ctx, dst); + ggml_cann_mul_mat_quant(ctx, dst, type); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } |