From a8e2e8d76a5f33eb9894ce8adba169f807ac00e3 Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Tue, 19 Oct 2021 12:27:36 -0700 Subject: [PATCH] hipify tensor/transpose.cc and tensor/transpose.h (#9397) --- .../core/providers/cuda/tensor/transpose.cc | 4 +- .../core/providers/rocm/tensor/transpose.cc | 247 ------------------ .../core/providers/rocm/tensor/transpose.h | 33 --- tools/ci_build/amd_hipify.py | 3 +- 4 files changed, 3 insertions(+), 284 deletions(-) delete mode 100644 onnxruntime/core/providers/rocm/tensor/transpose.cc delete mode 100644 onnxruntime/core/providers/rocm/tensor/transpose.h diff --git a/onnxruntime/core/providers/cuda/tensor/transpose.cc b/onnxruntime/core/providers/cuda/tensor/transpose.cc index b929889982..80fd55319b 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose.cc +++ b/onnxruntime/core/providers/cuda/tensor/transpose.cc @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "transpose.h" -#include "transpose_impl.h" +#include "core/providers/cuda/tensor/transpose.h" +#include "core/providers/cuda/tensor/transpose_impl.h" #include "core/providers/cpu/tensor/utils.h" #include "core/providers/cuda/shared_inc/fpgeneric.h" diff --git a/onnxruntime/core/providers/rocm/tensor/transpose.cc b/onnxruntime/core/providers/rocm/tensor/transpose.cc deleted file mode 100644 index ff0a6b4df0..0000000000 --- a/onnxruntime/core/providers/rocm/tensor/transpose.cc +++ /dev/null @@ -1,247 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/providers/rocm/tensor/transpose.h" -#include "core/providers/rocm/tensor/transpose_impl.h" -#include "core/providers/cpu/tensor/utils.h" -#include "core/providers/rocm/shared_inc/fpgeneric.h" - -namespace onnxruntime { -namespace rocm { - -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - Transpose, - kOnnxDomain, - 1, 12, - kRocmExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), - Transpose); - -ONNX_OPERATOR_KERNEL_EX( - Transpose, - kOnnxDomain, - 13, - kRocmExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), - Transpose); - -// special case acceleration using rocblas matrix transpose -static std::tuple TryTransposeWithRocblas(const std::vector& perm, const TensorShape& input_shape) { - int M = 0; - int N = 0; - - if (perm.size() == 4 && input_shape[0] == 1 && perm[0] == 0) { - // NCHW <-> NHWC when N == 1 - if ((perm[1] == 2 && perm[2] == 3 && perm[3] == 1) || - (perm[1] == 3 && perm[2] == 1 && perm[3] == 2)) { - if (perm[1] == 2) { - M = gsl::narrow(input_shape[1]); - N = gsl::narrow(input_shape[2] * input_shape[3]); - } else { - M = gsl::narrow(input_shape[1] * input_shape[2]); - N = gsl::narrow(input_shape[3]); - } - } - } else if (perm.size() == 2 && perm[1] == 0 && perm[0] == 1) { - // 2D matrix transpose - M = gsl::narrow(input_shape[0]); - N = gsl::narrow(input_shape[1]); - } - - return std::make_tuple(M, N); -} - -template -Status TransposeWithRocblas(hipStream_t stream, rocblas_handle rocblas_handle, const Tensor& input, Tensor& output, int M, int N) { - typedef typename ToHipType::MappedType HipT; - HipT one = ToHipType::FromFloat(1.0f); - HipT zero = ToHipType::FromFloat(0.0f); - const HipT* input_data = reinterpret_cast(input.Data()); - HipT* output_data = reinterpret_cast(output.MutableData()); - ROCBLAS_RETURN_IF_ERROR( - rocblasTransposeHelper(stream, - rocblas_handle, - rocblas_operation_transpose, rocblas_operation_transpose, M, N, - &one, - input_data, - N, - &zero, - input_data, - N, - output_data, - M)); - return Status::OK(); -} - -Status Transpose::DoTranspose(const Transpose& transpose_kernel, - const std::vector& permutations, const Tensor& input, Tensor& output) { - return Transpose::DoTranspose(transpose_kernel.GetDeviceProp(), transpose_kernel.Stream(), transpose_kernel.RocblasHandle(), permutations, input, output); -} - -Status Transpose::DoTranspose(const hipDeviceProp_t& prop, - hipStream_t stream, - const rocblas_handle rocblas_handle, - const std::vector& permutations, const Tensor& input, Tensor& output, - const TensorShape* input_shape_override) { - // special case when there is a dim value of 0 in the shape. - if (output.Shape().Size() == 0) - return Status::OK(); - - const std::vector& input_dims = input_shape_override ? input_shape_override->GetDims() : input.Shape().GetDims(); - const std::vector& output_dims = output.Shape().GetDims(); - auto rank = static_cast(input_dims.size()); - - // flatten the adjacent dimensions which are contiguous - // for example: permutations[0, 2, 3, 1] -> [0, 2, 1], permutations[0, 3, 1, 2] -> [0, 2, 1] - auto new_rank = rank; - std::vector new_permutations(permutations); - std::vector new_input_dims(input_dims); - std::vector new_output_dims(output_dims); - - for (auto i = rank - 1; i > 0; i--) { - auto curr = new_permutations[i]; - auto prev = new_permutations[i - 1]; - if (prev + 1 == curr) { - // all dims bigger than curr need to be reduced by 1 due to the merging. - for (auto j = 0; j < new_rank; j++) { - if (new_permutations[j] > curr) { - new_permutations[j] -= 1; - } - } - for (auto j = i + 1; j < new_rank; j++) { - new_permutations[j - 1] = new_permutations[j]; - } - - // update input dims - new_input_dims[prev] *= new_input_dims[curr]; - new_input_dims[curr] = 1; - for (auto j = static_cast(curr + 1); j < new_rank; j++) { - new_input_dims[j - 1] = new_input_dims[j]; - } - new_input_dims[new_rank - 1] = 1; - - // update output dims - new_output_dims[i - 1] *= new_output_dims[i]; - new_output_dims[i] = 1; - for (auto j = i + 1; j < new_rank; j++) { - new_output_dims[j - 1] = new_output_dims[j]; - } - new_output_dims[new_rank - 1] = 1; - - new_rank--; - } - } - new_permutations.resize(new_rank); - new_input_dims.resize(new_rank); - new_output_dims.resize(new_rank); - - auto element_type = input.GetElementType(); - size_t element_size = input.DataType()->Size(); - if (element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT || - element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE || - element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16) { - auto mn = TryTransposeWithRocblas(new_permutations, new_input_dims); - int M = std::get<0>(mn); - int N = std::get<1>(mn); - if (M != 0 && N != 0) { - if (element_type == utils::GetONNXTensorElementDataType()) { - return TransposeWithRocblas(stream, rocblas_handle, input, output, M, N); - } else if (element_type == utils::GetONNXTensorElementDataType()) { - return TransposeWithRocblas(stream, rocblas_handle, input, output, M, N); - } else { - return TransposeWithRocblas(stream, rocblas_handle, input, output, M, N); - } - } - } - - // Transpose021 has a specialized Transpose3DImpl kernel - dim3 grid_size, block_size; - if (CanDoTranspose3D(prop, new_rank, new_input_dims, new_permutations, grid_size, block_size)) { - TensorPitches new_input_strides(new_input_dims); - return Transpose3DImpl(stream, element_size, new_input_dims, new_input_strides, - input.DataRaw(), output.MutableDataRaw(), output.Shape().Size(), grid_size, block_size); - } - - // 3D-Transpose can treated as a special case of 4D-Transpose with first dimension being 1. - if (new_rank == 3) { - new_permutations[0]++; - new_permutations[1]++; - new_permutations[2]++; - new_permutations.insert(new_permutations.begin(), 0); - new_input_dims.insert(new_input_dims.begin(), 1); - new_output_dims.insert(new_output_dims.begin(), 1); - new_rank = 4; - } - - TensorPitches new_input_strides(new_input_dims); - TensorPitches new_output_strides(new_output_dims); - TArray input_shape(new_input_dims); - TArray tmp_input_strides(new_input_strides); - - if (CanDoTranspose4DParallelizeMultipleElementsPerThreadInInnermostDim( - prop, element_size, new_rank, new_input_dims, new_permutations, - grid_size, block_size)) { - TArray tmp_output_strides(new_rank); - for (auto i = 0; i < new_rank; i++) { - tmp_output_strides[static_cast(new_permutations[i])] = new_output_strides[i]; - } - return Transpose4DParallelizeMultipleElementsPerThreadInInnermostDim( - stream, element_size, input_shape, tmp_input_strides, input.DataRaw(), - tmp_output_strides, output.MutableDataRaw(), gsl::narrow(output.Shape().Size()), - grid_size, block_size); - } else if (CanDoTranspose4DParallelizeOneElementPerThread( - prop, element_size, new_rank, new_input_dims, new_permutations, grid_size, block_size)) { - // Trying to see if we can still do (best effort) more optimized transposing - // for the 4-D case before falling back to the generic case - TArray tmp_output_strides(new_rank); - for (auto i = 0; i < new_rank; i++) { - tmp_output_strides[static_cast(new_permutations[i])] = new_output_strides[i]; - } - return Transpose4DParallelizeOneElementPerThread( - stream, element_size, input_shape, tmp_input_strides, input.DataRaw(), - tmp_output_strides, output.MutableDataRaw(), gsl::narrow(output.Shape().Size()), - grid_size, block_size); - } - - // General cases - TArray input_strides(new_rank); - for (auto i = 0; i < new_rank; i++) { - input_strides[i] = new_input_strides[new_permutations[i]]; - } - - TArray output_strides(new_rank); - for (auto i = 0; i < new_rank; i++) { - output_strides[i] = fast_divmod(gsl::narrow_cast(new_output_strides[i])); - } - - auto status = TransposeImpl(stream, element_size, new_rank, input_strides, input.DataRaw(), - output_strides, output.MutableDataRaw(), output.Shape().Size()); - - return status; -} - -Status Transpose::ComputeInternal(OpKernelContext* ctx) const { - const Tensor* X_ptr = ctx->Input(0); - if (X_ptr == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch"); - const Tensor& X = *X_ptr; - const TensorShape& input_shape = X.Shape(); - const std::vector& input_dims = input_shape.GetDims(); - int32_t rank = gsl::narrow_cast(input_dims.size()); - - std::vector output_dims(rank); - std::vector default_perm(rank); - const std::vector* p_perm = nullptr; - const auto& status = ComputeOutputShape(X, output_dims, default_perm, p_perm); - if (!status.IsOK()) - return status; - - TensorShape output_shape{output_dims}; - Tensor* Y = ctx->Output(0, output_shape); - - return DoTranspose(this->GetDeviceProp(), this->Stream(), this->RocblasHandle(), *p_perm, X, *Y); -} - -} // namespace rocm -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/rocm/tensor/transpose.h b/onnxruntime/core/providers/rocm/tensor/transpose.h deleted file mode 100644 index 8cbcdb4679..0000000000 --- a/onnxruntime/core/providers/rocm/tensor/transpose.h +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include "core/providers/shared_library/provider_api.h" -#include "gsl/gsl" -#include "core/providers/rocm/rocm_kernel.h" -#include "core/providers/cpu/tensor/transpose.h" - -namespace onnxruntime { -namespace rocm { - -class Transpose final : public RocmKernel, public TransposeBase { - public: - Transpose(const OpKernelInfo& info) : RocmKernel(info), TransposeBase(info) { - } - - Status ComputeInternal(OpKernelContext* context) const override; - - static Status DoTranspose(const Transpose& transpose_kernel, - const std::vector& permutations, const Tensor& input, Tensor& output); - - // `input_shape_override` (if provided) overrides the shape of `input` for compute purposes - static Status DoTranspose(const hipDeviceProp_t& prop, - hipStream_t stream, - const rocblas_handle rocblas_handle, - const std::vector& permutations, - const Tensor& input, Tensor& output, const TensorShape* input_shape_override = nullptr); -}; - -} // namespace rocm -} // namespace onnxruntime diff --git a/tools/ci_build/amd_hipify.py b/tools/ci_build/amd_hipify.py index 55ad1bf5ac..f9e40965e0 100644 --- a/tools/ci_build/amd_hipify.py +++ b/tools/ci_build/amd_hipify.py @@ -143,8 +143,6 @@ provider_excluded_files = [ 'tensor/resize.h', 'tensor/resize_impl.cu', 'tensor/resize_impl.h', - 'tensor/transpose.cc', - 'tensor/transpose.h', 'tensor/upsample.cc', 'tensor/upsample.h', 'tensor/upsample_impl.cu', @@ -235,6 +233,7 @@ def hipify(src_file_path, dst_file_path): s = s.replace('hipblasCreate', 'rocblas_create_handle') s = s.replace('hipblasDestroy', 'rocblas_destroy_handle') s = s.replace('hipblasSetStream', 'rocblas_set_stream') + s = s.replace('HIPBLAS_OP_T', 'rocblas_operation_transpose') s = s.replace('RegisterCudaContribKernels', 'RegisterRocmContribKernels') s = s.replace('cudaEvent', 'hipEvent')