diff --git a/onnxruntime/contrib_ops/rocm/fused_conv.cc b/onnxruntime/contrib_ops/rocm/fused_conv.cc new file mode 100644 index 0000000000..a2f8cc7a85 --- /dev/null +++ b/onnxruntime/contrib_ops/rocm/fused_conv.cc @@ -0,0 +1,451 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include +#include "core/providers/rocm/nn/conv.h" +#include "core/providers/rocm/rocm_common.h" + +namespace onnxruntime { +namespace contrib { +namespace rocm { + +namespace { + +// Copied from hipDNN/library/src/hcc_detail/hipdnn_miopen.cpp +miopenStatus_t _miopenAddTensor( + miopenHandle_t handle, + const void *alpha, + const miopenTensorDescriptor_t aDesc, + const void *A, + const void *beta, + const miopenTensorDescriptor_t cDesc, + void *C, + const void* zero_scalar) +{ + const miopenTensorOp_t tensorOp = miopenTensorOpAdd; + // opnd2 = Add ( 0.0 * opnd0, alpha * opnd1 ) + alpha * opnd2 + return miopenOpTensor(handle, tensorOp, + zero_scalar, cDesc, C, + alpha, aDesc, A, + alpha, cDesc, C); +} + +} + +template +struct FNVHash { + uint32_t GetValue() const { return value_; } + + void Hash(const void* in_ptr, size_t nbytes) { + auto ptr = reinterpret_cast(in_ptr); + for (size_t i = 0; i < nbytes; ++i) { + value_ ^= ptr[i]; + value_ *= PRIME; + } + } + + template ::value, size_t>::type = 0> + FNVHash& operator<<(const T& pod) { + Hash(&pod, sizeof(pod)); + return *this; + } + + template + FNVHash& operator<<(const std::vector& pod_array) { + for (const auto& pod : pod_array) { + (*this) << pod; + } + return *this; + } + + void HashTensor(miopenTensorDescriptor_t tdesc) { + int size = 0; + miopenGetTensorDescriptorSize(tdesc, &size); + (*this) << size; + std::vector dims(size); + std::vector strides(size); + miopenDataType_t dtype; + miopenGetTensorDescriptor(tdesc, &dtype, dims.data(), strides.data()); + (*this) << dtype; + (*this) << dims; + (*this) << strides; + } + + void HashConvolutionDescriptor(miopenConvolutionDescriptor_t cdesc) { + int spatial_dim = 1; + // Current MIOpen doesn't provide API to probe the dimension of a + // miopenConvolutionDescriptor_t, so we have to guess. + // This algorithm is based on a specific behavior of miopenGetConvolutionNdDescriptor, + // which fails when requestedSpatialDim > the convolution's spatial dimension + std::vector spatial_dims; + std::vector pads; + std::vector strides; + std::vector dilations; + miopenConvolutionMode_t mode; + while (true) { + spatial_dims.resize(spatial_dim); + pads.resize(spatial_dim); + strides.resize(spatial_dim); + dilations.resize(spatial_dim); + + if (miopenStatusSuccess != miopenGetConvolutionNdDescriptor(cdesc, + spatial_dim, + spatial_dims.data(), + pads.data(), + strides.data(), + dilations.data(), + &mode)) { + // Remove the extra dimension + spatial_dims.resize(spatial_dim - 1); + pads.resize(spatial_dim - 1); + strides.resize(spatial_dim - 1); + dilations.resize(spatial_dim - 1); + (*this) << spatial_dim; + (*this) << spatial_dims; + (*this) << pads; + (*this) << strides; + (*this) << dilations; + break; + } + spatial_dim += 1; + ORT_ENFORCE(spatial_dim < 10, + "miopenGetConvolutionNdDescriptor is supposed to fail before spatial_dim gets to ", + spatial_dim); + } + } + private: + uint32_t value_ = BASIS; +}; + +template +class FusedConv : public onnxruntime::rocm::Conv { + public: + using Base = onnxruntime::rocm::Conv; + FusedConv(const OpKernelInfo& info) : onnxruntime::rocm::Conv(info) { + std::string activation; + if (info.GetAttr("activation", &activation) == Status::OK() && + MapMode(activation) == Status::OK() && + miopenCreateActivationDescriptor(&activation_desc_) == miopenStatusSuccess) { + status_ = miopenSetActivationDescriptor(activation_desc_, + activation_mode_, + 0.0, 0.0, 0.0); + } + } + + ORT_DISALLOW_COPY_AND_ASSIGNMENT(FusedConv); + + ~FusedConv() { + if (activation_desc_) { + miopenDestroyActivationDescriptor(activation_desc_); + status_ = miopenStatusNotInitialized; + activation_desc_ = nullptr; + } + } + + Status ComputeInternal(OpKernelContext* context) const override { + MIOPEN_RETURN_IF_ERROR(status_); + std::lock_guard lock(Base::s_.mutex); + + ORT_RETURN_IF_ERROR(Base::UpdateState(context, true)); + if (Base::s_.Y->Shape().Size() == 0) { + return Status::OK(); + } + + bool has_z = nullptr != Base::s_.z_data; + bool has_b = nullptr != Base::s_.b_data; + auto factory = [this](FusedConvFusionData& fusion) { + return this->DoCreateFusionDesc(fusion); + }; + auto& cached_item = plan_cache_.FindOrCreateFusionPlanCache(Hash(), + factory); + bool should_try_fusion_api = cached_item.Validate(Base::MiopenHandle()); + + typedef typename onnxruntime::rocm::ToHipType::MappedType HipT; + const auto alpha = onnxruntime::rocm::Consts::One; + const auto beta = onnxruntime::rocm::Consts::Zero; + IAllocatorUniquePtr workspace = Base::GetWorkSpace(); + miopenStatus_t fusion_status = miopenStatusNotInitialized; + + if (should_try_fusion_api) { + auto& fusion_info = *cached_item.fusion; + MIOPEN_RETURN_IF_ERROR(miopenSetOpArgsConvForward(fusion_info.fusion_args, + fusion_info.conv_op, + &alpha, + &beta, + Base::s_.w_data)); + if (has_z) { + MIOPEN_RETURN_IF_ERROR(miopenSetOpArgsBiasForward(fusion_info.fusion_args, + fusion_info.bias_z_op, + &alpha, + &beta, + Base::s_.z_data)); + } + if (has_b) { + MIOPEN_RETURN_IF_ERROR(miopenSetOpArgsBiasForward(fusion_info.fusion_args, + fusion_info.bias_b_op, + &alpha, + &beta, + Base::s_.b_data)); + } + if (activation_desc_) { + const float relu_notused = 0.0; + MIOPEN_RETURN_IF_ERROR(miopenSetOpArgsActivForward(fusion_info.fusion_args, + fusion_info.act_op, + &alpha, + &beta, + relu_notused, + relu_notused, + relu_notused)); + } + fusion_status = miopenExecuteFusionPlan(Base::MiopenHandle(), + fusion_info.plan, + Base::s_.x_tensor, + Base::s_.x_data, + Base::s_.y_tensor, + Base::s_.y_data, + fusion_info.fusion_args); + } + if (miopenStatusSuccess != fusion_status) { + MIOPEN_RETURN_IF_ERROR(miopenConvolutionForward(Base::MiopenHandle(), + &alpha, + Base::s_.x_tensor, + Base::s_.x_data, + Base::s_.w_desc, + Base::s_.w_data, + Base::s_.conv_desc, + Base::s_.fwd_algo, + &beta, + Base::s_.y_tensor, + Base::s_.y_data, + workspace.get(), + Base::s_.workspace_bytes)); + if (has_b) { + MIOPEN_RETURN_IF_ERROR(_miopenAddTensor(Base::MiopenHandle(), + &alpha, Base::s_.b_tensor, Base::s_.b_data, + &alpha, Base::s_.y_tensor, Base::s_.y_data, + &beta)); + } + if (has_z) { + MIOPEN_RETURN_IF_ERROR(_miopenAddTensor(Base::MiopenHandle(), + &alpha, Base::s_.z_tensor, Base::s_.z_data, + &alpha, Base::s_.y_tensor, Base::s_.y_data, + &beta)); + } + MIOPEN_RETURN_IF_ERROR(miopenActivationForward(Base::MiopenHandle(), + activation_desc_, + &alpha, + Base::s_.y_tensor, + Base::s_.y_data, + &beta, + Base::s_.y_tensor, + Base::s_.y_data)); + } + if (Base::s_.post_slicing_required) { + ORT_RETURN_IF_ERROR(onnxruntime::rocm::SliceOutUnwantedOutputSection( + this->Stream(), + Base::s_.y_data, + Base::s_.y_dims_with_adjusted_pads, + Base::s_.Y->MutableDataRaw(), + Base::s_.y_dims.GetDims(), + Base::s_.slice_starts, + Base::s_.slice_ends, + Base::s_.slice_axes, + Base::s_.element_size)); + } + return Status::OK(); + } + + private: + Status MapMode(const std::string& activaton_mode) { + if (activaton_mode == "Relu") { + activation_mode_ = miopenActivationMode_t::miopenActivationRELU; + } else { + return Status(common::StatusCategory::ONNXRUNTIME, + common::StatusCode::INVALID_ARGUMENT, + "unsupported conv activation mode"); + } + return Status::OK(); + } + miopenStatus_t status_ = miopenStatusNotInitialized; + miopenActivationMode_t activation_mode_; + miopenActivationDescriptor_t activation_desc_ = nullptr; + + // MIOpen Fusion API + // TODO: create one fusion descriptor shared by multiple FusedConv + // objects + // + // Considerations: + // How to determine two FusedConv objects may share the same fusion + // descriptor? Hashing x_tensor,conv_desc, etc.? + struct FusedConvFusionData { + miopenFusionPlanDescriptor_t plan = nullptr; + miopenFusionOpDescriptor_t conv_op = nullptr; + miopenFusionOpDescriptor_t bias_b_op = nullptr; + miopenFusionOpDescriptor_t bias_z_op = nullptr; + miopenFusionOpDescriptor_t act_op = nullptr; + miopenOperatorArgs_t fusion_args = nullptr; + + // TODO: There is a potential problem. miopenHandle_t may be destroyed and + // re-created later, sharing the same address. Currently there is any way + // to detect it? + mutable std::unordered_set compiled_on; + + FusedConvFusionData(const FusedConvFusionData&) = delete; + FusedConvFusionData& operator= (const FusedConvFusionData&) = delete; + + FusedConvFusionData(FusedConvFusionData&& other) { + *this = std::move(other); + } + FusedConvFusionData& operator=(FusedConvFusionData&& other) { + std::swap(this->plan, other.plan); + std::swap(this->fusion_args, other.fusion_args); + this->conv_op = other.conv_op; + this->bias_b_op = other.bias_b_op; + this->bias_z_op = other.bias_z_op; + this->act_op = other.act_op; + this->compiled_on = std::move(other.compiled_on); + return *this; + } + + FusedConvFusionData() { } + ~FusedConvFusionData() { + if (plan) { + miopenDestroyFusionPlan(plan); + } + if (fusion_args) { + miopenDestroyOperatorArgs(fusion_args); + } + } + }; + + struct FusionPlanCacheItem { + std::unique_ptr fusion; + Status creation_result; + // TODO: Add a timestamp for eviction + // std::chrono::time_point last_access; + + FusionPlanCacheItem() { + } + + miopenStatus_t CompileOnHandle(miopenHandle_t handle) const { + if (!fusion->plan) { + return miopenStatusNotInitialized; + } + auto iter = fusion->compiled_on.find(handle); + if (iter != fusion->compiled_on.end()) { + return miopenStatusSuccess; + } + auto ret = miopenCompileFusionPlan(handle, fusion->plan); + if (miopenStatusSuccess == ret) { + fusion->compiled_on.insert(handle); + } + return miopenStatusSuccess; + } + + bool Validate(miopenHandle_t handle) const { + if (Status::OK() != creation_result) { + return false; + } + if (!fusion || !fusion->plan || !fusion->fusion_args) { + return false; + } + auto compiling_status = CompileOnHandle(handle); + if (miopenStatusSuccess != compiling_status) { + return false; + } + + return true; + } + }; + + struct FusionPlanCache { + mutable OrtMutex mutex; + using HashKey = uint32_t; + std::unordered_map cache_directory_; + + FusionPlanCache() { + } + + FusionPlanCacheItem& FindOrCreateFusionPlanCache(HashKey key, + std::function factory) { + std::lock_guard lock(mutex); + auto iter = cache_directory_.find(key); + if (iter == cache_directory_.end()) { + cache_directory_[key].fusion = std::make_unique(); + cache_directory_[key].creation_result = factory(*cache_directory_[key].fusion); + if (Status::OK() != cache_directory_[key].creation_result) { + cache_directory_[key].fusion.reset(); + } + } + return cache_directory_[key]; + } + }; + + static FusionPlanCache plan_cache_; + + Status DoCreateFusionDesc(FusedConvFusionData& fusion) const { + bool has_z = nullptr != Base::s_.z_data; + bool has_b = nullptr != Base::s_.b_data; + MIOPEN_RETURN_IF_ERROR(miopenCreateFusionPlan(&fusion.plan, + miopenVerticalFusion, + Base::s_.x_tensor)); + MIOPEN_RETURN_IF_ERROR(miopenCreateOperatorArgs(&fusion.fusion_args)); + MIOPEN_RETURN_IF_ERROR(miopenCreateOpConvForward(fusion.plan, + &fusion.conv_op, + Base::s_.conv_desc, + Base::s_.w_desc)); + if (has_z) { + MIOPEN_RETURN_IF_ERROR(miopenCreateOpBiasForward(fusion.plan, + &fusion.bias_z_op, + Base::s_.z_tensor)); + } + if (has_b) { + MIOPEN_RETURN_IF_ERROR(miopenCreateOpBiasForward(fusion.plan, + &fusion.bias_b_op, + Base::s_.b_tensor)); + } + if (activation_desc_) { + MIOPEN_RETURN_IF_ERROR(miopenCreateOpActivationForward(fusion.plan, + &fusion.act_op, + activation_mode_)); + } + return Status::OK(); + } + + uint32_t Hash() const { + FNVHash hash; + bool has_z = nullptr != Base::s_.z_data; + bool has_b = nullptr != Base::s_.b_data; + hash.HashTensor(Base::s_.x_tensor); + hash.HashConvolutionDescriptor(Base::s_.conv_desc); + hash.HashTensor(Base::s_.w_desc); + if (has_z) { + hash.HashTensor(Base::s_.z_tensor); + } + if (has_b) { + hash.HashTensor(Base::s_.b_tensor); + } + if (activation_desc_) { + hash << static_cast(activation_mode_); + } + return hash.GetValue(); + } + +}; + +template +typename FusedConv::FusionPlanCache FusedConv::plan_cache_; + +ONNX_OPERATOR_TYPED_KERNEL_EX( + FusedConv, + kMSDomain, + 1, + float, + kRocmExecutionProvider, + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), + FusedConv); + +} // namespace rocm +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc b/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc index 2966d378bf..09dfd276d5 100644 --- a/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc @@ -205,7 +205,8 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, // backward compatibility BuildKernelCreateInfo, BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/test/contrib_ops/fused_conv_test.cc b/onnxruntime/test/contrib_ops/fused_conv_test.cc index f3c177f342..269bad58da 100644 --- a/onnxruntime/test/contrib_ops/fused_conv_test.cc +++ b/onnxruntime/test/contrib_ops/fused_conv_test.cc @@ -37,7 +37,7 @@ static std::unordered_set providers_except_cpu = { kArmNNExecutionProvider, kRocmExecutionProvider}; -static std::unordered_set providers_except_cpu_cuda = { +static std::unordered_set providers_except_cpu_gpu = { kDnnlExecutionProvider, kOpenVINOExecutionProvider, kNupharExecutionProvider, @@ -49,8 +49,7 @@ static std::unordered_set providers_except_cpu_cuda = { kDmlExecutionProvider, kMIGraphXExecutionProvider, kAclExecutionProvider, - kArmNNExecutionProvider, - kRocmExecutionProvider}; + kArmNNExecutionProvider}; void TestConvOp(const ConvOpAndTestAttributes& attributes, @@ -58,7 +57,7 @@ void TestConvOp(const ConvOpAndTestAttributes& attributes, const vector>& input_shapes, const std::initializer_list& expected_output, const vector& expected_output_shape, - const std::unordered_set& excluded_provider_types = providers_except_cpu_cuda, + const std::unordered_set& excluded_provider_types = providers_except_cpu_gpu, bool weight_is_initializer = false, OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess, const std::string& err_str = "") { @@ -162,9 +161,9 @@ TEST(FusedConvTest, Conv2D_Bias_Relu) { TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); } -#if defined(USE_CUDA) +#if defined(USE_CUDA) || defined(USE_ROCM) -static std::unordered_set providers_except_cuda = { +static std::unordered_set providers_except_gpu = { kCpuExecutionProvider, kDnnlExecutionProvider, kOpenVINOExecutionProvider, @@ -177,8 +176,7 @@ static std::unordered_set providers_except_cuda = { kDmlExecutionProvider, kMIGraphXExecutionProvider, kAclExecutionProvider, - kArmNNExecutionProvider, - kRocmExecutionProvider}; + kArmNNExecutionProvider}; TEST(FusedConvTest, Conv2D_Bias_Z_Relu) { ConvOpAndTestAttributes attrs = { @@ -201,7 +199,7 @@ TEST(FusedConvTest, Conv2D_Bias_Z_Relu) { vector Z = {-1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f}; vector Z_shape = {1, 2, 2, 2}; auto expected_vals = {12.0f, 17.0f, 25.0f, 29.0f, 11.0f, 15.0f, 23.0f, 28.0f}; - TestConvOp(attrs, {X, W, B, Z}, {X_shape, W_shape, B_shape, Z_shape}, expected_vals, Y_shape, providers_except_cuda); + TestConvOp(attrs, {X, W, B, Z}, {X_shape, W_shape, B_shape, Z_shape}, expected_vals, Y_shape, providers_except_gpu); } #endif