From add631410a4b671d622a8bf5872e7da41aee7905 Mon Sep 17 00:00:00 2001 From: mindest <30493312+mindest@users.noreply.github.com> Date: Wed, 20 Jul 2022 13:13:02 +0800 Subject: [PATCH] [ROCm] Re-enable ReduceL1, L2 and related tests (#12209) Re-enable ReduceL1,L2 and related tests --- .../providers/rocm/reduction/reduction_ops.cc | 6 +- .../providers/rocm/reduction/reduction_ops.h | 275 ------------------ tools/ci_build/amd_hipify.py | 1 - .../github/pai/pai-excluded-tests.txt | 30 +- 4 files changed, 10 insertions(+), 302 deletions(-) delete mode 100644 onnxruntime/core/providers/rocm/reduction/reduction_ops.h diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc index f865265f32..9591b8b83f 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc @@ -364,7 +364,7 @@ template Status ReduceKernel::ReduceKernelShared& axes, + gsl::span axes, PrepareReduceMetadata& prepare_reduce_metadata, const TensorShape* input_shape_override) { ORT_ENFORCE(nullptr != X); @@ -434,7 +434,7 @@ Status PrepareForReduce(const Tensor* X, template Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata, /*out*/ Tensor& output, miopenReduceTensorOp_t miopen_reduce_op, - const gsl::span& axes, + gsl::span axes, bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, const TensorShape* input_shape_override) { typedef typename ToHipType::MappedType HipT; @@ -663,7 +663,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr &one, input_tensor, temp_X.get(), &zero, output_tensor, temp_output.get())); - Impl_Cast(stream, temp_output.get(), reinterpret_cast(output.template MutableData()), output_count); + Impl_Cast(stream, temp_output.get(), reinterpret_cast(output.template MutableData()), output_count); } else { MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( rocm_ep.PerThreadMiopenHandle(), reduce_desc, indices_rocm.get(), indices_bytes, diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.h b/onnxruntime/core/providers/rocm/reduction/reduction_ops.h deleted file mode 100644 index 57169ce747..0000000000 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.h +++ /dev/null @@ -1,275 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include "core/common/optional.h" -#include "core/providers/rocm/rocm_kernel.h" -#include "core/providers/cpu/reduction/reduction_ops.h" -#include "core/providers/rocm/reduction/reduction_functions.h" - -namespace onnxruntime { -namespace rocm { - -namespace ReductionOps { - -// Implementation that holds the core logic of reduction op processing -// `input_shape_override` is the input shape for compute purposes (if provided) - -template -std::unique_ptr ReduceCompute(ROCMExecutionProvider& rocm_ep, miopenReduceTensorOp_t miopen_reduce_op, AllocatorPtr allocator, - const Tensor& input, gsl::span axes, - bool keep_dims, bool calculate_log, bool calculate_sqt, bool log_sum_exp, - bool fast_reduction, const TensorShape* input_shape_override = nullptr); - -} // namespace ReductionOps - -// Holds some metadata that will be used during actual reduction op compute time -struct PrepareReduceMetadata { - int64_t input_count; - int64_t output_count; - // This holds the output dims without any reduced dims squeezed (even if keep_dims == 1) - TensorShapeVector output_dims; - // This holds the output dims with with reduced dims squeezed (if keep_dims == 1) - TensorShapeVector squeezed_output_dims; - TensorShapeVector input_dims_miopen; - TensorShapeVector output_dims_miopen; -}; - -template -class ReduceKernel : public RocmKernel, public ReduceKernelBase { - protected: - ReduceKernel( - const OpKernelInfo& info, - optional keep_dims_override = {}) - : RocmKernel(info), - ReduceKernelBase(info, keep_dims_override), - calculate_log_(false), - calculate_sqt_(false), - log_sum_exp_(false), - fast_reduction_(false) { - // We need to cast away the const as PerThreadMiopenHandle() is currently a non-const method - // TODO: Clean up the ROCMExecutionProvider interface to avoid this - rocm_ep_ = const_cast(static_cast(info.GetExecutionProvider())); - } - - // Only Max Min need to set ReduceTensorIndices MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES as per miopen library manual - // Only Max Min will have indices output, need to set the indices to nullptr for other ops - template - Status ComputeImpl(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const; - - // Used by ReduceSumTraining which will have axes as input - template - Status ComputeImplEx(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const; - - template - Status ReduceKernelShared( - const T* X, - const TensorShape& input_shape, - OutT* Y, - const TensorShape& output_shape, - miopenReduceTensorOp_t miopen_reduce_op, - TensorShapeVector& output_dims) const; - - using ReduceKernelBase::axes_; - using ReduceKernelBase::keepdims_; - using ReduceKernelBase::noop_with_empty_axes_; - - bool calculate_log_; - bool calculate_sqt_; - bool log_sum_exp_; - // Indicates if this reduction can be delegated to our highly-optimized reduction kernels. - // Those efficient kernels are defined/implemented in reduction_functions.h/.cu. - bool fast_reduction_; - - // We need to access to the ROCM EP instance to get the miopen handle - ROCMExecutionProvider* rocm_ep_; -}; - -template -class ArgMax final : public ReduceKernel { - public: - ArgMax(const OpKernelInfo& info) : ReduceKernel(info) {} - - Status ComputeInternal(OpKernelContext* ctx) const override { - return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_MAX); - } -}; - -template -class ArgMin final : public ReduceKernel { - public: - ArgMin(const OpKernelInfo& info) : ReduceKernel(info) {} - - Status ComputeInternal(OpKernelContext* ctx) const override { - return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_MIN); - } -}; - -template -class ReduceL1 final : public ReduceKernel { - public: - ReduceL1(const OpKernelInfo& info) : ReduceKernel(info) {} - - Status ComputeInternal(OpKernelContext* ctx) const override { - //return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_NORM1); - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "MIOpen does not yet support reduce norm1."); - } -}; - -template -class ReduceL2 final : public ReduceKernel { - public: - ReduceL2(const OpKernelInfo& info) : ReduceKernel(info) {} - - Status ComputeInternal(OpKernelContext* ctx) const override { - //return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_NORM2); - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "MIOpen does not yet support reduce norm2."); - } -}; - -template -class ReduceMax final : public ReduceKernel { - public: - ReduceMax(const OpKernelInfo& info) : ReduceKernel(info) {} - - Status ComputeInternal(OpKernelContext* ctx) const override { - return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_MAX); - } -}; - -template -class ReduceMean final : public ReduceKernel { - public: - ReduceMean(const OpKernelInfo& info) : ReduceKernel(info) { - fast_reduction_ = true; - } - - Status ComputeInternal(OpKernelContext* ctx) const override { - return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_AVG); - } -}; - -template -class ReduceMin final : public ReduceKernel { - public: - ReduceMin(const OpKernelInfo& info) : ReduceKernel(info) {} - - Status ComputeInternal(OpKernelContext* ctx) const override { - return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_MIN); - } -}; - -template -class ReduceProd final : public ReduceKernel { - public: - ReduceProd(const OpKernelInfo& info) : ReduceKernel(info) {} - - Status ComputeInternal(OpKernelContext* ctx) const override { - return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_MUL); - } -}; - -template -class ReduceSum final : public ReduceKernel { - public: - ReduceSum(const OpKernelInfo& info) : ReduceKernel(info) { - fast_reduction_ = true; - } - - Status ComputeInternal(OpKernelContext* ctx) const override { - return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_ADD); - } -}; - -template -class ReduceLogSum final : public ReduceKernel { - public: - ReduceLogSum(const OpKernelInfo& info) : ReduceKernel(info) { - ReduceKernel::calculate_log_ = true; - fast_reduction_ = true; - } - - Status ComputeInternal(OpKernelContext* ctx) const override { - return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_ADD); - } -}; - -template -class ReduceSumSquare final : public ReduceKernel { - public: - ReduceSumSquare(const OpKernelInfo& info) : ReduceKernel(info) { - ReduceKernel::calculate_sqt_ = true; - fast_reduction_ = true; - } - - Status ComputeInternal(OpKernelContext* ctx) const override { - return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_ADD); - } -}; - -template -class ReduceLogSumExp final : public ReduceKernel { - public: - ReduceLogSumExp(const OpKernelInfo& info) : ReduceKernel(info) { - ReduceKernel::log_sum_exp_ = true; - } - - Status ComputeInternal(OpKernelContext* ctx) const override { - return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_ADD); - } -}; - -Status PrepareForReduce(const Tensor* X, - bool keepdims, - const gsl::span& axes, - PrepareReduceMetadata& prepare_reduce_metadata, - const TensorShape* input_shape_override = nullptr); - -template -Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata, - /*out*/ Tensor& output, miopenReduceTensorOp_t miopen_reduce_op, - const gsl::span& axes, - bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, - const TensorShape* input_shape_override = nullptr); - -// ROCM's reduction descriptor miopenReduceTensorDescriptor_t is a pointer so -// it's safer to wrap it with automatically memory deleter as MiopenReduceDescriptor. -// An implicit caster from MiopenReduceDescriptor to miopenReduceTensorDescriptor_t -// is implemented below, so ROCM can seamlessly work. -class MiopenReduceDescriptor final { - public: - MiopenReduceDescriptor() : desc_(nullptr) { - } - - ~MiopenReduceDescriptor() { - if (desc_ != nullptr) { - miopenDestroyReduceTensorDescriptor(desc_); - desc_ = nullptr; - } - } - - MiopenReduceDescriptor(const MiopenReduceDescriptor&) = delete; - MiopenReduceDescriptor& operator=(const MiopenReduceDescriptor&) = delete; - - Status Set(miopenReduceTensorOp_t op, miopenDataType_t type, miopenReduceTensorIndices_t indices) { - if (!desc_) - MIOPEN_RETURN_IF_ERROR(miopenCreateReduceTensorDescriptor(&desc_)); - - MIOPEN_RETURN_IF_ERROR(miopenSetReduceTensorDescriptor( - desc_, - op, - type, - MIOPEN_PROPAGATE_NAN, - indices, - MIOPEN_32BIT_INDICES)); // currently only the 32-bit (unsigned int) type is supported. - return Status::OK(); - } - - operator miopenReduceTensorDescriptor_t() const { return desc_; } - - private: - miopenReduceTensorDescriptor_t desc_; -}; - -} // namespace rocm -} // namespace onnxruntime diff --git a/tools/ci_build/amd_hipify.py b/tools/ci_build/amd_hipify.py index d55ec6eb61..4bf2cdbc1f 100644 --- a/tools/ci_build/amd_hipify.py +++ b/tools/ci_build/amd_hipify.py @@ -128,7 +128,6 @@ provider_excluded_files = [ "nn/pool.cc", "nn/pool.h", "reduction/reduction_ops.cc", - "reduction/reduction_ops.h", "rnn/cudnn_rnn_base.cc", "rnn/cudnn_rnn_base.h", "rnn/gru.cc", diff --git a/tools/ci_build/github/pai/pai-excluded-tests.txt b/tools/ci_build/github/pai/pai-excluded-tests.txt index 294674c2e5..06936399a0 100644 --- a/tools/ci_build/github/pai/pai-excluded-tests.txt +++ b/tools/ci_build/github/pai/pai-excluded-tests.txt @@ -1,46 +1,30 @@ CudaKernelTest.NegativeLogLikelihoodLoss_TinySizeTensor CudaKernelTest.NegativeLogLikelihoodLoss_SmallSizeTensor CudaKernelTest.NegativeLogLikelihoodLoss_MediumSizeTensor -ReductionOpTest.ReductionVariationTest -ReductionOpTest.ReduceL1_default_axes_keepdims -ReductionOpTest.ReduceL1_do_not_keep_dims -ReductionOpTest.ReduceL1_do_not_keep_dims_2 -ReductionOpTest.ReduceL1_keepdims -ReductionOpTest.ReduceL1 -ReductionOpTest.ReduceL1_int32 -ReductionOpTest.ReduceL10DTensor -ReductionOpTest.ReduceL2_default_axes_keepdims -ReductionOpTest.ReduceL2_default_axes_do_not_keep_dims -ReductionOpTest.ReduceL2_do_not_keepdims -ReductionOpTest.ReduceL2_do_not_keepdims_2 -ReductionOpTest.ReduceL2_keepdims -ReductionOpTest.ReduceL2 -ReductionOpTest.ReduceL2_int32 -ReductionOpTest.ReduceL20DTensor +ReductionOpTest.ReductionVariationTest ReductionOpTest.ReduceLogSumExp_default_axes_keepdims_double ReductionOpTest.ReduceLogSumExp_default_axes_do_not_keep_dims_double ReductionOpTest.ReduceLogSumExp_do_not_keepdims_double ReductionOpTest.ReduceLogSumExp_do_not_keepdims_2_double ReductionOpTest.ReduceLogSumExp_keepdims_double ReductionOpTest.ReduceLogSumExp_double -ReductionOpTest.ReduceMax_double +ReductionOpTest.ReduceMax_double ReductionOpTest.ReduceMean_default_axes_keepdims_double ReductionOpTest.ReduceMean_default_axes_do_not_keep_dims_double ReductionOpTest.ReduceMean_do_not_keepdims_double ReductionOpTest.ReduceMean_do_not_keepdims_2_double ReductionOpTest.ReduceMean_keepdims_double ReductionOpTest.ReduceMean_double -ReductionOpTest.ReduceMean0DTensor_double -ReductionOpTest.ReduceMin_double -ReductionOpTest.ReduceSum_double -ReductionOpTest.ReduceSumSquare_double +ReductionOpTest.ReduceMean0DTensor_double +ReductionOpTest.ReduceMin_double +ReductionOpTest.ReduceSum_double +ReductionOpTest.ReduceSumSquare_double ReductionOpTest.ReduceInfMax_double ReductionOpTest.ReduceInfMin_double ReductionOpTest.ReduceInfLogSumExp_double -GatherOpTest.Gather_invalid_index_cpu +GatherOpTest.Gather_invalid_index_cpu Scatter.InvalidIndex GradientCheckerTest.AddGrad GradientCheckerTest.SubGrad GradientCheckerTest.MulGrad -GradientCheckerTest.ReduceL2Grad GradientCheckerTest.DivGrad