[ROCm] Re-enable ReduceL1, L2 and related tests (#12209)

Re-enable ReduceL1,L2 and related tests
This commit is contained in:
mindest 2022-07-20 13:13:02 +08:00 committed by GitHub
parent 9b6ef17c5f
commit add631410a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 10 additions and 302 deletions

View file

@ -364,7 +364,7 @@ template Status ReduceKernel<true>::ReduceKernelShared<MLFloat16, MLFloat16, MIO
// `input_shape_override` (if provided) is the input shape for compute purposes
Status PrepareForReduce(const Tensor* X,
bool keepdims,
const gsl::span<const int64_t>& axes,
gsl::span<const int64_t> axes,
PrepareReduceMetadata& prepare_reduce_metadata,
const TensorShape* input_shape_override) {
ORT_ENFORCE(nullptr != X);
@ -434,7 +434,7 @@ Status PrepareForReduce(const Tensor* X,
template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices>
Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata,
/*out*/ Tensor& output, miopenReduceTensorOp_t miopen_reduce_op,
const gsl::span<const int64_t>& axes,
gsl::span<const int64_t> axes,
bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction,
const TensorShape* input_shape_override) {
typedef typename ToHipType<T>::MappedType HipT;
@ -663,7 +663,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr
&one, input_tensor, temp_X.get(),
&zero, output_tensor, temp_output.get()));
Impl_Cast<float, HipT>(stream, temp_output.get(), reinterpret_cast<HipT*>(output.template MutableData<T>()), output_count);
Impl_Cast<float, HipT>(stream, temp_output.get(), reinterpret_cast<HipT*>(output.template MutableData<T>()), output_count);
} else {
MIOPEN_RETURN_IF_ERROR(miopenReduceTensor(
rocm_ep.PerThreadMiopenHandle(), reduce_desc, indices_rocm.get(), indices_bytes,

View file

@ -1,275 +0,0 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/optional.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/reduction/reduction_ops.h"
#include "core/providers/rocm/reduction/reduction_functions.h"
namespace onnxruntime {
namespace rocm {
namespace ReductionOps {
// Implementation that holds the core logic of reduction op processing
// `input_shape_override` is the input shape for compute purposes (if provided)
template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices = MIOPEN_REDUCE_TENSOR_NO_INDICES>
std::unique_ptr<Tensor> ReduceCompute(ROCMExecutionProvider& rocm_ep, miopenReduceTensorOp_t miopen_reduce_op, AllocatorPtr allocator,
const Tensor& input, gsl::span<const int64_t> axes,
bool keep_dims, bool calculate_log, bool calculate_sqt, bool log_sum_exp,
bool fast_reduction, const TensorShape* input_shape_override = nullptr);
} // namespace ReductionOps
// Holds some metadata that will be used during actual reduction op compute time
struct PrepareReduceMetadata {
int64_t input_count;
int64_t output_count;
// This holds the output dims without any reduced dims squeezed (even if keep_dims == 1)
TensorShapeVector output_dims;
// This holds the output dims with with reduced dims squeezed (if keep_dims == 1)
TensorShapeVector squeezed_output_dims;
TensorShapeVector input_dims_miopen;
TensorShapeVector output_dims_miopen;
};
template <bool allow_multi_axes>
class ReduceKernel : public RocmKernel, public ReduceKernelBase<allow_multi_axes> {
protected:
ReduceKernel(
const OpKernelInfo& info,
optional<int64_t> keep_dims_override = {})
: RocmKernel(info),
ReduceKernelBase<allow_multi_axes>(info, keep_dims_override),
calculate_log_(false),
calculate_sqt_(false),
log_sum_exp_(false),
fast_reduction_(false) {
// We need to cast away the const as PerThreadMiopenHandle() is currently a non-const method
// TODO: Clean up the ROCMExecutionProvider interface to avoid this
rocm_ep_ = const_cast<ROCMExecutionProvider*>(static_cast<const ROCMExecutionProvider*>(info.GetExecutionProvider()));
}
// Only Max Min need to set ReduceTensorIndices MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES as per miopen library manual
// Only Max Min will have indices output, need to set the indices to nullptr for other ops
template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices = MIOPEN_REDUCE_TENSOR_NO_INDICES>
Status ComputeImpl(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const;
// Used by ReduceSumTraining which will have axes as input
template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices = MIOPEN_REDUCE_TENSOR_NO_INDICES>
Status ComputeImplEx(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const;
template <typename T, typename OutT, miopenReduceTensorIndices_t ReduceTensorIndices>
Status ReduceKernelShared(
const T* X,
const TensorShape& input_shape,
OutT* Y,
const TensorShape& output_shape,
miopenReduceTensorOp_t miopen_reduce_op,
TensorShapeVector& output_dims) const;
using ReduceKernelBase<allow_multi_axes>::axes_;
using ReduceKernelBase<allow_multi_axes>::keepdims_;
using ReduceKernelBase<allow_multi_axes>::noop_with_empty_axes_;
bool calculate_log_;
bool calculate_sqt_;
bool log_sum_exp_;
// Indicates if this reduction can be delegated to our highly-optimized reduction kernels.
// Those efficient kernels are defined/implemented in reduction_functions.h/.cu.
bool fast_reduction_;
// We need to access to the ROCM EP instance to get the miopen handle
ROCMExecutionProvider* rocm_ep_;
};
template <typename T>
class ArgMax final : public ReduceKernel<false> {
public:
ArgMax(const OpKernelInfo& info) : ReduceKernel<false>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T, MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES>(ctx, MIOPEN_REDUCE_TENSOR_MAX);
}
};
template <typename T>
class ArgMin final : public ReduceKernel<false> {
public:
ArgMin(const OpKernelInfo& info) : ReduceKernel<false>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T, MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES>(ctx, MIOPEN_REDUCE_TENSOR_MIN);
}
};
template <typename T>
class ReduceL1 final : public ReduceKernel<true> {
public:
ReduceL1(const OpKernelInfo& info) : ReduceKernel<true>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
//return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_NORM1);
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "MIOpen does not yet support reduce norm1.");
}
};
template <typename T>
class ReduceL2 final : public ReduceKernel<true> {
public:
ReduceL2(const OpKernelInfo& info) : ReduceKernel<true>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
//return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_NORM2);
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "MIOpen does not yet support reduce norm2.");
}
};
template <typename T>
class ReduceMax final : public ReduceKernel<true> {
public:
ReduceMax(const OpKernelInfo& info) : ReduceKernel<true>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_MAX);
}
};
template <typename T>
class ReduceMean final : public ReduceKernel<true> {
public:
ReduceMean(const OpKernelInfo& info) : ReduceKernel<true>(info) {
fast_reduction_ = true;
}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_AVG);
}
};
template <typename T>
class ReduceMin final : public ReduceKernel<true> {
public:
ReduceMin(const OpKernelInfo& info) : ReduceKernel<true>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_MIN);
}
};
template <typename T>
class ReduceProd final : public ReduceKernel<true> {
public:
ReduceProd(const OpKernelInfo& info) : ReduceKernel<true>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_MUL);
}
};
template <typename T>
class ReduceSum final : public ReduceKernel<true> {
public:
ReduceSum(const OpKernelInfo& info) : ReduceKernel<true>(info) {
fast_reduction_ = true;
}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_ADD);
}
};
template <typename T>
class ReduceLogSum final : public ReduceKernel<true> {
public:
ReduceLogSum(const OpKernelInfo& info) : ReduceKernel<true>(info) {
ReduceKernel<true>::calculate_log_ = true;
fast_reduction_ = true;
}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_ADD);
}
};
template <typename T>
class ReduceSumSquare final : public ReduceKernel<true> {
public:
ReduceSumSquare(const OpKernelInfo& info) : ReduceKernel<true>(info) {
ReduceKernel<true>::calculate_sqt_ = true;
fast_reduction_ = true;
}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_ADD);
}
};
template <typename T>
class ReduceLogSumExp final : public ReduceKernel<true> {
public:
ReduceLogSumExp(const OpKernelInfo& info) : ReduceKernel<true>(info) {
ReduceKernel<true>::log_sum_exp_ = true;
}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_ADD);
}
};
Status PrepareForReduce(const Tensor* X,
bool keepdims,
const gsl::span<const int64_t>& axes,
PrepareReduceMetadata& prepare_reduce_metadata,
const TensorShape* input_shape_override = nullptr);
template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices>
Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata,
/*out*/ Tensor& output, miopenReduceTensorOp_t miopen_reduce_op,
const gsl::span<const int64_t>& axes,
bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction,
const TensorShape* input_shape_override = nullptr);
// ROCM's reduction descriptor miopenReduceTensorDescriptor_t is a pointer so
// it's safer to wrap it with automatically memory deleter as MiopenReduceDescriptor.
// An implicit caster from MiopenReduceDescriptor to miopenReduceTensorDescriptor_t
// is implemented below, so ROCM can seamlessly work.
class MiopenReduceDescriptor final {
public:
MiopenReduceDescriptor() : desc_(nullptr) {
}
~MiopenReduceDescriptor() {
if (desc_ != nullptr) {
miopenDestroyReduceTensorDescriptor(desc_);
desc_ = nullptr;
}
}
MiopenReduceDescriptor(const MiopenReduceDescriptor&) = delete;
MiopenReduceDescriptor& operator=(const MiopenReduceDescriptor&) = delete;
Status Set(miopenReduceTensorOp_t op, miopenDataType_t type, miopenReduceTensorIndices_t indices) {
if (!desc_)
MIOPEN_RETURN_IF_ERROR(miopenCreateReduceTensorDescriptor(&desc_));
MIOPEN_RETURN_IF_ERROR(miopenSetReduceTensorDescriptor(
desc_,
op,
type,
MIOPEN_PROPAGATE_NAN,
indices,
MIOPEN_32BIT_INDICES)); // currently only the 32-bit (unsigned int) type is supported.
return Status::OK();
}
operator miopenReduceTensorDescriptor_t() const { return desc_; }
private:
miopenReduceTensorDescriptor_t desc_;
};
} // namespace rocm
} // namespace onnxruntime

View file

@ -128,7 +128,6 @@ provider_excluded_files = [
"nn/pool.cc",
"nn/pool.h",
"reduction/reduction_ops.cc",
"reduction/reduction_ops.h",
"rnn/cudnn_rnn_base.cc",
"rnn/cudnn_rnn_base.h",
"rnn/gru.cc",

View file

@ -1,46 +1,30 @@
CudaKernelTest.NegativeLogLikelihoodLoss_TinySizeTensor
CudaKernelTest.NegativeLogLikelihoodLoss_SmallSizeTensor
CudaKernelTest.NegativeLogLikelihoodLoss_MediumSizeTensor
ReductionOpTest.ReductionVariationTest
ReductionOpTest.ReduceL1_default_axes_keepdims
ReductionOpTest.ReduceL1_do_not_keep_dims
ReductionOpTest.ReduceL1_do_not_keep_dims_2
ReductionOpTest.ReduceL1_keepdims
ReductionOpTest.ReduceL1
ReductionOpTest.ReduceL1_int32
ReductionOpTest.ReduceL10DTensor
ReductionOpTest.ReduceL2_default_axes_keepdims
ReductionOpTest.ReduceL2_default_axes_do_not_keep_dims
ReductionOpTest.ReduceL2_do_not_keepdims
ReductionOpTest.ReduceL2_do_not_keepdims_2
ReductionOpTest.ReduceL2_keepdims
ReductionOpTest.ReduceL2
ReductionOpTest.ReduceL2_int32
ReductionOpTest.ReduceL20DTensor
ReductionOpTest.ReductionVariationTest
ReductionOpTest.ReduceLogSumExp_default_axes_keepdims_double
ReductionOpTest.ReduceLogSumExp_default_axes_do_not_keep_dims_double
ReductionOpTest.ReduceLogSumExp_do_not_keepdims_double
ReductionOpTest.ReduceLogSumExp_do_not_keepdims_2_double
ReductionOpTest.ReduceLogSumExp_keepdims_double
ReductionOpTest.ReduceLogSumExp_double
ReductionOpTest.ReduceMax_double
ReductionOpTest.ReduceMax_double
ReductionOpTest.ReduceMean_default_axes_keepdims_double
ReductionOpTest.ReduceMean_default_axes_do_not_keep_dims_double
ReductionOpTest.ReduceMean_do_not_keepdims_double
ReductionOpTest.ReduceMean_do_not_keepdims_2_double
ReductionOpTest.ReduceMean_keepdims_double
ReductionOpTest.ReduceMean_double
ReductionOpTest.ReduceMean0DTensor_double
ReductionOpTest.ReduceMin_double
ReductionOpTest.ReduceSum_double
ReductionOpTest.ReduceSumSquare_double
ReductionOpTest.ReduceMean0DTensor_double
ReductionOpTest.ReduceMin_double
ReductionOpTest.ReduceSum_double
ReductionOpTest.ReduceSumSquare_double
ReductionOpTest.ReduceInfMax_double
ReductionOpTest.ReduceInfMin_double
ReductionOpTest.ReduceInfLogSumExp_double
GatherOpTest.Gather_invalid_index_cpu
GatherOpTest.Gather_invalid_index_cpu
Scatter.InvalidIndex
GradientCheckerTest.AddGrad
GradientCheckerTest.SubGrad
GradientCheckerTest.MulGrad
GradientCheckerTest.ReduceL2Grad
GradientCheckerTest.DivGrad