Merge remote-tracking branch 'upstream/master' into DmlDev

This commit is contained in:
ISS Build Account 2020-07-24 12:21:34 +00:00
commit 15e1a1e1ac
9 changed files with 408 additions and 204 deletions

View file

@ -3,9 +3,6 @@
#include "core/providers/cpu/reduction/reduction_ops.h"
#include "core/providers/common.h"
#include "core/util/math_cpuonly.h"
#include "core/providers/cpu/containers.h"
#include "core/platform/threadpool.h"
using namespace std;
namespace onnxruntime {
@ -151,24 +148,6 @@ REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ArgMin, 1, 10);
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ArgMin, 11, 11);
REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMin, 12);
namespace contrib {
#define REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
ReduceSumTraining, \
kMSDomain, \
1, \
T, \
kCpuExecutionProvider, \
KernelDefBuilder() \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
ReduceSumTraining<T>);
REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(float)
REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(double)
REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int32_t)
REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int64_t)
} // namespace contrib
// When all reduce axes are located at the tail of the dims, quite general cases, transpose and extra
// copy could be skipped to improve performance. If required by check_no_transpose = true, then
// the calling code will check if the data was transposed and act accordingly.
@ -184,8 +163,8 @@ bool PrepareForReduce(const Tensor* input_tensor_ptr,
const std::vector<int64_t>& axes_,
bool keepdims_,
/*out*/ std::vector<int64_t>& reduced_dims,
bool check_no_transpose = false,
const TensorShape* input_shape_override = nullptr) {
bool check_no_transpose,
const TensorShape* input_shape_override) {
ORT_ENFORCE(input_tensor_ptr != nullptr, "Input to be reduced is null");
if (input_shape_override) {
@ -614,12 +593,11 @@ Status ReduceProd<T>::Compute(OpKernelContext* ctx) const {
}
template <typename T>
static void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose,
int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data,
concurrency::ThreadPool* tp) {
void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose,
int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data,
concurrency::ThreadPool* tp) {
if (no_transpose) {
auto lambda = [input_data, blocks, output_data](ptrdiff_t i) {
// The ConstEigenMatrixMap type is expanded to work around a MS compiler issue
output_data[i] = Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(input_data + (i * blocks), blocks).sum();
};
@ -668,43 +646,6 @@ Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {
return Status::OK();
}
template <typename T>
Status ReduceSumTraining<T>::Compute(OpKernelContext* ctx) const {
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
std::vector<int64_t> reduced_dims;
const Tensor* input = ctx->Input<Tensor>(0);
//override the attribute value with the input value for reduction_axes
const Tensor* axes_tensor = ctx->Input<Tensor>(1);
ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1,
"An axes tensor must be a vector tensor.");
auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
const auto* data = axes_tensor->template Data<int64_t>();
std::vector<int64_t> axes(data, data + nDims);
if (axes.size() > 0) {
ORT_ENFORCE(noop_with_empty_axes_ == false, "Noop when axes is not empty is not allowed.");
}
// empty axes and no-op
if (axes.empty() && noop_with_empty_axes_) {
auto* output = ctx->Output(0, input->Shape());
memcpy(output->template MutableData<T>(), input->template Data<T>(), input->SizeInBytes() * sizeof(T));
return Status::OK();
}
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes, keepdims_, reduced_dims, true);
auto* output = ctx->Output(0, reduced_dims);
ReduceSumCore(input->template Data<T>(), output->template MutableData<T>(),
no_transpose, blocks, block_size, transposed_input_data, ctx->GetOperatorThreadPool());
return Status::OK();
}
template <typename T>
Status ReduceSumSquare<T>::Compute(OpKernelContext* ctx) const {
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
@ -870,4 +811,14 @@ template class ReduceSum<int32_t>;
template class ReduceSum<double>;
template class ReduceSum<int64_t>;
#define REGISTER_REDUCESUMCORE_TYPED(T) \
template void ReduceSumCore<T>(const T* input_data, T* output_data, bool no_transpose, \
int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data, \
concurrency::ThreadPool* tp);
REGISTER_REDUCESUMCORE_TYPED(float)
REGISTER_REDUCESUMCORE_TYPED(double)
REGISTER_REDUCESUMCORE_TYPED(int32_t)
REGISTER_REDUCESUMCORE_TYPED(int64_t)
} // namespace onnxruntime

View file

@ -7,9 +7,28 @@
#include "core/common/common.h"
#include "core/common/optional.h"
#include "core/framework/op_kernel.h"
#include "core/providers/cpu/containers.h"
#include "core/util/math_cpuonly.h"
#include "core/platform/threadpool.h"
namespace onnxruntime {
template <typename T>
bool PrepareForReduce(const Tensor* input_tensor_ptr,
FastAllocVector<T>& transposed_input_data,
int64_t& block_size,
int64_t& blocks,
const std::vector<int64_t>& axes_,
bool keepdims_,
/*out*/ std::vector<int64_t>& reduced_dims,
bool check_no_transpose = false,
const TensorShape* input_shape_override = nullptr);
template <typename T>
void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose,
int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data,
concurrency::ThreadPool* tp);
template <bool allow_multi_axes>
class ReduceKernelBase {
protected:
@ -132,21 +151,6 @@ class ReduceSum final : public ReduceKernel<true> {
const TensorShape* input_shape_override = nullptr);
};
template <typename T>
class ReduceSumTraining final : public ReduceKernel<true> {
public:
ReduceSumTraining(const OpKernelInfo& info) : ReduceKernel<true>(info) {
}
Status Compute(OpKernelContext* context) const override;
// For external calls requiring ReduceSumTraining implementation - will return the reduced output.
//`input_shape_override` overrides the shape of `input` for compute purposes.
static Tensor Impl(const Tensor& input, const std::vector<int64_t>& reduce_axes,
AllocatorPtr allocator, concurrency::ThreadPool* tp, bool keep_dims,
const TensorShape* input_shape_override = nullptr);
};
template <typename T>
class ReduceSumSquare final : public ReduceKernel<true> {
public:

View file

@ -60,56 +60,6 @@ namespace cuda {
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
name<T>);
#define REGISTER_MS_KERNEL_TYPED(name, T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
name, \
kMSDomain, \
1, \
T, \
kCudaExecutionProvider, \
KernelDefBuilder() \
.InputMemoryType<OrtMemTypeCPUInput>(1) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
name<T>);
// CUDA's reduction descriptor cudnnReduceTensorDescriptor_t is a pointer so
// it's safer to wrap it with automatically memory deleter as CudnnReduceDescriptor.
// An implicit caster from CudnnReduceDescriptor to cudnnReduceTensorDescriptor_t
// is implemented below, so CUDA can seamlessly work.
class CudnnReduceDescriptor final {
public:
CudnnReduceDescriptor() : desc_(nullptr) {
}
~CudnnReduceDescriptor() {
if (desc_ != nullptr) {
cudnnDestroyReduceTensorDescriptor(desc_);
desc_ = nullptr;
}
}
CudnnReduceDescriptor(const CudnnReduceDescriptor&) = delete;
CudnnReduceDescriptor& operator=(const CudnnReduceDescriptor&) = delete;
Status Set(cudnnReduceTensorOp_t op, cudnnDataType_t type, cudnnReduceTensorIndices_t indices) {
if (!desc_)
CUDNN_RETURN_IF_ERROR(cudnnCreateReduceTensorDescriptor(&desc_));
CUDNN_RETURN_IF_ERROR(cudnnSetReduceTensorDescriptor(
desc_,
op,
type,
CUDNN_PROPAGATE_NAN,
indices,
CUDNN_32BIT_INDICES)); // currently only the 32-bit (unsigned int) type is supported.
return Status::OK();
}
operator cudnnReduceTensorDescriptor_t() const { return desc_; }
private:
cudnnReduceTensorDescriptor_t desc_;
};
// TODO ReduceKernel::ReduceKernelShared() is still used by some other training classes though it's not used here - this should be refactored.
template <bool allow_multi_axes>
@ -314,11 +264,11 @@ template Status ReduceKernel<true>::ReduceKernelShared<MLFloat16, MLFloat16, CUD
std::vector<int64_t>& output_dims) const;
// `input_shape_override` (if provided) is the input shape for compute purposes
static Status PrepareForReduce(const Tensor* X,
bool keepdims,
const std::vector<int64_t>& axes,
PrepareReduceMetadata& prepare_reduce_metadata,
const TensorShape* input_shape_override = nullptr) {
Status PrepareForReduce(const Tensor* X,
bool keepdims,
const std::vector<int64_t>& axes,
PrepareReduceMetadata& prepare_reduce_metadata,
const TensorShape* input_shape_override) {
ORT_ENFORCE(nullptr != X);
const TensorShape& input_shape = input_shape_override ? *input_shape_override : X->Shape();
@ -391,11 +341,11 @@ static Status PrepareForReduce(const Tensor* X,
// `input_shape_override` is the input shape for compute purposes (if provided)
template <typename T, cudnnReduceTensorIndices_t ReduceTensorIndices>
static Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata,
/*out*/ Tensor& output, cudnnReduceTensorOp_t cudnn_reduce_op,
const std::vector<int64_t>& axes,
bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction,
const TensorShape* input_shape_override = nullptr) {
Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata,
/*out*/ Tensor& output, cudnnReduceTensorOp_t cudnn_reduce_op,
const std::vector<int64_t>& axes,
bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction,
const TensorShape* input_shape_override) {
typedef typename ToCudaType<T>::MappedType CudaT;
const TensorShape& input_shape = input_shape_override ? *input_shape_override : input.Shape();
@ -610,33 +560,10 @@ template <typename T, cudnnReduceTensorIndices_t ReduceTensorIndices>
Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, cudnnReduceTensorOp_t cudnn_reduce_op) const {
const Tensor* X = ctx->Input<Tensor>(0);
const std::string& op_name = this->KernelDef().OpName();
std::vector<int64_t> axes_values = axes_;
if (op_name == "ReduceSumTraining") {
//override the attribute value with the input value for reduction_axes
const Tensor* axes_tensor = ctx->Input<Tensor>(1);
ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor.");
auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
const auto* data = axes_tensor->template Data<int64_t>();
std::vector<int64_t> axes(data, data + nDims);
axes_values = axes;
if (axes.size() > 0) {
ORT_ENFORCE(noop_with_empty_axes_ == false, "Noop when axes is not empty is not allowed.");
}
// empty axes and no-op
if (axes.empty() && noop_with_empty_axes_) {
auto* Y = ctx->Output(0, X->Shape());
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData<T>(), X->template Data<T>(), X->SizeInBytes() * sizeof(T), cudaMemcpyDeviceToDevice));
return Status::OK();
}
}
PrepareReduceMetadata prepare_reduce_metadata;
ORT_RETURN_IF_ERROR(PrepareForReduce(X,
keepdims_,
axes_values,
axes_,
prepare_reduce_metadata));
Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims);
bool fast_reduction = fast_reduction_;
@ -646,7 +573,7 @@ Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, cudnnRe
fast_reduction = false;
}
return ReduceComputeCore<T, ReduceTensorIndices>(*cuda_ep_, *X, prepare_reduce_metadata, *Y, cudnn_reduce_op, axes_values,
return ReduceComputeCore<T, ReduceTensorIndices>(*cuda_ep_, *X, prepare_reduce_metadata, *Y, cudnn_reduce_op, axes_,
calculate_log_, calculate_sqt_, log_sum_exp_, fast_reduction);
}
@ -657,33 +584,11 @@ Status ReduceKernel<true>::ComputeImpl<int32_t, CUDNN_REDUCE_TENSOR_NO_INDICES>(
const Tensor* X = ctx->Input<Tensor>(0);
const std::string& op_name = this->KernelDef().OpName();
std::vector<int64_t> axes_values = axes_;
if (op_name == "ReduceSumTraining") {
//override the attribute value with the input value for reduction_axes
const Tensor* axes_tensor = ctx->Input<Tensor>(1);
ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor.");
auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
const auto* data = axes_tensor->template Data<int64_t>();
std::vector<int64_t> axes(data, data + nDims);
axes_values = axes;
if (axes.size() > 0) {
ORT_ENFORCE(noop_with_empty_axes_ == false, "Noop when axes is not empty is not allowed.");
}
// empty axes and no-op
if (axes.empty() && noop_with_empty_axes_) {
auto* Y = ctx->Output(0, X->Shape());
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData<int32_t>(), X->template Data<int32_t>(), X->SizeInBytes() * sizeof(int32_t), cudaMemcpyDeviceToDevice));
return Status::OK();
}
}
PrepareReduceMetadata prepare_reduce_metadata;
ORT_RETURN_IF_ERROR(PrepareForReduce(X,
keepdims_,
axes_values,
axes_,
prepare_reduce_metadata));
Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims);
@ -982,11 +887,6 @@ REGISTER_KERNEL_TYPED_12(ReduceMin, int32_t)
REGISTER_KERNEL_TYPED_12(ReduceMin, int8_t)
REGISTER_KERNEL_TYPED_12(ReduceMin, uint8_t)
REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, MLFloat16)
REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, float)
REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, double)
REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, int32_t)
REGISTER_KERNEL_HFD(ReduceProd)
REGISTER_KERNEL_HFD(ReduceSum)
REGISTER_KERNEL_HFD(ReduceLogSum)

View file

@ -59,6 +59,10 @@ class ReduceKernel : public CudaKernel, public ReduceKernelBase<allow_multi_axes
template <typename T, cudnnReduceTensorIndices_t ReduceTensorIndices = CUDNN_REDUCE_TENSOR_NO_INDICES>
Status ComputeImpl(OpKernelContext* ctx, cudnnReduceTensorOp_t cudnn_reduce_op) const;
// Used by ReduceSumTraining which will have axes as input
template <typename T, cudnnReduceTensorIndices_t ReduceTensorIndices = CUDNN_REDUCE_TENSOR_NO_INDICES>
Status ComputeImplEx(OpKernelContext* ctx, cudnnReduceTensorOp_t cudnn_reduce_op) const;
template <typename T, typename OutT, cudnnReduceTensorIndices_t ReduceTensorIndices>
Status ReduceKernelShared(
const T* X,
@ -175,18 +179,6 @@ class ReduceSum final : public ReduceKernel<true> {
}
};
template <typename T>
class ReduceSumTraining final : public ReduceKernel<true> {
public:
ReduceSumTraining(const OpKernelInfo& info) : ReduceKernel<true>(info) {
fast_reduction_ = true;
}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, CUDNN_REDUCE_TENSOR_ADD);
}
};
template <typename T>
class ReduceLogSum final : public ReduceKernel<true> {
public:
@ -223,5 +215,57 @@ class ReduceLogSumExp final : public ReduceKernel<true> {
}
};
Status PrepareForReduce(const Tensor* X,
bool keepdims,
const std::vector<int64_t>& axes,
PrepareReduceMetadata& prepare_reduce_metadata,
const TensorShape* input_shape_override = nullptr);
template <typename T, cudnnReduceTensorIndices_t ReduceTensorIndices>
Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata,
/*out*/ Tensor& output, cudnnReduceTensorOp_t cudnn_reduce_op,
const std::vector<int64_t>& axes,
bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction,
const TensorShape* input_shape_override = nullptr);
// CUDA's reduction descriptor cudnnReduceTensorDescriptor_t is a pointer so
// it's safer to wrap it with automatically memory deleter as CudnnReduceDescriptor.
// An implicit caster from CudnnReduceDescriptor to cudnnReduceTensorDescriptor_t
// is implemented below, so CUDA can seamlessly work.
class CudnnReduceDescriptor final {
public:
CudnnReduceDescriptor() : desc_(nullptr) {
}
~CudnnReduceDescriptor() {
if (desc_ != nullptr) {
cudnnDestroyReduceTensorDescriptor(desc_);
desc_ = nullptr;
}
}
CudnnReduceDescriptor(const CudnnReduceDescriptor&) = delete;
CudnnReduceDescriptor& operator=(const CudnnReduceDescriptor&) = delete;
Status Set(cudnnReduceTensorOp_t op, cudnnDataType_t type, cudnnReduceTensorIndices_t indices) {
if (!desc_)
CUDNN_RETURN_IF_ERROR(cudnnCreateReduceTensorDescriptor(&desc_));
CUDNN_RETURN_IF_ERROR(cudnnSetReduceTensorDescriptor(
desc_,
op,
type,
CUDNN_PROPAGATE_NAN,
indices,
CUDNN_32BIT_INDICES)); // currently only the 32-bit (unsigned int) type is supported.
return Status::OK();
}
operator cudnnReduceTensorDescriptor_t() const { return desc_; }
private:
cudnnReduceTensorDescriptor_t desc_;
};
} // namespace cuda
} // namespace onnxruntime

View file

@ -0,0 +1,66 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "orttraining/training_ops/cpu/reduction/reduction_ops.h"
#include "core/providers/common.h"
#include "core/util/math_cpuonly.h"
#include "core/providers/cpu/containers.h"
#include "core/platform/threadpool.h"
using namespace std;
namespace onnxruntime {
namespace contrib {
#define REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
ReduceSumTraining, \
kMSDomain, \
1, \
T, \
kCpuExecutionProvider, \
KernelDefBuilder() \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
ReduceSumTraining<T>);
REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(float)
REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(double)
REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int32_t)
REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int64_t)
template <typename T>
Status ReduceSumTraining<T>::Compute(OpKernelContext* ctx) const {
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
std::vector<int64_t> reduced_dims;
const Tensor* input = ctx->Input<Tensor>(0);
//override the attribute value with the input value for reduction_axes
const Tensor* axes_tensor = ctx->Input<Tensor>(1);
ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1,
"An axes tensor must be a vector tensor.");
auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
const auto* data = axes_tensor->template Data<int64_t>();
std::vector<int64_t> axes(data, data + nDims);
// empty axes and no-op
if (axes.empty() && noop_with_empty_axes_) {
auto* output = ctx->Output(0, input->Shape());
memcpy(output->template MutableData<T>(), input->template Data<T>(), input->SizeInBytes());
return Status::OK();
}
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes, keepdims_, reduced_dims, true);
auto* output = ctx->Output(0, reduced_dims);
ReduceSumCore(input->template Data<T>(), output->template MutableData<T>(),
no_transpose, blocks, block_size, transposed_input_data, ctx->GetOperatorThreadPool());
return Status::OK();
}
} // namespace contrib
} // namespace onnxruntime

View file

@ -0,0 +1,31 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/cpu/reduction/reduction_ops.h"
#include "core/common/common.h"
#include "core/common/optional.h"
#include "core/framework/op_kernel.h"
namespace onnxruntime {
namespace contrib {
template <typename T>
class ReduceSumTraining final : public ReduceKernel<true> {
public:
ReduceSumTraining(const OpKernelInfo& info) : ReduceKernel<true>(info) {
}
Status Compute(OpKernelContext* context) const override;
// For external calls requiring ReduceSumTraining implementation - will return the reduced output.
//`input_shape_override` overrides the shape of `input` for compute purposes.
static Tensor Impl(const Tensor& input, const std::vector<int64_t>& reduce_axes,
AllocatorPtr allocator, concurrency::ThreadPool* tp, bool keep_dims,
const TensorShape* input_shape_override = nullptr);
};
} // namespace contrib
} // namespace onnxruntime

View file

@ -0,0 +1,168 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "orttraining/training_ops/cuda/reduction/reduction_ops.h"
#include "core/providers/common.h"
#include "core/providers/cuda/cudnn_common.h"
#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
#include "core/providers/cuda/math/binary_elementwise_ops_impl.h"
#include "core/providers/cuda/math/binary_elementwise_ops.h"
#include "core/providers/cpu/tensor/utils.h"
#include "core/framework/op_kernel_context_internal.h"
using namespace onnxruntime::common;
namespace onnxruntime {
namespace cuda {
#define REGISTER_MS_KERNEL_TYPED(name, T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
name, \
kMSDomain, \
1, \
T, \
kCudaExecutionProvider, \
KernelDefBuilder() \
.InputMemoryType<OrtMemTypeCPUInput>(1) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
name<T>);
REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, MLFloat16)
REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, float)
REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, double)
REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, int32_t)
template <bool allow_multi_axes>
template <typename T, cudnnReduceTensorIndices_t ReduceTensorIndices>
Status ReduceKernel<allow_multi_axes>::ComputeImplEx(OpKernelContext* ctx, cudnnReduceTensorOp_t cudnn_reduce_op) const {
const Tensor* X = ctx->Input<Tensor>(0);
//override the attribute value with the input value for reduction_axes
const Tensor* axes_tensor = ctx->Input<Tensor>(1);
ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor.");
auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
const auto* data = axes_tensor->template Data<int64_t>();
std::vector<int64_t> axes(data, data + nDims);
// empty axes and no-op
if (axes.empty() && noop_with_empty_axes_) {
auto* Y = ctx->Output(0, X->Shape());
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData<T>(), X->template Data<T>(), X->SizeInBytes(), cudaMemcpyDeviceToDevice));
return Status::OK();
}
PrepareReduceMetadata prepare_reduce_metadata;
ORT_RETURN_IF_ERROR(PrepareForReduce(X,
keepdims_,
axes,
prepare_reduce_metadata));
Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims);
bool fast_reduction = fast_reduction_;
if (fast_reduction) {
auto ctx_internal = static_cast<OpKernelContextInternal*>(ctx);
if (ctx_internal && ctx_internal->GetUseDeterministicCompute())
fast_reduction = false;
}
return ReduceComputeCore<T, ReduceTensorIndices>(*cuda_ep_, *X, prepare_reduce_metadata, *Y, cudnn_reduce_op, axes,
calculate_log_, calculate_sqt_, log_sum_exp_, fast_reduction);
}
template <>
template <>
Status ReduceKernel<true>::ComputeImplEx<int32_t, CUDNN_REDUCE_TENSOR_NO_INDICES>(OpKernelContext* ctx, cudnnReduceTensorOp_t cudnn_reduce_op) const {
typedef typename ToCudaType<int32_t>::MappedType CudaT;
const Tensor* X = ctx->Input<Tensor>(0);
//override the attribute value with the input value for reduction_axes
const Tensor* axes_tensor = ctx->Input<Tensor>(1);
ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor.");
auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
const auto* data = axes_tensor->template Data<int64_t>();
std::vector<int64_t> axes(data, data + nDims);
// empty axes and no-op
if (axes.empty() && noop_with_empty_axes_) {
auto* Y = ctx->Output(0, X->Shape());
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData<int32_t>(), X->template Data<int32_t>(), X->SizeInBytes(), cudaMemcpyDeviceToDevice));
return Status::OK();
}
PrepareReduceMetadata prepare_reduce_metadata;
ORT_RETURN_IF_ERROR(PrepareForReduce(X,
keepdims_,
axes,
prepare_reduce_metadata));
Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims);
int64_t input_count = prepare_reduce_metadata.input_count;
int64_t output_count = prepare_reduce_metadata.output_count;
std::vector<int64_t>& input_dims_cudnn = prepare_reduce_metadata.input_dims_cudnn;
std::vector<int64_t>& output_dims_cudnn = prepare_reduce_metadata.output_dims_cudnn;
// special case when there is a dim value of 0 in the shape.
if (input_count == 0) {
assert(Y->Shape().Size() == 0);
return Status::OK();
}
// cudnnReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case
if (input_count == output_count) {
if (Y->template MutableData<int32_t>() != X->template Data<int32_t>()) {
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData<int32_t>(), X->template Data<int32_t>(), input_count * sizeof(int32_t), cudaMemcpyDeviceToDevice));
}
return Status::OK();
}
// This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000.
// Therefore zeroing out the memory is required
CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes()));
size_t indices_bytes = 0;
size_t workspace_bytes = 0;
CudnnTensor input_tensor;
CudnnTensor output_tensor;
CudnnReduceDescriptor reduce_desc;
cudnnDataType_t cudnn_type_X = CUDNN_DATA_FLOAT;
IAllocatorUniquePtr<float> temp_X = GetScratchBuffer<float>(input_count);
Impl_Cast<CudaT, float>(reinterpret_cast<const CudaT*>(X->template Data<int32_t>()), temp_X.get(), X->Shape().Size());
ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, CUDNN_REDUCE_TENSOR_FLATTENED_INDICES));
ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_cudnn, cudnn_type_X));
ORT_RETURN_IF_ERROR(output_tensor.Set(output_dims_cudnn, cudnn_type_X));
CUDNN_RETURN_IF_ERROR(cudnnGetReductionIndicesSize(CudnnHandle(), reduce_desc, input_tensor, output_tensor, &indices_bytes));
CUDNN_RETURN_IF_ERROR(cudnnGetReductionWorkspaceSize(CudnnHandle(), reduce_desc, input_tensor, output_tensor, &workspace_bytes));
IAllocatorUniquePtr<uint32_t> indices_cuda = GetScratchBuffer<uint32_t>(indices_bytes);
IAllocatorUniquePtr<CudaT> workspace_cuda = GetScratchBuffer<CudaT>(workspace_bytes);
const auto one = Consts<float>::One;
const auto zero = Consts<float>::Zero;
auto temp_Y = GetScratchBuffer<float>(output_count);
CUDNN_RETURN_IF_ERROR(cudnnReduceTensor(CudnnHandle(),
reduce_desc,
indices_cuda.get(),
indices_bytes,
workspace_cuda.get(),
workspace_bytes,
&one,
input_tensor,
temp_X.get(),
&zero,
output_tensor,
temp_Y.get()));
Impl_Cast<float, int32_t>(temp_Y.get(), Y->template MutableData<int32_t>(), output_count);
return Status::OK();
}
} // namespace cuda
} // namespace onnxruntime

View file

@ -0,0 +1,26 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/optional.h"
#include "core/providers/cuda/cuda_common.h"
#include "core/providers/cuda/reduction/reduction_ops.h"
#include "core/providers/cuda/reduction/reduction_functions.h"
namespace onnxruntime {
namespace cuda {
template <typename T>
class ReduceSumTraining final : public ReduceKernel<true> {
public:
ReduceSumTraining(const OpKernelInfo& info) : ReduceKernel<true>(info) {
fast_reduction_ = true;
}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImplEx<T>(ctx, CUDNN_REDUCE_TENSOR_ADD);
}
};
} // namespace cuda
} // namespace onnxruntime

View file

@ -80,6 +80,16 @@ std::vector<int64_t> GetTypeSpecificDataFromTensorProto(
onnx::TensorProto tensorProto) {
return std::vector<int64_t>(std::begin(tensorProto.int64_data()), std::end(tensorProto.int64_data()));
}
template <>
std::vector<uint8_t> GetTypeSpecificDataFromTensorProto(
onnx::TensorProto tensorProto) {
return std::vector<uint8_t>(std::begin(tensorProto.int32_data()), std::end(tensorProto.int32_data()));
}
template <>
std::vector<double> GetTypeSpecificDataFromTensorProto(
onnx::TensorProto tensorProto) {
return std::vector<double>(std::begin(tensorProto.double_data()), std::end(tensorProto.double_data()));
}
template <typename DataType>
std::vector<DataType> GetTensorDataFromTensorProto(
@ -139,6 +149,10 @@ ITensor ProtobufHelpers::LoadTensorFromProtobufFile(
return TensorInt64Bit::CreateFromIterable(tensorShape, GetTensorDataFromTensorProto<int64_t>(tensorProto, elementCount));
case (onnx::TensorProto::DataType::TensorProto_DataType_STRING):
return TensorString::CreateFromIterable(tensorShape, GetTensorStringDataFromTensorProto(tensorProto, elementCount));
case (onnx::TensorProto::DataType::TensorProto_DataType_UINT8):
return TensorUInt8Bit::CreateFromIterable(tensorShape, GetTensorDataFromTensorProto<uint8_t>(tensorProto, elementCount));
case (onnx::TensorProto::DataType::TensorProto_DataType_DOUBLE):
return TensorDouble::CreateFromIterable(tensorShape, GetTensorDataFromTensorProto<double>(tensorProto, elementCount));
default:
throw winrt::hresult_invalid_argument(L"Tensor type for creating tensor from protobuf file not supported.");
break;