diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc index fe797a6beb..6ea89a7e86 100644 --- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc @@ -3,9 +3,6 @@ #include "core/providers/cpu/reduction/reduction_ops.h" #include "core/providers/common.h" -#include "core/util/math_cpuonly.h" -#include "core/providers/cpu/containers.h" -#include "core/platform/threadpool.h" using namespace std; namespace onnxruntime { @@ -151,24 +148,6 @@ REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ArgMin, 1, 10); REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ArgMin, 11, 11); REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMin, 12); -namespace contrib { -#define REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(T) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - ReduceSumTraining, \ - kMSDomain, \ - 1, \ - T, \ - kCpuExecutionProvider, \ - KernelDefBuilder() \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - ReduceSumTraining); - -REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(float) -REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(double) -REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int32_t) -REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int64_t) -} // namespace contrib - // When all reduce axes are located at the tail of the dims, quite general cases, transpose and extra // copy could be skipped to improve performance. If required by check_no_transpose = true, then // the calling code will check if the data was transposed and act accordingly. @@ -184,8 +163,8 @@ bool PrepareForReduce(const Tensor* input_tensor_ptr, const std::vector& axes_, bool keepdims_, /*out*/ std::vector& reduced_dims, - bool check_no_transpose = false, - const TensorShape* input_shape_override = nullptr) { + bool check_no_transpose, + const TensorShape* input_shape_override) { ORT_ENFORCE(input_tensor_ptr != nullptr, "Input to be reduced is null"); if (input_shape_override) { @@ -614,12 +593,11 @@ Status ReduceProd::Compute(OpKernelContext* ctx) const { } template -static void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose, - int64_t blocks, int64_t block_size, FastAllocVector& transposed_input_data, - concurrency::ThreadPool* tp) { +void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose, + int64_t blocks, int64_t block_size, FastAllocVector& transposed_input_data, + concurrency::ThreadPool* tp) { if (no_transpose) { auto lambda = [input_data, blocks, output_data](ptrdiff_t i) { - // The ConstEigenMatrixMap type is expanded to work around a MS compiler issue output_data[i] = Eigen::Map>(input_data + (i * blocks), blocks).sum(); }; @@ -668,43 +646,6 @@ Status ReduceSum::Compute(OpKernelContext* ctx) const { return Status::OK(); } -template -Status ReduceSumTraining::Compute(OpKernelContext* ctx) const { - FastAllocVector transposed_input_data(GetAllocator(*ctx)); - int64_t block_size; - int64_t blocks; - std::vector reduced_dims; - const Tensor* input = ctx->Input(0); - - //override the attribute value with the input value for reduction_axes - const Tensor* axes_tensor = ctx->Input(1); - ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null"); - ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, - "An axes tensor must be a vector tensor."); - auto nDims = static_cast(axes_tensor->Shape()[0]); - const auto* data = axes_tensor->template Data(); - std::vector axes(data, data + nDims); - if (axes.size() > 0) { - ORT_ENFORCE(noop_with_empty_axes_ == false, "Noop when axes is not empty is not allowed."); - } - - // empty axes and no-op - if (axes.empty() && noop_with_empty_axes_) { - auto* output = ctx->Output(0, input->Shape()); - memcpy(output->template MutableData(), input->template Data(), input->SizeInBytes() * sizeof(T)); - return Status::OK(); - } - - bool no_transpose = PrepareForReduce(input, transposed_input_data, block_size, blocks, axes, keepdims_, reduced_dims, true); - - auto* output = ctx->Output(0, reduced_dims); - - ReduceSumCore(input->template Data(), output->template MutableData(), - no_transpose, blocks, block_size, transposed_input_data, ctx->GetOperatorThreadPool()); - - return Status::OK(); -} - template Status ReduceSumSquare::Compute(OpKernelContext* ctx) const { FastAllocVector transposed_input_data(GetAllocator(*ctx)); @@ -870,4 +811,14 @@ template class ReduceSum; template class ReduceSum; template class ReduceSum; +#define REGISTER_REDUCESUMCORE_TYPED(T) \ + template void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose, \ + int64_t blocks, int64_t block_size, FastAllocVector& transposed_input_data, \ + concurrency::ThreadPool* tp); + +REGISTER_REDUCESUMCORE_TYPED(float) +REGISTER_REDUCESUMCORE_TYPED(double) +REGISTER_REDUCESUMCORE_TYPED(int32_t) +REGISTER_REDUCESUMCORE_TYPED(int64_t) + } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h index a6dbff884a..9ef1b43a40 100644 --- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h +++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h @@ -7,9 +7,28 @@ #include "core/common/common.h" #include "core/common/optional.h" #include "core/framework/op_kernel.h" +#include "core/providers/cpu/containers.h" +#include "core/util/math_cpuonly.h" +#include "core/platform/threadpool.h" namespace onnxruntime { +template +bool PrepareForReduce(const Tensor* input_tensor_ptr, + FastAllocVector& transposed_input_data, + int64_t& block_size, + int64_t& blocks, + const std::vector& axes_, + bool keepdims_, + /*out*/ std::vector& reduced_dims, + bool check_no_transpose = false, + const TensorShape* input_shape_override = nullptr); + +template +void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose, + int64_t blocks, int64_t block_size, FastAllocVector& transposed_input_data, + concurrency::ThreadPool* tp); + template class ReduceKernelBase { protected: @@ -132,21 +151,6 @@ class ReduceSum final : public ReduceKernel { const TensorShape* input_shape_override = nullptr); }; -template -class ReduceSumTraining final : public ReduceKernel { - public: - ReduceSumTraining(const OpKernelInfo& info) : ReduceKernel(info) { - } - - Status Compute(OpKernelContext* context) const override; - - // For external calls requiring ReduceSumTraining implementation - will return the reduced output. - //`input_shape_override` overrides the shape of `input` for compute purposes. - static Tensor Impl(const Tensor& input, const std::vector& reduce_axes, - AllocatorPtr allocator, concurrency::ThreadPool* tp, bool keep_dims, - const TensorShape* input_shape_override = nullptr); -}; - template class ReduceSumSquare final : public ReduceKernel { public: diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index 592dc1ed51..4723246fcf 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -60,56 +60,6 @@ namespace cuda { KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ name); -#define REGISTER_MS_KERNEL_TYPED(name, T) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - name, \ - kMSDomain, \ - 1, \ - T, \ - kCudaExecutionProvider, \ - KernelDefBuilder() \ - .InputMemoryType(1) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - name); - -// CUDA's reduction descriptor cudnnReduceTensorDescriptor_t is a pointer so -// it's safer to wrap it with automatically memory deleter as CudnnReduceDescriptor. -// An implicit caster from CudnnReduceDescriptor to cudnnReduceTensorDescriptor_t -// is implemented below, so CUDA can seamlessly work. -class CudnnReduceDescriptor final { - public: - CudnnReduceDescriptor() : desc_(nullptr) { - } - - ~CudnnReduceDescriptor() { - if (desc_ != nullptr) { - cudnnDestroyReduceTensorDescriptor(desc_); - desc_ = nullptr; - } - } - - CudnnReduceDescriptor(const CudnnReduceDescriptor&) = delete; - CudnnReduceDescriptor& operator=(const CudnnReduceDescriptor&) = delete; - - Status Set(cudnnReduceTensorOp_t op, cudnnDataType_t type, cudnnReduceTensorIndices_t indices) { - if (!desc_) - CUDNN_RETURN_IF_ERROR(cudnnCreateReduceTensorDescriptor(&desc_)); - - CUDNN_RETURN_IF_ERROR(cudnnSetReduceTensorDescriptor( - desc_, - op, - type, - CUDNN_PROPAGATE_NAN, - indices, - CUDNN_32BIT_INDICES)); // currently only the 32-bit (unsigned int) type is supported. - return Status::OK(); - } - - operator cudnnReduceTensorDescriptor_t() const { return desc_; } - - private: - cudnnReduceTensorDescriptor_t desc_; -}; // TODO ReduceKernel::ReduceKernelShared() is still used by some other training classes though it's not used here - this should be refactored. template @@ -314,11 +264,11 @@ template Status ReduceKernel::ReduceKernelShared& output_dims) const; // `input_shape_override` (if provided) is the input shape for compute purposes -static Status PrepareForReduce(const Tensor* X, - bool keepdims, - const std::vector& axes, - PrepareReduceMetadata& prepare_reduce_metadata, - const TensorShape* input_shape_override = nullptr) { +Status PrepareForReduce(const Tensor* X, + bool keepdims, + const std::vector& axes, + PrepareReduceMetadata& prepare_reduce_metadata, + const TensorShape* input_shape_override) { ORT_ENFORCE(nullptr != X); const TensorShape& input_shape = input_shape_override ? *input_shape_override : X->Shape(); @@ -391,11 +341,11 @@ static Status PrepareForReduce(const Tensor* X, // `input_shape_override` is the input shape for compute purposes (if provided) template -static Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata, - /*out*/ Tensor& output, cudnnReduceTensorOp_t cudnn_reduce_op, - const std::vector& axes, - bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, - const TensorShape* input_shape_override = nullptr) { +Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata, + /*out*/ Tensor& output, cudnnReduceTensorOp_t cudnn_reduce_op, + const std::vector& axes, + bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, + const TensorShape* input_shape_override) { typedef typename ToCudaType::MappedType CudaT; const TensorShape& input_shape = input_shape_override ? *input_shape_override : input.Shape(); @@ -610,33 +560,10 @@ template Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, cudnnReduceTensorOp_t cudnn_reduce_op) const { const Tensor* X = ctx->Input(0); - const std::string& op_name = this->KernelDef().OpName(); - std::vector axes_values = axes_; - if (op_name == "ReduceSumTraining") { - //override the attribute value with the input value for reduction_axes - const Tensor* axes_tensor = ctx->Input(1); - ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null"); - ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor."); - auto nDims = static_cast(axes_tensor->Shape()[0]); - const auto* data = axes_tensor->template Data(); - std::vector axes(data, data + nDims); - axes_values = axes; - if (axes.size() > 0) { - ORT_ENFORCE(noop_with_empty_axes_ == false, "Noop when axes is not empty is not allowed."); - } - - // empty axes and no-op - if (axes.empty() && noop_with_empty_axes_) { - auto* Y = ctx->Output(0, X->Shape()); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes() * sizeof(T), cudaMemcpyDeviceToDevice)); - return Status::OK(); - } - } - PrepareReduceMetadata prepare_reduce_metadata; ORT_RETURN_IF_ERROR(PrepareForReduce(X, keepdims_, - axes_values, + axes_, prepare_reduce_metadata)); Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims); bool fast_reduction = fast_reduction_; @@ -646,7 +573,7 @@ Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, cudnnRe fast_reduction = false; } - return ReduceComputeCore(*cuda_ep_, *X, prepare_reduce_metadata, *Y, cudnn_reduce_op, axes_values, + return ReduceComputeCore(*cuda_ep_, *X, prepare_reduce_metadata, *Y, cudnn_reduce_op, axes_, calculate_log_, calculate_sqt_, log_sum_exp_, fast_reduction); } @@ -657,33 +584,11 @@ Status ReduceKernel::ComputeImpl( const Tensor* X = ctx->Input(0); - const std::string& op_name = this->KernelDef().OpName(); - std::vector axes_values = axes_; - if (op_name == "ReduceSumTraining") { - //override the attribute value with the input value for reduction_axes - const Tensor* axes_tensor = ctx->Input(1); - ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor."); - auto nDims = static_cast(axes_tensor->Shape()[0]); - const auto* data = axes_tensor->template Data(); - std::vector axes(data, data + nDims); - axes_values = axes; - if (axes.size() > 0) { - ORT_ENFORCE(noop_with_empty_axes_ == false, "Noop when axes is not empty is not allowed."); - } - - // empty axes and no-op - if (axes.empty() && noop_with_empty_axes_) { - auto* Y = ctx->Output(0, X->Shape()); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes() * sizeof(int32_t), cudaMemcpyDeviceToDevice)); - return Status::OK(); - } - } - PrepareReduceMetadata prepare_reduce_metadata; ORT_RETURN_IF_ERROR(PrepareForReduce(X, keepdims_, - axes_values, + axes_, prepare_reduce_metadata)); Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims); @@ -982,11 +887,6 @@ REGISTER_KERNEL_TYPED_12(ReduceMin, int32_t) REGISTER_KERNEL_TYPED_12(ReduceMin, int8_t) REGISTER_KERNEL_TYPED_12(ReduceMin, uint8_t) -REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, MLFloat16) -REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, float) -REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, double) -REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, int32_t) - REGISTER_KERNEL_HFD(ReduceProd) REGISTER_KERNEL_HFD(ReduceSum) REGISTER_KERNEL_HFD(ReduceLogSum) diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.h b/onnxruntime/core/providers/cuda/reduction/reduction_ops.h index 8cc8111f25..8f3ddf80c8 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.h +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.h @@ -59,6 +59,10 @@ class ReduceKernel : public CudaKernel, public ReduceKernelBase Status ComputeImpl(OpKernelContext* ctx, cudnnReduceTensorOp_t cudnn_reduce_op) const; + // Used by ReduceSumTraining which will have axes as input + template + Status ComputeImplEx(OpKernelContext* ctx, cudnnReduceTensorOp_t cudnn_reduce_op) const; + template Status ReduceKernelShared( const T* X, @@ -175,18 +179,6 @@ class ReduceSum final : public ReduceKernel { } }; -template -class ReduceSumTraining final : public ReduceKernel { - public: - ReduceSumTraining(const OpKernelInfo& info) : ReduceKernel(info) { - fast_reduction_ = true; - } - - Status ComputeInternal(OpKernelContext* ctx) const override { - return ComputeImpl(ctx, CUDNN_REDUCE_TENSOR_ADD); - } -}; - template class ReduceLogSum final : public ReduceKernel { public: @@ -223,5 +215,57 @@ class ReduceLogSumExp final : public ReduceKernel { } }; +Status PrepareForReduce(const Tensor* X, + bool keepdims, + const std::vector& axes, + PrepareReduceMetadata& prepare_reduce_metadata, + const TensorShape* input_shape_override = nullptr); + +template +Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata, + /*out*/ Tensor& output, cudnnReduceTensorOp_t cudnn_reduce_op, + const std::vector& axes, + bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, + const TensorShape* input_shape_override = nullptr); + +// CUDA's reduction descriptor cudnnReduceTensorDescriptor_t is a pointer so +// it's safer to wrap it with automatically memory deleter as CudnnReduceDescriptor. +// An implicit caster from CudnnReduceDescriptor to cudnnReduceTensorDescriptor_t +// is implemented below, so CUDA can seamlessly work. +class CudnnReduceDescriptor final { + public: + CudnnReduceDescriptor() : desc_(nullptr) { + } + + ~CudnnReduceDescriptor() { + if (desc_ != nullptr) { + cudnnDestroyReduceTensorDescriptor(desc_); + desc_ = nullptr; + } + } + + CudnnReduceDescriptor(const CudnnReduceDescriptor&) = delete; + CudnnReduceDescriptor& operator=(const CudnnReduceDescriptor&) = delete; + + Status Set(cudnnReduceTensorOp_t op, cudnnDataType_t type, cudnnReduceTensorIndices_t indices) { + if (!desc_) + CUDNN_RETURN_IF_ERROR(cudnnCreateReduceTensorDescriptor(&desc_)); + + CUDNN_RETURN_IF_ERROR(cudnnSetReduceTensorDescriptor( + desc_, + op, + type, + CUDNN_PROPAGATE_NAN, + indices, + CUDNN_32BIT_INDICES)); // currently only the 32-bit (unsigned int) type is supported. + return Status::OK(); + } + + operator cudnnReduceTensorDescriptor_t() const { return desc_; } + + private: + cudnnReduceTensorDescriptor_t desc_; +}; + } // namespace cuda } // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/cpu/reduction/reduction_ops.cc b/orttraining/orttraining/training_ops/cpu/reduction/reduction_ops.cc new file mode 100644 index 0000000000..0732a43441 --- /dev/null +++ b/orttraining/orttraining/training_ops/cpu/reduction/reduction_ops.cc @@ -0,0 +1,66 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "orttraining/training_ops/cpu/reduction/reduction_ops.h" +#include "core/providers/common.h" +#include "core/util/math_cpuonly.h" +#include "core/providers/cpu/containers.h" +#include "core/platform/threadpool.h" + +using namespace std; +namespace onnxruntime { +namespace contrib { + +#define REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + ReduceSumTraining, \ + kMSDomain, \ + 1, \ + T, \ + kCpuExecutionProvider, \ + KernelDefBuilder() \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + ReduceSumTraining); + +REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(float) +REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(double) +REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int32_t) +REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int64_t) + + +template +Status ReduceSumTraining::Compute(OpKernelContext* ctx) const { + FastAllocVector transposed_input_data(GetAllocator(*ctx)); + int64_t block_size; + int64_t blocks; + std::vector reduced_dims; + const Tensor* input = ctx->Input(0); + + //override the attribute value with the input value for reduction_axes + const Tensor* axes_tensor = ctx->Input(1); + ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null"); + ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, + "An axes tensor must be a vector tensor."); + auto nDims = static_cast(axes_tensor->Shape()[0]); + const auto* data = axes_tensor->template Data(); + std::vector axes(data, data + nDims); + + // empty axes and no-op + if (axes.empty() && noop_with_empty_axes_) { + auto* output = ctx->Output(0, input->Shape()); + memcpy(output->template MutableData(), input->template Data(), input->SizeInBytes()); + return Status::OK(); + } + + bool no_transpose = PrepareForReduce(input, transposed_input_data, block_size, blocks, axes, keepdims_, reduced_dims, true); + + auto* output = ctx->Output(0, reduced_dims); + + ReduceSumCore(input->template Data(), output->template MutableData(), + no_transpose, blocks, block_size, transposed_input_data, ctx->GetOperatorThreadPool()); + + return Status::OK(); +} + +} // namespace contrib +} // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/cpu/reduction/reduction_ops.h b/orttraining/orttraining/training_ops/cpu/reduction/reduction_ops.h new file mode 100644 index 0000000000..2ec372c718 --- /dev/null +++ b/orttraining/orttraining/training_ops/cpu/reduction/reduction_ops.h @@ -0,0 +1,31 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/cpu/reduction/reduction_ops.h" +#include "core/common/common.h" +#include "core/common/optional.h" +#include "core/framework/op_kernel.h" + +namespace onnxruntime { +namespace contrib { + +template +class ReduceSumTraining final : public ReduceKernel { + public: + ReduceSumTraining(const OpKernelInfo& info) : ReduceKernel(info) { + } + + Status Compute(OpKernelContext* context) const override; + + // For external calls requiring ReduceSumTraining implementation - will return the reduced output. + //`input_shape_override` overrides the shape of `input` for compute purposes. + static Tensor Impl(const Tensor& input, const std::vector& reduce_axes, + AllocatorPtr allocator, concurrency::ThreadPool* tp, bool keep_dims, + const TensorShape* input_shape_override = nullptr); +}; + +} // namespace contrib +} // namespace onnxruntime + diff --git a/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.cc b/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.cc new file mode 100644 index 0000000000..f07a4fc982 --- /dev/null +++ b/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.cc @@ -0,0 +1,168 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "orttraining/training_ops/cuda/reduction/reduction_ops.h" +#include "core/providers/common.h" +#include "core/providers/cuda/cudnn_common.h" +#include "core/providers/cuda/math/unary_elementwise_ops_impl.h" +#include "core/providers/cuda/math/binary_elementwise_ops_impl.h" +#include "core/providers/cuda/math/binary_elementwise_ops.h" +#include "core/providers/cpu/tensor/utils.h" +#include "core/framework/op_kernel_context_internal.h" + +using namespace onnxruntime::common; +namespace onnxruntime { +namespace cuda { + +#define REGISTER_MS_KERNEL_TYPED(name, T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + name, \ + kMSDomain, \ + 1, \ + T, \ + kCudaExecutionProvider, \ + KernelDefBuilder() \ + .InputMemoryType(1) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + name); + +REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, MLFloat16) +REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, float) +REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, double) +REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, int32_t) + +template +template +Status ReduceKernel::ComputeImplEx(OpKernelContext* ctx, cudnnReduceTensorOp_t cudnn_reduce_op) const { + const Tensor* X = ctx->Input(0); + + //override the attribute value with the input value for reduction_axes + const Tensor* axes_tensor = ctx->Input(1); + ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null"); + ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor."); + auto nDims = static_cast(axes_tensor->Shape()[0]); + const auto* data = axes_tensor->template Data(); + std::vector axes(data, data + nDims); + + // empty axes and no-op + if (axes.empty() && noop_with_empty_axes_) { + auto* Y = ctx->Output(0, X->Shape()); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), cudaMemcpyDeviceToDevice)); + return Status::OK(); + } + + + PrepareReduceMetadata prepare_reduce_metadata; + ORT_RETURN_IF_ERROR(PrepareForReduce(X, + keepdims_, + axes, + prepare_reduce_metadata)); + Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims); + bool fast_reduction = fast_reduction_; + if (fast_reduction) { + auto ctx_internal = static_cast(ctx); + if (ctx_internal && ctx_internal->GetUseDeterministicCompute()) + fast_reduction = false; + } + + return ReduceComputeCore(*cuda_ep_, *X, prepare_reduce_metadata, *Y, cudnn_reduce_op, axes, + calculate_log_, calculate_sqt_, log_sum_exp_, fast_reduction); +} + +template <> +template <> +Status ReduceKernel::ComputeImplEx(OpKernelContext* ctx, cudnnReduceTensorOp_t cudnn_reduce_op) const { + typedef typename ToCudaType::MappedType CudaT; + + const Tensor* X = ctx->Input(0); + + //override the attribute value with the input value for reduction_axes + const Tensor* axes_tensor = ctx->Input(1); + ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor."); + auto nDims = static_cast(axes_tensor->Shape()[0]); + const auto* data = axes_tensor->template Data(); + std::vector axes(data, data + nDims); + + // empty axes and no-op + if (axes.empty() && noop_with_empty_axes_) { + auto* Y = ctx->Output(0, X->Shape()); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), cudaMemcpyDeviceToDevice)); + return Status::OK(); + } + + + + PrepareReduceMetadata prepare_reduce_metadata; + + ORT_RETURN_IF_ERROR(PrepareForReduce(X, + keepdims_, + axes, + prepare_reduce_metadata)); + + Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims); + + int64_t input_count = prepare_reduce_metadata.input_count; + int64_t output_count = prepare_reduce_metadata.output_count; + std::vector& input_dims_cudnn = prepare_reduce_metadata.input_dims_cudnn; + std::vector& output_dims_cudnn = prepare_reduce_metadata.output_dims_cudnn; + + // special case when there is a dim value of 0 in the shape. + if (input_count == 0) { + assert(Y->Shape().Size() == 0); + return Status::OK(); + } + + // cudnnReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case + if (input_count == output_count) { + if (Y->template MutableData() != X->template Data()) { + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), input_count * sizeof(int32_t), cudaMemcpyDeviceToDevice)); + } + return Status::OK(); + } + + // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. + // Therefore zeroing out the memory is required + CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes())); + + size_t indices_bytes = 0; + size_t workspace_bytes = 0; + CudnnTensor input_tensor; + CudnnTensor output_tensor; + CudnnReduceDescriptor reduce_desc; + + cudnnDataType_t cudnn_type_X = CUDNN_DATA_FLOAT; + IAllocatorUniquePtr temp_X = GetScratchBuffer(input_count); + Impl_Cast(reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size()); + + ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, CUDNN_REDUCE_TENSOR_FLATTENED_INDICES)); + ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_cudnn, cudnn_type_X)); + ORT_RETURN_IF_ERROR(output_tensor.Set(output_dims_cudnn, cudnn_type_X)); + CUDNN_RETURN_IF_ERROR(cudnnGetReductionIndicesSize(CudnnHandle(), reduce_desc, input_tensor, output_tensor, &indices_bytes)); + CUDNN_RETURN_IF_ERROR(cudnnGetReductionWorkspaceSize(CudnnHandle(), reduce_desc, input_tensor, output_tensor, &workspace_bytes)); + IAllocatorUniquePtr indices_cuda = GetScratchBuffer(indices_bytes); + IAllocatorUniquePtr workspace_cuda = GetScratchBuffer(workspace_bytes); + + const auto one = Consts::One; + const auto zero = Consts::Zero; + auto temp_Y = GetScratchBuffer(output_count); + CUDNN_RETURN_IF_ERROR(cudnnReduceTensor(CudnnHandle(), + reduce_desc, + indices_cuda.get(), + indices_bytes, + workspace_cuda.get(), + workspace_bytes, + &one, + input_tensor, + temp_X.get(), + &zero, + output_tensor, + temp_Y.get())); + + Impl_Cast(temp_Y.get(), Y->template MutableData(), output_count); + + return Status::OK(); +} + + +} // namespace cuda +} // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.h b/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.h new file mode 100644 index 0000000000..d1da42e625 --- /dev/null +++ b/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.h @@ -0,0 +1,26 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/common/optional.h" +#include "core/providers/cuda/cuda_common.h" +#include "core/providers/cuda/reduction/reduction_ops.h" +#include "core/providers/cuda/reduction/reduction_functions.h" + +namespace onnxruntime { +namespace cuda { + +template +class ReduceSumTraining final : public ReduceKernel { + public: + ReduceSumTraining(const OpKernelInfo& info) : ReduceKernel(info) { + fast_reduction_ = true; + } + + Status ComputeInternal(OpKernelContext* ctx) const override { + return ComputeImplEx(ctx, CUDNN_REDUCE_TENSOR_ADD); + } +}; + +} // namespace cuda +} // namespace onnxruntime diff --git a/winml/test/common/protobufHelpers.cpp b/winml/test/common/protobufHelpers.cpp index 3838cfa78e..f84c9042c7 100644 --- a/winml/test/common/protobufHelpers.cpp +++ b/winml/test/common/protobufHelpers.cpp @@ -80,6 +80,16 @@ std::vector GetTypeSpecificDataFromTensorProto( onnx::TensorProto tensorProto) { return std::vector(std::begin(tensorProto.int64_data()), std::end(tensorProto.int64_data())); } +template <> +std::vector GetTypeSpecificDataFromTensorProto( + onnx::TensorProto tensorProto) { + return std::vector(std::begin(tensorProto.int32_data()), std::end(tensorProto.int32_data())); +} +template <> +std::vector GetTypeSpecificDataFromTensorProto( + onnx::TensorProto tensorProto) { + return std::vector(std::begin(tensorProto.double_data()), std::end(tensorProto.double_data())); +} template std::vector GetTensorDataFromTensorProto( @@ -139,6 +149,10 @@ ITensor ProtobufHelpers::LoadTensorFromProtobufFile( return TensorInt64Bit::CreateFromIterable(tensorShape, GetTensorDataFromTensorProto(tensorProto, elementCount)); case (onnx::TensorProto::DataType::TensorProto_DataType_STRING): return TensorString::CreateFromIterable(tensorShape, GetTensorStringDataFromTensorProto(tensorProto, elementCount)); + case (onnx::TensorProto::DataType::TensorProto_DataType_UINT8): + return TensorUInt8Bit::CreateFromIterable(tensorShape, GetTensorDataFromTensorProto(tensorProto, elementCount)); + case (onnx::TensorProto::DataType::TensorProto_DataType_DOUBLE): + return TensorDouble::CreateFromIterable(tensorShape, GetTensorDataFromTensorProto(tensorProto, elementCount)); default: throw winrt::hresult_invalid_argument(L"Tensor type for creating tensor from protobuf file not supported."); break;