mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-25 02:50:42 +00:00
Merge remote-tracking branch 'upstream/master' into DmlDev
This commit is contained in:
commit
15e1a1e1ac
9 changed files with 408 additions and 204 deletions
|
|
@ -3,9 +3,6 @@
|
|||
|
||||
#include "core/providers/cpu/reduction/reduction_ops.h"
|
||||
#include "core/providers/common.h"
|
||||
#include "core/util/math_cpuonly.h"
|
||||
#include "core/providers/cpu/containers.h"
|
||||
#include "core/platform/threadpool.h"
|
||||
|
||||
using namespace std;
|
||||
namespace onnxruntime {
|
||||
|
|
@ -151,24 +148,6 @@ REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ArgMin, 1, 10);
|
|||
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ArgMin, 11, 11);
|
||||
REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMin, 12);
|
||||
|
||||
namespace contrib {
|
||||
#define REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(T) \
|
||||
ONNX_OPERATOR_TYPED_KERNEL_EX( \
|
||||
ReduceSumTraining, \
|
||||
kMSDomain, \
|
||||
1, \
|
||||
T, \
|
||||
kCpuExecutionProvider, \
|
||||
KernelDefBuilder() \
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
|
||||
ReduceSumTraining<T>);
|
||||
|
||||
REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(float)
|
||||
REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(double)
|
||||
REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int32_t)
|
||||
REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int64_t)
|
||||
} // namespace contrib
|
||||
|
||||
// When all reduce axes are located at the tail of the dims, quite general cases, transpose and extra
|
||||
// copy could be skipped to improve performance. If required by check_no_transpose = true, then
|
||||
// the calling code will check if the data was transposed and act accordingly.
|
||||
|
|
@ -184,8 +163,8 @@ bool PrepareForReduce(const Tensor* input_tensor_ptr,
|
|||
const std::vector<int64_t>& axes_,
|
||||
bool keepdims_,
|
||||
/*out*/ std::vector<int64_t>& reduced_dims,
|
||||
bool check_no_transpose = false,
|
||||
const TensorShape* input_shape_override = nullptr) {
|
||||
bool check_no_transpose,
|
||||
const TensorShape* input_shape_override) {
|
||||
ORT_ENFORCE(input_tensor_ptr != nullptr, "Input to be reduced is null");
|
||||
|
||||
if (input_shape_override) {
|
||||
|
|
@ -614,12 +593,11 @@ Status ReduceProd<T>::Compute(OpKernelContext* ctx) const {
|
|||
}
|
||||
|
||||
template <typename T>
|
||||
static void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose,
|
||||
int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data,
|
||||
concurrency::ThreadPool* tp) {
|
||||
void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose,
|
||||
int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data,
|
||||
concurrency::ThreadPool* tp) {
|
||||
if (no_transpose) {
|
||||
auto lambda = [input_data, blocks, output_data](ptrdiff_t i) {
|
||||
|
||||
// The ConstEigenMatrixMap type is expanded to work around a MS compiler issue
|
||||
output_data[i] = Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(input_data + (i * blocks), blocks).sum();
|
||||
};
|
||||
|
|
@ -668,43 +646,6 @@ Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Status ReduceSumTraining<T>::Compute(OpKernelContext* ctx) const {
|
||||
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
std::vector<int64_t> reduced_dims;
|
||||
const Tensor* input = ctx->Input<Tensor>(0);
|
||||
|
||||
//override the attribute value with the input value for reduction_axes
|
||||
const Tensor* axes_tensor = ctx->Input<Tensor>(1);
|
||||
ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
|
||||
ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1,
|
||||
"An axes tensor must be a vector tensor.");
|
||||
auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
|
||||
const auto* data = axes_tensor->template Data<int64_t>();
|
||||
std::vector<int64_t> axes(data, data + nDims);
|
||||
if (axes.size() > 0) {
|
||||
ORT_ENFORCE(noop_with_empty_axes_ == false, "Noop when axes is not empty is not allowed.");
|
||||
}
|
||||
|
||||
// empty axes and no-op
|
||||
if (axes.empty() && noop_with_empty_axes_) {
|
||||
auto* output = ctx->Output(0, input->Shape());
|
||||
memcpy(output->template MutableData<T>(), input->template Data<T>(), input->SizeInBytes() * sizeof(T));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes, keepdims_, reduced_dims, true);
|
||||
|
||||
auto* output = ctx->Output(0, reduced_dims);
|
||||
|
||||
ReduceSumCore(input->template Data<T>(), output->template MutableData<T>(),
|
||||
no_transpose, blocks, block_size, transposed_input_data, ctx->GetOperatorThreadPool());
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Status ReduceSumSquare<T>::Compute(OpKernelContext* ctx) const {
|
||||
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
|
||||
|
|
@ -870,4 +811,14 @@ template class ReduceSum<int32_t>;
|
|||
template class ReduceSum<double>;
|
||||
template class ReduceSum<int64_t>;
|
||||
|
||||
#define REGISTER_REDUCESUMCORE_TYPED(T) \
|
||||
template void ReduceSumCore<T>(const T* input_data, T* output_data, bool no_transpose, \
|
||||
int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data, \
|
||||
concurrency::ThreadPool* tp);
|
||||
|
||||
REGISTER_REDUCESUMCORE_TYPED(float)
|
||||
REGISTER_REDUCESUMCORE_TYPED(double)
|
||||
REGISTER_REDUCESUMCORE_TYPED(int32_t)
|
||||
REGISTER_REDUCESUMCORE_TYPED(int64_t)
|
||||
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -7,9 +7,28 @@
|
|||
#include "core/common/common.h"
|
||||
#include "core/common/optional.h"
|
||||
#include "core/framework/op_kernel.h"
|
||||
#include "core/providers/cpu/containers.h"
|
||||
#include "core/util/math_cpuonly.h"
|
||||
#include "core/platform/threadpool.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
|
||||
template <typename T>
|
||||
bool PrepareForReduce(const Tensor* input_tensor_ptr,
|
||||
FastAllocVector<T>& transposed_input_data,
|
||||
int64_t& block_size,
|
||||
int64_t& blocks,
|
||||
const std::vector<int64_t>& axes_,
|
||||
bool keepdims_,
|
||||
/*out*/ std::vector<int64_t>& reduced_dims,
|
||||
bool check_no_transpose = false,
|
||||
const TensorShape* input_shape_override = nullptr);
|
||||
|
||||
template <typename T>
|
||||
void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose,
|
||||
int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data,
|
||||
concurrency::ThreadPool* tp);
|
||||
|
||||
template <bool allow_multi_axes>
|
||||
class ReduceKernelBase {
|
||||
protected:
|
||||
|
|
@ -132,21 +151,6 @@ class ReduceSum final : public ReduceKernel<true> {
|
|||
const TensorShape* input_shape_override = nullptr);
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
class ReduceSumTraining final : public ReduceKernel<true> {
|
||||
public:
|
||||
ReduceSumTraining(const OpKernelInfo& info) : ReduceKernel<true>(info) {
|
||||
}
|
||||
|
||||
Status Compute(OpKernelContext* context) const override;
|
||||
|
||||
// For external calls requiring ReduceSumTraining implementation - will return the reduced output.
|
||||
//`input_shape_override` overrides the shape of `input` for compute purposes.
|
||||
static Tensor Impl(const Tensor& input, const std::vector<int64_t>& reduce_axes,
|
||||
AllocatorPtr allocator, concurrency::ThreadPool* tp, bool keep_dims,
|
||||
const TensorShape* input_shape_override = nullptr);
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
class ReduceSumSquare final : public ReduceKernel<true> {
|
||||
public:
|
||||
|
|
|
|||
|
|
@ -60,56 +60,6 @@ namespace cuda {
|
|||
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
|
||||
name<T>);
|
||||
|
||||
#define REGISTER_MS_KERNEL_TYPED(name, T) \
|
||||
ONNX_OPERATOR_TYPED_KERNEL_EX( \
|
||||
name, \
|
||||
kMSDomain, \
|
||||
1, \
|
||||
T, \
|
||||
kCudaExecutionProvider, \
|
||||
KernelDefBuilder() \
|
||||
.InputMemoryType<OrtMemTypeCPUInput>(1) \
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
|
||||
name<T>);
|
||||
|
||||
// CUDA's reduction descriptor cudnnReduceTensorDescriptor_t is a pointer so
|
||||
// it's safer to wrap it with automatically memory deleter as CudnnReduceDescriptor.
|
||||
// An implicit caster from CudnnReduceDescriptor to cudnnReduceTensorDescriptor_t
|
||||
// is implemented below, so CUDA can seamlessly work.
|
||||
class CudnnReduceDescriptor final {
|
||||
public:
|
||||
CudnnReduceDescriptor() : desc_(nullptr) {
|
||||
}
|
||||
|
||||
~CudnnReduceDescriptor() {
|
||||
if (desc_ != nullptr) {
|
||||
cudnnDestroyReduceTensorDescriptor(desc_);
|
||||
desc_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
CudnnReduceDescriptor(const CudnnReduceDescriptor&) = delete;
|
||||
CudnnReduceDescriptor& operator=(const CudnnReduceDescriptor&) = delete;
|
||||
|
||||
Status Set(cudnnReduceTensorOp_t op, cudnnDataType_t type, cudnnReduceTensorIndices_t indices) {
|
||||
if (!desc_)
|
||||
CUDNN_RETURN_IF_ERROR(cudnnCreateReduceTensorDescriptor(&desc_));
|
||||
|
||||
CUDNN_RETURN_IF_ERROR(cudnnSetReduceTensorDescriptor(
|
||||
desc_,
|
||||
op,
|
||||
type,
|
||||
CUDNN_PROPAGATE_NAN,
|
||||
indices,
|
||||
CUDNN_32BIT_INDICES)); // currently only the 32-bit (unsigned int) type is supported.
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
operator cudnnReduceTensorDescriptor_t() const { return desc_; }
|
||||
|
||||
private:
|
||||
cudnnReduceTensorDescriptor_t desc_;
|
||||
};
|
||||
|
||||
// TODO ReduceKernel::ReduceKernelShared() is still used by some other training classes though it's not used here - this should be refactored.
|
||||
template <bool allow_multi_axes>
|
||||
|
|
@ -314,11 +264,11 @@ template Status ReduceKernel<true>::ReduceKernelShared<MLFloat16, MLFloat16, CUD
|
|||
std::vector<int64_t>& output_dims) const;
|
||||
|
||||
// `input_shape_override` (if provided) is the input shape for compute purposes
|
||||
static Status PrepareForReduce(const Tensor* X,
|
||||
bool keepdims,
|
||||
const std::vector<int64_t>& axes,
|
||||
PrepareReduceMetadata& prepare_reduce_metadata,
|
||||
const TensorShape* input_shape_override = nullptr) {
|
||||
Status PrepareForReduce(const Tensor* X,
|
||||
bool keepdims,
|
||||
const std::vector<int64_t>& axes,
|
||||
PrepareReduceMetadata& prepare_reduce_metadata,
|
||||
const TensorShape* input_shape_override) {
|
||||
ORT_ENFORCE(nullptr != X);
|
||||
|
||||
const TensorShape& input_shape = input_shape_override ? *input_shape_override : X->Shape();
|
||||
|
|
@ -391,11 +341,11 @@ static Status PrepareForReduce(const Tensor* X,
|
|||
|
||||
// `input_shape_override` is the input shape for compute purposes (if provided)
|
||||
template <typename T, cudnnReduceTensorIndices_t ReduceTensorIndices>
|
||||
static Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata,
|
||||
/*out*/ Tensor& output, cudnnReduceTensorOp_t cudnn_reduce_op,
|
||||
const std::vector<int64_t>& axes,
|
||||
bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction,
|
||||
const TensorShape* input_shape_override = nullptr) {
|
||||
Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata,
|
||||
/*out*/ Tensor& output, cudnnReduceTensorOp_t cudnn_reduce_op,
|
||||
const std::vector<int64_t>& axes,
|
||||
bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction,
|
||||
const TensorShape* input_shape_override) {
|
||||
typedef typename ToCudaType<T>::MappedType CudaT;
|
||||
const TensorShape& input_shape = input_shape_override ? *input_shape_override : input.Shape();
|
||||
|
||||
|
|
@ -610,33 +560,10 @@ template <typename T, cudnnReduceTensorIndices_t ReduceTensorIndices>
|
|||
Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, cudnnReduceTensorOp_t cudnn_reduce_op) const {
|
||||
const Tensor* X = ctx->Input<Tensor>(0);
|
||||
|
||||
const std::string& op_name = this->KernelDef().OpName();
|
||||
std::vector<int64_t> axes_values = axes_;
|
||||
if (op_name == "ReduceSumTraining") {
|
||||
//override the attribute value with the input value for reduction_axes
|
||||
const Tensor* axes_tensor = ctx->Input<Tensor>(1);
|
||||
ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
|
||||
ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor.");
|
||||
auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
|
||||
const auto* data = axes_tensor->template Data<int64_t>();
|
||||
std::vector<int64_t> axes(data, data + nDims);
|
||||
axes_values = axes;
|
||||
if (axes.size() > 0) {
|
||||
ORT_ENFORCE(noop_with_empty_axes_ == false, "Noop when axes is not empty is not allowed.");
|
||||
}
|
||||
|
||||
// empty axes and no-op
|
||||
if (axes.empty() && noop_with_empty_axes_) {
|
||||
auto* Y = ctx->Output(0, X->Shape());
|
||||
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData<T>(), X->template Data<T>(), X->SizeInBytes() * sizeof(T), cudaMemcpyDeviceToDevice));
|
||||
return Status::OK();
|
||||
}
|
||||
}
|
||||
|
||||
PrepareReduceMetadata prepare_reduce_metadata;
|
||||
ORT_RETURN_IF_ERROR(PrepareForReduce(X,
|
||||
keepdims_,
|
||||
axes_values,
|
||||
axes_,
|
||||
prepare_reduce_metadata));
|
||||
Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims);
|
||||
bool fast_reduction = fast_reduction_;
|
||||
|
|
@ -646,7 +573,7 @@ Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, cudnnRe
|
|||
fast_reduction = false;
|
||||
}
|
||||
|
||||
return ReduceComputeCore<T, ReduceTensorIndices>(*cuda_ep_, *X, prepare_reduce_metadata, *Y, cudnn_reduce_op, axes_values,
|
||||
return ReduceComputeCore<T, ReduceTensorIndices>(*cuda_ep_, *X, prepare_reduce_metadata, *Y, cudnn_reduce_op, axes_,
|
||||
calculate_log_, calculate_sqt_, log_sum_exp_, fast_reduction);
|
||||
}
|
||||
|
||||
|
|
@ -657,33 +584,11 @@ Status ReduceKernel<true>::ComputeImpl<int32_t, CUDNN_REDUCE_TENSOR_NO_INDICES>(
|
|||
|
||||
const Tensor* X = ctx->Input<Tensor>(0);
|
||||
|
||||
const std::string& op_name = this->KernelDef().OpName();
|
||||
std::vector<int64_t> axes_values = axes_;
|
||||
if (op_name == "ReduceSumTraining") {
|
||||
//override the attribute value with the input value for reduction_axes
|
||||
const Tensor* axes_tensor = ctx->Input<Tensor>(1);
|
||||
ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor.");
|
||||
auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
|
||||
const auto* data = axes_tensor->template Data<int64_t>();
|
||||
std::vector<int64_t> axes(data, data + nDims);
|
||||
axes_values = axes;
|
||||
if (axes.size() > 0) {
|
||||
ORT_ENFORCE(noop_with_empty_axes_ == false, "Noop when axes is not empty is not allowed.");
|
||||
}
|
||||
|
||||
// empty axes and no-op
|
||||
if (axes.empty() && noop_with_empty_axes_) {
|
||||
auto* Y = ctx->Output(0, X->Shape());
|
||||
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData<int32_t>(), X->template Data<int32_t>(), X->SizeInBytes() * sizeof(int32_t), cudaMemcpyDeviceToDevice));
|
||||
return Status::OK();
|
||||
}
|
||||
}
|
||||
|
||||
PrepareReduceMetadata prepare_reduce_metadata;
|
||||
|
||||
ORT_RETURN_IF_ERROR(PrepareForReduce(X,
|
||||
keepdims_,
|
||||
axes_values,
|
||||
axes_,
|
||||
prepare_reduce_metadata));
|
||||
|
||||
Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims);
|
||||
|
|
@ -982,11 +887,6 @@ REGISTER_KERNEL_TYPED_12(ReduceMin, int32_t)
|
|||
REGISTER_KERNEL_TYPED_12(ReduceMin, int8_t)
|
||||
REGISTER_KERNEL_TYPED_12(ReduceMin, uint8_t)
|
||||
|
||||
REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, MLFloat16)
|
||||
REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, float)
|
||||
REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, double)
|
||||
REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, int32_t)
|
||||
|
||||
REGISTER_KERNEL_HFD(ReduceProd)
|
||||
REGISTER_KERNEL_HFD(ReduceSum)
|
||||
REGISTER_KERNEL_HFD(ReduceLogSum)
|
||||
|
|
|
|||
|
|
@ -59,6 +59,10 @@ class ReduceKernel : public CudaKernel, public ReduceKernelBase<allow_multi_axes
|
|||
template <typename T, cudnnReduceTensorIndices_t ReduceTensorIndices = CUDNN_REDUCE_TENSOR_NO_INDICES>
|
||||
Status ComputeImpl(OpKernelContext* ctx, cudnnReduceTensorOp_t cudnn_reduce_op) const;
|
||||
|
||||
// Used by ReduceSumTraining which will have axes as input
|
||||
template <typename T, cudnnReduceTensorIndices_t ReduceTensorIndices = CUDNN_REDUCE_TENSOR_NO_INDICES>
|
||||
Status ComputeImplEx(OpKernelContext* ctx, cudnnReduceTensorOp_t cudnn_reduce_op) const;
|
||||
|
||||
template <typename T, typename OutT, cudnnReduceTensorIndices_t ReduceTensorIndices>
|
||||
Status ReduceKernelShared(
|
||||
const T* X,
|
||||
|
|
@ -175,18 +179,6 @@ class ReduceSum final : public ReduceKernel<true> {
|
|||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
class ReduceSumTraining final : public ReduceKernel<true> {
|
||||
public:
|
||||
ReduceSumTraining(const OpKernelInfo& info) : ReduceKernel<true>(info) {
|
||||
fast_reduction_ = true;
|
||||
}
|
||||
|
||||
Status ComputeInternal(OpKernelContext* ctx) const override {
|
||||
return ComputeImpl<T>(ctx, CUDNN_REDUCE_TENSOR_ADD);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
class ReduceLogSum final : public ReduceKernel<true> {
|
||||
public:
|
||||
|
|
@ -223,5 +215,57 @@ class ReduceLogSumExp final : public ReduceKernel<true> {
|
|||
}
|
||||
};
|
||||
|
||||
Status PrepareForReduce(const Tensor* X,
|
||||
bool keepdims,
|
||||
const std::vector<int64_t>& axes,
|
||||
PrepareReduceMetadata& prepare_reduce_metadata,
|
||||
const TensorShape* input_shape_override = nullptr);
|
||||
|
||||
template <typename T, cudnnReduceTensorIndices_t ReduceTensorIndices>
|
||||
Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata,
|
||||
/*out*/ Tensor& output, cudnnReduceTensorOp_t cudnn_reduce_op,
|
||||
const std::vector<int64_t>& axes,
|
||||
bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction,
|
||||
const TensorShape* input_shape_override = nullptr);
|
||||
|
||||
// CUDA's reduction descriptor cudnnReduceTensorDescriptor_t is a pointer so
|
||||
// it's safer to wrap it with automatically memory deleter as CudnnReduceDescriptor.
|
||||
// An implicit caster from CudnnReduceDescriptor to cudnnReduceTensorDescriptor_t
|
||||
// is implemented below, so CUDA can seamlessly work.
|
||||
class CudnnReduceDescriptor final {
|
||||
public:
|
||||
CudnnReduceDescriptor() : desc_(nullptr) {
|
||||
}
|
||||
|
||||
~CudnnReduceDescriptor() {
|
||||
if (desc_ != nullptr) {
|
||||
cudnnDestroyReduceTensorDescriptor(desc_);
|
||||
desc_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
CudnnReduceDescriptor(const CudnnReduceDescriptor&) = delete;
|
||||
CudnnReduceDescriptor& operator=(const CudnnReduceDescriptor&) = delete;
|
||||
|
||||
Status Set(cudnnReduceTensorOp_t op, cudnnDataType_t type, cudnnReduceTensorIndices_t indices) {
|
||||
if (!desc_)
|
||||
CUDNN_RETURN_IF_ERROR(cudnnCreateReduceTensorDescriptor(&desc_));
|
||||
|
||||
CUDNN_RETURN_IF_ERROR(cudnnSetReduceTensorDescriptor(
|
||||
desc_,
|
||||
op,
|
||||
type,
|
||||
CUDNN_PROPAGATE_NAN,
|
||||
indices,
|
||||
CUDNN_32BIT_INDICES)); // currently only the 32-bit (unsigned int) type is supported.
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
operator cudnnReduceTensorDescriptor_t() const { return desc_; }
|
||||
|
||||
private:
|
||||
cudnnReduceTensorDescriptor_t desc_;
|
||||
};
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -0,0 +1,66 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "orttraining/training_ops/cpu/reduction/reduction_ops.h"
|
||||
#include "core/providers/common.h"
|
||||
#include "core/util/math_cpuonly.h"
|
||||
#include "core/providers/cpu/containers.h"
|
||||
#include "core/platform/threadpool.h"
|
||||
|
||||
using namespace std;
|
||||
namespace onnxruntime {
|
||||
namespace contrib {
|
||||
|
||||
#define REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(T) \
|
||||
ONNX_OPERATOR_TYPED_KERNEL_EX( \
|
||||
ReduceSumTraining, \
|
||||
kMSDomain, \
|
||||
1, \
|
||||
T, \
|
||||
kCpuExecutionProvider, \
|
||||
KernelDefBuilder() \
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
|
||||
ReduceSumTraining<T>);
|
||||
|
||||
REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(float)
|
||||
REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(double)
|
||||
REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int32_t)
|
||||
REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int64_t)
|
||||
|
||||
|
||||
template <typename T>
|
||||
Status ReduceSumTraining<T>::Compute(OpKernelContext* ctx) const {
|
||||
FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
std::vector<int64_t> reduced_dims;
|
||||
const Tensor* input = ctx->Input<Tensor>(0);
|
||||
|
||||
//override the attribute value with the input value for reduction_axes
|
||||
const Tensor* axes_tensor = ctx->Input<Tensor>(1);
|
||||
ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
|
||||
ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1,
|
||||
"An axes tensor must be a vector tensor.");
|
||||
auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
|
||||
const auto* data = axes_tensor->template Data<int64_t>();
|
||||
std::vector<int64_t> axes(data, data + nDims);
|
||||
|
||||
// empty axes and no-op
|
||||
if (axes.empty() && noop_with_empty_axes_) {
|
||||
auto* output = ctx->Output(0, input->Shape());
|
||||
memcpy(output->template MutableData<T>(), input->template Data<T>(), input->SizeInBytes());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes, keepdims_, reduced_dims, true);
|
||||
|
||||
auto* output = ctx->Output(0, reduced_dims);
|
||||
|
||||
ReduceSumCore(input->template Data<T>(), output->template MutableData<T>(),
|
||||
no_transpose, blocks, block_size, transposed_input_data, ctx->GetOperatorThreadPool());
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
} // namespace contrib
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "core/providers/cpu/reduction/reduction_ops.h"
|
||||
#include "core/common/common.h"
|
||||
#include "core/common/optional.h"
|
||||
#include "core/framework/op_kernel.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace contrib {
|
||||
|
||||
template <typename T>
|
||||
class ReduceSumTraining final : public ReduceKernel<true> {
|
||||
public:
|
||||
ReduceSumTraining(const OpKernelInfo& info) : ReduceKernel<true>(info) {
|
||||
}
|
||||
|
||||
Status Compute(OpKernelContext* context) const override;
|
||||
|
||||
// For external calls requiring ReduceSumTraining implementation - will return the reduced output.
|
||||
//`input_shape_override` overrides the shape of `input` for compute purposes.
|
||||
static Tensor Impl(const Tensor& input, const std::vector<int64_t>& reduce_axes,
|
||||
AllocatorPtr allocator, concurrency::ThreadPool* tp, bool keep_dims,
|
||||
const TensorShape* input_shape_override = nullptr);
|
||||
};
|
||||
|
||||
} // namespace contrib
|
||||
} // namespace onnxruntime
|
||||
|
||||
|
|
@ -0,0 +1,168 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "orttraining/training_ops/cuda/reduction/reduction_ops.h"
|
||||
#include "core/providers/common.h"
|
||||
#include "core/providers/cuda/cudnn_common.h"
|
||||
#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
|
||||
#include "core/providers/cuda/math/binary_elementwise_ops_impl.h"
|
||||
#include "core/providers/cuda/math/binary_elementwise_ops.h"
|
||||
#include "core/providers/cpu/tensor/utils.h"
|
||||
#include "core/framework/op_kernel_context_internal.h"
|
||||
|
||||
using namespace onnxruntime::common;
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
#define REGISTER_MS_KERNEL_TYPED(name, T) \
|
||||
ONNX_OPERATOR_TYPED_KERNEL_EX( \
|
||||
name, \
|
||||
kMSDomain, \
|
||||
1, \
|
||||
T, \
|
||||
kCudaExecutionProvider, \
|
||||
KernelDefBuilder() \
|
||||
.InputMemoryType<OrtMemTypeCPUInput>(1) \
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
|
||||
name<T>);
|
||||
|
||||
REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, MLFloat16)
|
||||
REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, float)
|
||||
REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, double)
|
||||
REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, int32_t)
|
||||
|
||||
template <bool allow_multi_axes>
|
||||
template <typename T, cudnnReduceTensorIndices_t ReduceTensorIndices>
|
||||
Status ReduceKernel<allow_multi_axes>::ComputeImplEx(OpKernelContext* ctx, cudnnReduceTensorOp_t cudnn_reduce_op) const {
|
||||
const Tensor* X = ctx->Input<Tensor>(0);
|
||||
|
||||
//override the attribute value with the input value for reduction_axes
|
||||
const Tensor* axes_tensor = ctx->Input<Tensor>(1);
|
||||
ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
|
||||
ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor.");
|
||||
auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
|
||||
const auto* data = axes_tensor->template Data<int64_t>();
|
||||
std::vector<int64_t> axes(data, data + nDims);
|
||||
|
||||
// empty axes and no-op
|
||||
if (axes.empty() && noop_with_empty_axes_) {
|
||||
auto* Y = ctx->Output(0, X->Shape());
|
||||
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData<T>(), X->template Data<T>(), X->SizeInBytes(), cudaMemcpyDeviceToDevice));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
||||
PrepareReduceMetadata prepare_reduce_metadata;
|
||||
ORT_RETURN_IF_ERROR(PrepareForReduce(X,
|
||||
keepdims_,
|
||||
axes,
|
||||
prepare_reduce_metadata));
|
||||
Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims);
|
||||
bool fast_reduction = fast_reduction_;
|
||||
if (fast_reduction) {
|
||||
auto ctx_internal = static_cast<OpKernelContextInternal*>(ctx);
|
||||
if (ctx_internal && ctx_internal->GetUseDeterministicCompute())
|
||||
fast_reduction = false;
|
||||
}
|
||||
|
||||
return ReduceComputeCore<T, ReduceTensorIndices>(*cuda_ep_, *X, prepare_reduce_metadata, *Y, cudnn_reduce_op, axes,
|
||||
calculate_log_, calculate_sqt_, log_sum_exp_, fast_reduction);
|
||||
}
|
||||
|
||||
template <>
|
||||
template <>
|
||||
Status ReduceKernel<true>::ComputeImplEx<int32_t, CUDNN_REDUCE_TENSOR_NO_INDICES>(OpKernelContext* ctx, cudnnReduceTensorOp_t cudnn_reduce_op) const {
|
||||
typedef typename ToCudaType<int32_t>::MappedType CudaT;
|
||||
|
||||
const Tensor* X = ctx->Input<Tensor>(0);
|
||||
|
||||
//override the attribute value with the input value for reduction_axes
|
||||
const Tensor* axes_tensor = ctx->Input<Tensor>(1);
|
||||
ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor.");
|
||||
auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
|
||||
const auto* data = axes_tensor->template Data<int64_t>();
|
||||
std::vector<int64_t> axes(data, data + nDims);
|
||||
|
||||
// empty axes and no-op
|
||||
if (axes.empty() && noop_with_empty_axes_) {
|
||||
auto* Y = ctx->Output(0, X->Shape());
|
||||
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData<int32_t>(), X->template Data<int32_t>(), X->SizeInBytes(), cudaMemcpyDeviceToDevice));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
||||
|
||||
PrepareReduceMetadata prepare_reduce_metadata;
|
||||
|
||||
ORT_RETURN_IF_ERROR(PrepareForReduce(X,
|
||||
keepdims_,
|
||||
axes,
|
||||
prepare_reduce_metadata));
|
||||
|
||||
Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims);
|
||||
|
||||
int64_t input_count = prepare_reduce_metadata.input_count;
|
||||
int64_t output_count = prepare_reduce_metadata.output_count;
|
||||
std::vector<int64_t>& input_dims_cudnn = prepare_reduce_metadata.input_dims_cudnn;
|
||||
std::vector<int64_t>& output_dims_cudnn = prepare_reduce_metadata.output_dims_cudnn;
|
||||
|
||||
// special case when there is a dim value of 0 in the shape.
|
||||
if (input_count == 0) {
|
||||
assert(Y->Shape().Size() == 0);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// cudnnReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case
|
||||
if (input_count == output_count) {
|
||||
if (Y->template MutableData<int32_t>() != X->template Data<int32_t>()) {
|
||||
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData<int32_t>(), X->template Data<int32_t>(), input_count * sizeof(int32_t), cudaMemcpyDeviceToDevice));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000.
|
||||
// Therefore zeroing out the memory is required
|
||||
CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes()));
|
||||
|
||||
size_t indices_bytes = 0;
|
||||
size_t workspace_bytes = 0;
|
||||
CudnnTensor input_tensor;
|
||||
CudnnTensor output_tensor;
|
||||
CudnnReduceDescriptor reduce_desc;
|
||||
|
||||
cudnnDataType_t cudnn_type_X = CUDNN_DATA_FLOAT;
|
||||
IAllocatorUniquePtr<float> temp_X = GetScratchBuffer<float>(input_count);
|
||||
Impl_Cast<CudaT, float>(reinterpret_cast<const CudaT*>(X->template Data<int32_t>()), temp_X.get(), X->Shape().Size());
|
||||
|
||||
ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, CUDNN_REDUCE_TENSOR_FLATTENED_INDICES));
|
||||
ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_cudnn, cudnn_type_X));
|
||||
ORT_RETURN_IF_ERROR(output_tensor.Set(output_dims_cudnn, cudnn_type_X));
|
||||
CUDNN_RETURN_IF_ERROR(cudnnGetReductionIndicesSize(CudnnHandle(), reduce_desc, input_tensor, output_tensor, &indices_bytes));
|
||||
CUDNN_RETURN_IF_ERROR(cudnnGetReductionWorkspaceSize(CudnnHandle(), reduce_desc, input_tensor, output_tensor, &workspace_bytes));
|
||||
IAllocatorUniquePtr<uint32_t> indices_cuda = GetScratchBuffer<uint32_t>(indices_bytes);
|
||||
IAllocatorUniquePtr<CudaT> workspace_cuda = GetScratchBuffer<CudaT>(workspace_bytes);
|
||||
|
||||
const auto one = Consts<float>::One;
|
||||
const auto zero = Consts<float>::Zero;
|
||||
auto temp_Y = GetScratchBuffer<float>(output_count);
|
||||
CUDNN_RETURN_IF_ERROR(cudnnReduceTensor(CudnnHandle(),
|
||||
reduce_desc,
|
||||
indices_cuda.get(),
|
||||
indices_bytes,
|
||||
workspace_cuda.get(),
|
||||
workspace_bytes,
|
||||
&one,
|
||||
input_tensor,
|
||||
temp_X.get(),
|
||||
&zero,
|
||||
output_tensor,
|
||||
temp_Y.get()));
|
||||
|
||||
Impl_Cast<float, int32_t>(temp_Y.get(), Y->template MutableData<int32_t>(), output_count);
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -0,0 +1,26 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
#include "core/common/optional.h"
|
||||
#include "core/providers/cuda/cuda_common.h"
|
||||
#include "core/providers/cuda/reduction/reduction_ops.h"
|
||||
#include "core/providers/cuda/reduction/reduction_functions.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
template <typename T>
|
||||
class ReduceSumTraining final : public ReduceKernel<true> {
|
||||
public:
|
||||
ReduceSumTraining(const OpKernelInfo& info) : ReduceKernel<true>(info) {
|
||||
fast_reduction_ = true;
|
||||
}
|
||||
|
||||
Status ComputeInternal(OpKernelContext* ctx) const override {
|
||||
return ComputeImplEx<T>(ctx, CUDNN_REDUCE_TENSOR_ADD);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -80,6 +80,16 @@ std::vector<int64_t> GetTypeSpecificDataFromTensorProto(
|
|||
onnx::TensorProto tensorProto) {
|
||||
return std::vector<int64_t>(std::begin(tensorProto.int64_data()), std::end(tensorProto.int64_data()));
|
||||
}
|
||||
template <>
|
||||
std::vector<uint8_t> GetTypeSpecificDataFromTensorProto(
|
||||
onnx::TensorProto tensorProto) {
|
||||
return std::vector<uint8_t>(std::begin(tensorProto.int32_data()), std::end(tensorProto.int32_data()));
|
||||
}
|
||||
template <>
|
||||
std::vector<double> GetTypeSpecificDataFromTensorProto(
|
||||
onnx::TensorProto tensorProto) {
|
||||
return std::vector<double>(std::begin(tensorProto.double_data()), std::end(tensorProto.double_data()));
|
||||
}
|
||||
|
||||
template <typename DataType>
|
||||
std::vector<DataType> GetTensorDataFromTensorProto(
|
||||
|
|
@ -139,6 +149,10 @@ ITensor ProtobufHelpers::LoadTensorFromProtobufFile(
|
|||
return TensorInt64Bit::CreateFromIterable(tensorShape, GetTensorDataFromTensorProto<int64_t>(tensorProto, elementCount));
|
||||
case (onnx::TensorProto::DataType::TensorProto_DataType_STRING):
|
||||
return TensorString::CreateFromIterable(tensorShape, GetTensorStringDataFromTensorProto(tensorProto, elementCount));
|
||||
case (onnx::TensorProto::DataType::TensorProto_DataType_UINT8):
|
||||
return TensorUInt8Bit::CreateFromIterable(tensorShape, GetTensorDataFromTensorProto<uint8_t>(tensorProto, elementCount));
|
||||
case (onnx::TensorProto::DataType::TensorProto_DataType_DOUBLE):
|
||||
return TensorDouble::CreateFromIterable(tensorShape, GetTensorDataFromTensorProto<double>(tensorProto, elementCount));
|
||||
default:
|
||||
throw winrt::hresult_invalid_argument(L"Tensor type for creating tensor from protobuf file not supported.");
|
||||
break;
|
||||
|
|
|
|||
Loading…
Reference in a new issue