mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-04 04:07:22 +00:00
Update Dropout(12) forward kernel with training_mode input. (#3805)
* Update Dropout(12) forward and backward kernel with training_mode input. * Revert deleted assert. * clean up. * PR feedback.
This commit is contained in:
parent
111469728f
commit
a24c71af40
9 changed files with 192 additions and 202 deletions
|
|
@ -34,7 +34,7 @@ const Tensor& FetchTensor(const OrtValue& ort_value) {
|
|||
}
|
||||
|
||||
void RunDropoutTest(const char* op, const bool use_mask, const std::vector<int64_t>& input_shape, float ratio = -1,
|
||||
bool use_float16_ratio = false) {
|
||||
bool training_mode = true, bool use_float16_ratio = false) {
|
||||
OpTester t{op, k_dropout_opset_version, kOnnxDomain};
|
||||
|
||||
const auto input_size = std::accumulate(
|
||||
|
|
@ -47,12 +47,17 @@ void RunDropoutTest(const char* op, const bool use_mask, const std::vector<int64
|
|||
t.AddInput("data", input_shape, input);
|
||||
if (ratio == -1) {
|
||||
ratio = 0.5; // default.
|
||||
t.AddInput("ratio", {}, {ratio});
|
||||
} else if (use_float16_ratio) {
|
||||
t.AddInput("ratio", {}, {MLFloat16(0)});
|
||||
} else {
|
||||
t.AddInput("ratio", {}, {ratio});
|
||||
}
|
||||
|
||||
if (strcmp(op, "TrainableDropout") != 0 && training_mode) {
|
||||
t.AddInput("training_mode", {}, {true});
|
||||
}
|
||||
|
||||
t.AddOutput<float>("output", input_shape, input); // we'll do our own output verification
|
||||
|
||||
std::unique_ptr<bool[]> mask_buffer{};
|
||||
|
|
@ -117,7 +122,7 @@ TEST(DropoutTest, Mask) {
|
|||
}
|
||||
|
||||
TEST(DropoutTest, RatioLimit) {
|
||||
RunDropoutTest("Dropout", true, {1000}, 0.0f);
|
||||
RunDropoutTest("Dropout", true, {1000}, 0.0f, false);
|
||||
}
|
||||
|
||||
TEST(DropoutTest, EmptyRatio) {
|
||||
|
|
@ -125,7 +130,7 @@ TEST(DropoutTest, EmptyRatio) {
|
|||
}
|
||||
|
||||
TEST(DropoutTest, Float16Ratio) {
|
||||
RunDropoutTest("Dropout", true, {1000}, 0.0f, true);
|
||||
RunDropoutTest("Dropout", true, {1000}, 0.0f, true, true);
|
||||
}
|
||||
|
||||
TEST(TrainableDropoutTest, Basic) {
|
||||
|
|
@ -137,15 +142,15 @@ TEST(TrainableDropoutTest, Mask) {
|
|||
}
|
||||
|
||||
TEST(TrainableDropoutTest, RatioLimit) {
|
||||
RunDropoutTest("TrainableDropout", true, {1000}, 0.0f);
|
||||
RunDropoutTest("TrainableDropout", true, {1000}, 0.0f, false);
|
||||
}
|
||||
|
||||
TEST(TrainableDropoutTest, EmptyRatio) {
|
||||
RunDropoutTest("TrainableDropout", true, {1000});
|
||||
RunDropoutTest("TrainableDropout", true, {1000}, -1);
|
||||
}
|
||||
|
||||
TEST(TrainableDropoutTest, Float16Ratio) {
|
||||
RunDropoutTest("TrainableDropout", true, {1000}, 0.0f, true);
|
||||
RunDropoutTest("TrainableDropout", true, {1000}, 0.0f, true, true);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
|
|
|||
|
|
@ -44,6 +44,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain,
|
|||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, double_MLFloat16, TrainableDropout);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, double_float, TrainableDropout);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, double_double, TrainableDropout);
|
||||
|
||||
// REVIEW(mzs): ConstEigenVectorArrayMap.cast<MLFLoat16) does not seem to be supported.
|
||||
// However these types work on GPU implementation.
|
||||
//class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MLFloat16_MLFloat16, TrainableDropoutGrad);
|
||||
|
|
|
|||
|
|
@ -28,10 +28,10 @@ float GetRatioOrDefault(const Tensor* ratio_tensor) {
|
|||
} // namespace
|
||||
|
||||
// Dropout
|
||||
#define REGISTER_KERNEL_TYPED(OpName, Domain, VER, T1, T2) \
|
||||
#define REGISTER_KERNEL_TYPED(OpName, VER, T1, T2, Trainable) \
|
||||
ONNX_OPERATOR_TYPED_KERNEL_EX( \
|
||||
OpName, \
|
||||
Domain, \
|
||||
kOnnxDomain, \
|
||||
VER, \
|
||||
T1##_##T2, \
|
||||
kCpuExecutionProvider, \
|
||||
|
|
@ -39,7 +39,16 @@ float GetRatioOrDefault(const Tensor* ratio_tensor) {
|
|||
.TypeConstraint("T", DataTypeImpl::GetTensorType<T1>()) \
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T2>()) \
|
||||
.TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>()), \
|
||||
OpName<T1, T2>);
|
||||
Dropout<T1, T2, Trainable>);
|
||||
|
||||
// Temporary for backward compatibility, will eventually get rid of TrainableDropout when PyTorch exporter will move to
|
||||
// opset-12.
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, 9, float, MLFloat16, true)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, 9, float, float, true)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, 9, float, double, true)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, 9, double, MLFloat16, true)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, 9, double, float, true)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, 9, double, double, true)
|
||||
|
||||
// REVIEW(mzs): ConstEigenVectorArrayMap.cast<MLFLoat16) does not seem to be supported.
|
||||
// However these types work on GPU implementation.
|
||||
|
|
@ -47,26 +56,22 @@ float GetRatioOrDefault(const Tensor* ratio_tensor) {
|
|||
// REGISTER_KERNEL_TYPED(MLFloat16, float)
|
||||
// REGISTER_KERNEL_TYPED(MLFloat16, double)
|
||||
|
||||
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, float, MLFloat16)
|
||||
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, float, float)
|
||||
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, float, double)
|
||||
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, double, MLFloat16)
|
||||
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, double, float)
|
||||
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, double, double)
|
||||
REGISTER_KERNEL_TYPED(Dropout, 12, float, MLFloat16, false)
|
||||
REGISTER_KERNEL_TYPED(Dropout, 12, float, float, false)
|
||||
REGISTER_KERNEL_TYPED(Dropout, 12, float, double, false)
|
||||
REGISTER_KERNEL_TYPED(Dropout, 12, double, MLFloat16, false)
|
||||
REGISTER_KERNEL_TYPED(Dropout, 12, double, float, false)
|
||||
REGISTER_KERNEL_TYPED(Dropout, 12, double, double, false)
|
||||
|
||||
template <typename T1, typename T2>
|
||||
Status Dropout<T1, T2>::Compute(OpKernelContext* context) const {
|
||||
template <typename T1, typename T2, bool trainable_dropout>
|
||||
Status Dropout<T1, T2, trainable_dropout>::Compute(OpKernelContext* context) const {
|
||||
const Tensor* X = context->Input<Tensor>(0);
|
||||
auto X_span = X->DataAsSpan<T1>();
|
||||
|
||||
const Tensor* ratio = context->Input<Tensor>(1); // optional
|
||||
const float ratio_value = GetRatioOrDefault<T2>(ratio);
|
||||
|
||||
const auto& X_shape = X->Shape();
|
||||
|
||||
Tensor* Y = context->Output(0, X_shape);
|
||||
auto Y_span = Y->MutableDataAsSpan<T1>();
|
||||
|
||||
Tensor* mask = context->Output(1, X_shape); // optional
|
||||
std::unique_ptr<bool[]> temp_mask_buffer{}; // temporary buffer to use if mask input is not provided
|
||||
auto mask_span = [&X_shape, mask, &temp_mask_buffer]() {
|
||||
|
|
@ -75,16 +80,21 @@ Status Dropout<T1, T2>::Compute(OpKernelContext* context) const {
|
|||
return gsl::make_span(temp_mask_buffer.get(), X_shape.Size());
|
||||
}();
|
||||
|
||||
ORT_ENFORCE(Y->Shape() == X_shape, "X and Y should have the same shape");
|
||||
ORT_ENFORCE(!mask || mask->Shape() == X_shape, "X and mask should have the same shape");
|
||||
|
||||
if (ratio_value == 0.0f) {
|
||||
const Tensor* training_mode = context->Input<Tensor>(2);
|
||||
if ((0 == ratio_value /*Backward compat with TrainableDropout*/) ||
|
||||
!trainable_dropout && (training_mode == nullptr || *(training_mode->Data<bool>()) == false)) {
|
||||
// drop none
|
||||
if (X_span.data() != Y_span.data()) {
|
||||
std::copy(X_span.begin(), X_span.end(), Y_span.begin());
|
||||
}
|
||||
std::fill(mask_span.begin(), mask_span.end(), true);
|
||||
} else if (ratio_value < 1.0f) {
|
||||
|
||||
if (mask != nullptr) {
|
||||
std::fill(mask_span.begin(), mask_span.end(), true);
|
||||
}
|
||||
|
||||
} else {
|
||||
// drop some
|
||||
ConstEigenVectorArrayMap<T1> X_arr(X_span.data(), X_span.size());
|
||||
EigenVectorArrayMap<T1> Y_arr(Y_span.data(), Y_span.size());
|
||||
|
|
@ -106,33 +116,51 @@ Status Dropout<T1, T2>::Compute(OpKernelContext* context) const {
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
#define REGISTER_GRADIENT_KERNEL_TYPED(OpName, T1, T2) \
|
||||
ONNX_OPERATOR_TYPED_KERNEL_EX( \
|
||||
OpName, \
|
||||
kMSDomain, \
|
||||
1, \
|
||||
T1##_##T2, \
|
||||
kCpuExecutionProvider, \
|
||||
KernelDefBuilder() \
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<T1>()) \
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T2>()) \
|
||||
.TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>()), \
|
||||
DropoutGrad<T1, T2>);
|
||||
|
||||
// DropoutGrad
|
||||
// REVIEW(mzs): ConstEigenVectorArrayMap.cast<MLFLoat16) does not seem to be supported.
|
||||
// REVIEW(codemzs): ConstEigenVectorArrayMap.cast<MLFLoat16) does not seem to be supported.
|
||||
// However these types work on GPU implementation.
|
||||
// REGISTER_GRADIENT_KERNEL_TYPED(MLFloat16, MLFloat16)
|
||||
// REGISTER_GRADIENT_KERNEL_TYPED(MLFloat16, float)
|
||||
// REGISTER_GRADIENT_KERNEL_TYPED(MLFloat16, double)
|
||||
|
||||
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, float, MLFloat16)
|
||||
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, float, float)
|
||||
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, float, double)
|
||||
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, double, MLFloat16)
|
||||
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, double, float)
|
||||
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, double, double)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, float, MLFloat16)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, float, float)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, float, double)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, double, MLFloat16)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, double, float)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, double, double)
|
||||
|
||||
// Temporary for backward compatibility, will eventually get rid of TrainableDropout when PyTorch exporter will move to
|
||||
// opset-12.
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, float, MLFloat16)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, float, float)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, float, double)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, double, MLFloat16)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, double, float)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, double, double)
|
||||
|
||||
template <typename T1, typename T2>
|
||||
Status DropoutGrad<T1, T2>::Compute(OpKernelContext* context) const {
|
||||
const Tensor* dY = context->Input<Tensor>(0);
|
||||
auto dY_span = dY->DataAsSpan<T1>();
|
||||
|
||||
const Tensor* mask = context->Input<Tensor>(1);
|
||||
auto mask_span = mask->DataAsSpan<bool>();
|
||||
|
||||
const Tensor* ratio = context->Input<Tensor>(2); // optional
|
||||
const float ratio_value = GetRatioOrDefault<T2>(ratio);
|
||||
|
||||
const auto& dY_shape = dY->Shape();
|
||||
|
||||
Tensor* dX = context->Output(0, dY_shape);
|
||||
auto dX_span = dX->MutableDataAsSpan<T1>();
|
||||
|
||||
|
|
|
|||
|
|
@ -9,8 +9,8 @@
|
|||
namespace onnxruntime {
|
||||
namespace contrib {
|
||||
|
||||
template <typename T1, typename T2>
|
||||
class Dropout final : public OpKernel {
|
||||
template <typename T1, typename T2, bool trainable_dropout>
|
||||
class Dropout final: public OpKernel {
|
||||
public:
|
||||
Dropout(const OpKernelInfo& info) : OpKernel{info} {
|
||||
int64_t seed = 0;
|
||||
|
|
@ -28,7 +28,8 @@ class Dropout final : public OpKernel {
|
|||
template <typename T1, typename T2>
|
||||
class DropoutGrad final : public OpKernel {
|
||||
public:
|
||||
DropoutGrad(const OpKernelInfo& info) : OpKernel{info} {}
|
||||
DropoutGrad(const OpKernelInfo& info) : OpKernel{info} {
|
||||
}
|
||||
|
||||
Status Compute(OpKernelContext* context) const override;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -1,58 +0,0 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "orttraining/training_ops/cpu/nn/dropout_op.h"
|
||||
#include <chrono>
|
||||
#include "core/util/math_cpuonly.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace contrib {
|
||||
|
||||
// TrainableDropout is the same as Dropout V12.
|
||||
// Registering the operator for the sake of backward compatibility.
|
||||
// Give notice to the users to use Dropout V12 and then deprecate this kernel.
|
||||
|
||||
// TrainableDropout
|
||||
#define REGISTER_KERNEL_TYPED(OpName, Domain, VER, T1, T2, ClassName) \
|
||||
ONNX_OPERATOR_TYPED_KERNEL_EX( \
|
||||
OpName, \
|
||||
Domain, \
|
||||
VER, \
|
||||
T1##_##T2, \
|
||||
kCpuExecutionProvider, \
|
||||
KernelDefBuilder() \
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<T1>()) \
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T2>()) \
|
||||
.TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>()), \
|
||||
ClassName<T1, T2>);
|
||||
|
||||
// REVIEW(mzs): ConstEigenVectorArrayMap.cast<MLFLoat16) does not seem to be supported.
|
||||
// However these types work on GPU implementation.
|
||||
// REGISTER_KERNEL_TYPED(MLFloat16, MLFloat16)
|
||||
// REGISTER_KERNEL_TYPED(MLFloat16, float)
|
||||
// REGISTER_KERNEL_TYPED(MLFloat16, double)
|
||||
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, float, MLFloat16, Dropout)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, float, float, Dropout)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, float, double, Dropout)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, double, MLFloat16, Dropout)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, double, float, Dropout)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, double, double, Dropout)
|
||||
|
||||
|
||||
// TrainableDropoutGrad
|
||||
// REVIEW(mzs): ConstEigenVectorArrayMap.cast<MLFLoat16) does not seem to be supported.
|
||||
// However these types work on GPU implementation.
|
||||
// REGISTER_GRADIENT_KERNEL_TYPED(MLFloat16, MLFloat16)
|
||||
// REGISTER_GRADIENT_KERNEL_TYPED(MLFloat16, float)
|
||||
// REGISTER_GRADIENT_KERNEL_TYPED(MLFloat16, double)
|
||||
|
||||
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, float, MLFloat16, DropoutGrad)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, float, float, DropoutGrad)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, float, double, DropoutGrad)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, double, MLFloat16, DropoutGrad)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, double, float, DropoutGrad)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, double, double, DropoutGrad)
|
||||
|
||||
} // namespace contrib
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -9,33 +9,58 @@
|
|||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
#define REGISTER_KERNEL_TYPED(OpName, Domain, VER, T1, T2, MemIndex) \
|
||||
ONNX_OPERATOR_TYPED_KERNEL_EX( \
|
||||
OpName, \
|
||||
Domain, \
|
||||
VER, \
|
||||
T1##_##T2, \
|
||||
kCudaExecutionProvider, \
|
||||
KernelDefBuilder() \
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<T1>()) \
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T2>()) \
|
||||
.TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>()) \
|
||||
.InputMemoryType<OrtMemTypeCPUInput>(MemIndex), \
|
||||
OpName<T1, T2>);
|
||||
#define REGISTER_KERNEL_TYPED(T1, T2) \
|
||||
ONNX_OPERATOR_TYPED_KERNEL_EX( \
|
||||
Dropout, \
|
||||
kOnnxDomain, \
|
||||
12, \
|
||||
T1##_##T2, \
|
||||
kCudaExecutionProvider, \
|
||||
KernelDefBuilder() \
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<T1>()) \
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T2>()) \
|
||||
.TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>()) \
|
||||
.InputMemoryType<OrtMemTypeCPUInput>(1) \
|
||||
.InputMemoryType<OrtMemTypeCPUInput>(2), \
|
||||
Dropout<T1, T2, false>);
|
||||
|
||||
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, MLFloat16, MLFloat16, 1)
|
||||
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, MLFloat16, float, 1)
|
||||
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, MLFloat16, double, 1)
|
||||
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, float, MLFloat16, 1)
|
||||
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, float, float, 1)
|
||||
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, float, double, 1)
|
||||
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, double, MLFloat16, 1)
|
||||
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, double, float, 1)
|
||||
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, double, double, 1)
|
||||
REGISTER_KERNEL_TYPED(MLFloat16, MLFloat16)
|
||||
REGISTER_KERNEL_TYPED(MLFloat16, float)
|
||||
REGISTER_KERNEL_TYPED(MLFloat16, double)
|
||||
REGISTER_KERNEL_TYPED(float, MLFloat16)
|
||||
REGISTER_KERNEL_TYPED(float, float)
|
||||
REGISTER_KERNEL_TYPED(float, double)
|
||||
REGISTER_KERNEL_TYPED(double, MLFloat16)
|
||||
REGISTER_KERNEL_TYPED(double, float)
|
||||
REGISTER_KERNEL_TYPED(double, double)
|
||||
|
||||
#define REGISTER_TRAINABLE_KERNEL_TYPED(T1, T2) \
|
||||
ONNX_OPERATOR_TYPED_KERNEL_EX( \
|
||||
TrainableDropout, \
|
||||
kOnnxDomain, \
|
||||
9, \
|
||||
T1##_##T2, \
|
||||
kCudaExecutionProvider, \
|
||||
KernelDefBuilder() \
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<T1>()) \
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T2>()) \
|
||||
.InputMemoryType<OrtMemTypeCPUInput>(1), \
|
||||
Dropout<T1, T2, true>);
|
||||
|
||||
template <typename T1, typename T2>
|
||||
Status Dropout<T1, T2>::ComputeInternal(OpKernelContext* context) const {
|
||||
// Temporary for backward compatibility, will eventually get rid of TrainableDropout when PyTorch exporter will move to
|
||||
// opset-12.
|
||||
REGISTER_TRAINABLE_KERNEL_TYPED(MLFloat16, MLFloat16)
|
||||
REGISTER_TRAINABLE_KERNEL_TYPED(MLFloat16, float)
|
||||
REGISTER_TRAINABLE_KERNEL_TYPED(MLFloat16, double)
|
||||
REGISTER_TRAINABLE_KERNEL_TYPED(float, MLFloat16)
|
||||
REGISTER_TRAINABLE_KERNEL_TYPED(float, float)
|
||||
REGISTER_TRAINABLE_KERNEL_TYPED(float, double)
|
||||
REGISTER_TRAINABLE_KERNEL_TYPED(double, MLFloat16)
|
||||
REGISTER_TRAINABLE_KERNEL_TYPED(double, float)
|
||||
REGISTER_TRAINABLE_KERNEL_TYPED(double, double)
|
||||
|
||||
template <typename T1, typename T2, bool trainable_dropout>
|
||||
Status Dropout<T1, T2, trainable_dropout>::ComputeInternal(OpKernelContext* context) const {
|
||||
typedef typename ToCudaType<T1>::MappedType CudaT;
|
||||
|
||||
//Get X_data
|
||||
|
|
@ -52,12 +77,6 @@ Status Dropout<T1, T2>::ComputeInternal(OpKernelContext* context) const {
|
|||
//Get mask_data
|
||||
auto mask = context->Output(1, shape);
|
||||
ORT_ENFORCE(!mask || mask->Shape().Size() == N);
|
||||
IAllocatorUniquePtr<bool> temp_mask_buffer{}; // buffer to use if mask is not provided
|
||||
bool* const mask_data = [this, N, mask, &temp_mask_buffer]() {
|
||||
if (mask) return mask->MutableData<bool>();
|
||||
temp_mask_buffer = GetScratchBuffer<bool>(N);
|
||||
return temp_mask_buffer.get();
|
||||
}();
|
||||
|
||||
//Get the ratio_data
|
||||
float ratio_data;
|
||||
|
|
@ -73,21 +92,70 @@ Status Dropout<T1, T2>::ComputeInternal(OpKernelContext* context) const {
|
|||
}
|
||||
ORT_ENFORCE(ratio_data >= 0.0f && ratio_data < 1.0f);
|
||||
|
||||
const Tensor* training_mode = context->Input<Tensor>(2);
|
||||
//Check for inference mode.
|
||||
if ((0 == ratio_data /*Backward compat with TrainableDropout*/) ||
|
||||
(!trainable_dropout && (training_mode == nullptr || *(training_mode->Data<bool>()) == false))) {
|
||||
if (Y_data != X_data) {
|
||||
CUDA_CALL_THROW(cudaMemcpyAsync(Y_data, X_data, N * sizeof(T1), cudaMemcpyDeviceToDevice));
|
||||
}
|
||||
|
||||
// If mask is requested, return all 1s.
|
||||
if (mask != nullptr) {
|
||||
ORT_ENFORCE(cudaMemset(mask->MutableData<bool>(), true, N * sizeof(bool)) == cudaSuccess);
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
IAllocatorUniquePtr<bool> temp_mask_buffer{}; // buffer to use if mask is not provided
|
||||
bool* const mask_data = [this, N, mask, &temp_mask_buffer]() {
|
||||
if (mask) return mask->MutableData<bool>();
|
||||
temp_mask_buffer = GetScratchBuffer<bool>(N);
|
||||
return temp_mask_buffer.get();
|
||||
}();
|
||||
|
||||
PhiloxGenerator& generator = generator_ != nullptr ? *generator_.get() : PhiloxGenerator::Default();
|
||||
DropoutKernelImpl(GetDeviceProp(), N, ratio_data, generator, X_data, Y_data, mask_data);
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, MLFloat16, MLFloat16, 2)
|
||||
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, MLFloat16, float, 2)
|
||||
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, MLFloat16, double, 2)
|
||||
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, float, MLFloat16, 2)
|
||||
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, float, float, 2)
|
||||
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, float, double, 2)
|
||||
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, double, MLFloat16, 2)
|
||||
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, double, float, 2)
|
||||
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, double, double, 2)
|
||||
#define REGISTER_GRADIENT_KERNEL_TYPED(OpName, T1, T2) \
|
||||
ONNX_OPERATOR_TYPED_KERNEL_EX( \
|
||||
OpName, \
|
||||
kMSDomain, \
|
||||
1, \
|
||||
T1##_##T2, \
|
||||
kCudaExecutionProvider, \
|
||||
KernelDefBuilder() \
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<T1>()) \
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T2>()) \
|
||||
.TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>()) \
|
||||
.InputMemoryType<OrtMemTypeCPUInput>(2), \
|
||||
DropoutGrad<T1, T2>);
|
||||
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, MLFloat16, MLFloat16)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, MLFloat16, float)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, MLFloat16, double)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, float, MLFloat16)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, float, float)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, float, double)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, double, MLFloat16)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, double, float)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, double, double)
|
||||
|
||||
// Temporary for backward compatibility, will eventually get rid of TrainableDropout when PyTorch exporter will move to
|
||||
// opset-12.
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, MLFloat16, MLFloat16)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, MLFloat16, float)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, MLFloat16, double)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, float, MLFloat16)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, float, float)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, float, double)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, double, MLFloat16)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, double, float)
|
||||
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, double, double)
|
||||
|
||||
template <typename T1, typename T2>
|
||||
Status DropoutGrad<T1, T2>::ComputeInternal(OpKernelContext* context) const {
|
||||
|
|
@ -103,7 +171,6 @@ Status DropoutGrad<T1, T2>::ComputeInternal(OpKernelContext* context) const {
|
|||
|
||||
auto dX = context->Output(0, shape);
|
||||
auto dX_data = reinterpret_cast<CudaT*>(dX->template MutableData<T1>());
|
||||
|
||||
float ratio_data;
|
||||
auto ratio = context->Input<Tensor>(2);
|
||||
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@
|
|||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
template <typename T1, typename T2>
|
||||
template <typename T1, typename T2, bool trainable_dropout>
|
||||
class Dropout final : public CudaKernel {
|
||||
public:
|
||||
Dropout(const OpKernelInfo& info) : CudaKernel(info), default_ratio_(0.5) {
|
||||
|
|
@ -29,7 +29,9 @@ class Dropout final : public CudaKernel {
|
|||
template <typename T1, typename T2>
|
||||
class DropoutGrad final : public CudaKernel {
|
||||
public:
|
||||
DropoutGrad(const OpKernelInfo& info) : CudaKernel(info), default_ratio_(0.5) {}
|
||||
DropoutGrad(const OpKernelInfo& info) : CudaKernel(info), default_ratio_(0.5) {
|
||||
}
|
||||
|
||||
Status ComputeInternal(OpKernelContext* context) const override;
|
||||
|
||||
private:
|
||||
|
|
|
|||
|
|
@ -21,9 +21,6 @@
|
|||
#include <curand_kernel.h>
|
||||
#include <algorithm>
|
||||
|
||||
#include "thrust/device_ptr.h"
|
||||
#include "thrust/fill.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
|
|
@ -77,22 +74,15 @@ void DropoutKernelImpl(
|
|||
const T* X_data,
|
||||
T* Y_data,
|
||||
bool* mask_data) {
|
||||
if (ratio == 0.0f) {
|
||||
if (Y_data != X_data) {
|
||||
CUDA_CALL_THROW(cudaMemcpyAsync(Y_data, X_data, N * sizeof(T), cudaMemcpyDeviceToDevice));
|
||||
}
|
||||
thrust::fill_n(thrust::device_pointer_cast(mask_data), N, true);
|
||||
} else {
|
||||
const int block_size = 256;
|
||||
const int blocks_per_sm = prop.maxThreadsPerMultiProcessor / block_size;
|
||||
const int grid_size = std::min(prop.multiProcessorCount * blocks_per_sm, static_cast<int>(CeilDiv(N, block_size)));
|
||||
const int block_size = 256;
|
||||
const int blocks_per_sm = prop.maxThreadsPerMultiProcessor / block_size;
|
||||
const int grid_size = std::min(prop.multiProcessorCount * blocks_per_sm, static_cast<int>(CeilDiv(N, block_size)));
|
||||
|
||||
// Compute the number of random numbers generated by each thread, and increment philox generator offset by that amount.
|
||||
const uint64_t counter_offset = static_cast<uint64_t>(((N - 1) / (block_size * grid_size * UNROLL) + 1) * UNROLL);
|
||||
auto seeds = generator.NextPhiloxSeeds(counter_offset);
|
||||
// Compute the number of random numbers generated by each thread, and increment philox generator offset by that amount.
|
||||
const uint64_t counter_offset = static_cast<uint64_t>(((N - 1) / (block_size * grid_size * UNROLL) + 1) * UNROLL);
|
||||
auto seeds = generator.NextPhiloxSeeds(counter_offset);
|
||||
|
||||
DropoutKernel<T><<<grid_size, block_size, 0>>>(N, ratio, seeds, X_data, Y_data, mask_data);
|
||||
}
|
||||
DropoutKernel<T><<<grid_size, block_size, 0>>>(N, ratio, seeds, X_data, Y_data, mask_data);
|
||||
}
|
||||
|
||||
#define SPECIALIZED_DROPOUT_IMPL(T) \
|
||||
|
|
|
|||
|
|
@ -1,46 +0,0 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "orttraining/training_ops/cuda/nn/dropout.h"
|
||||
|
||||
#include "core/providers/common.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
#define REGISTER_KERNEL_TYPED(OpName, Domain, VER, T1, T2, MemIndex, ClassName) \
|
||||
ONNX_OPERATOR_TYPED_KERNEL_EX( \
|
||||
OpName, \
|
||||
Domain, \
|
||||
VER, \
|
||||
T1##_##T2, \
|
||||
kCudaExecutionProvider, \
|
||||
KernelDefBuilder() \
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<T1>()) \
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T2>()) \
|
||||
.TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>()) \
|
||||
.InputMemoryType<OrtMemTypeCPUInput>(MemIndex), \
|
||||
ClassName<T1, T2>);
|
||||
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, MLFloat16, MLFloat16, 1, Dropout)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, MLFloat16, float, 1, Dropout)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, MLFloat16, double, 1, Dropout)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, float, MLFloat16, 1, Dropout)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, float, float, 1, Dropout)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, float, double, 1, Dropout)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, double, MLFloat16, 1, Dropout)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, double, float, 1, Dropout)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, double, double, 1, Dropout)
|
||||
|
||||
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, MLFloat16, MLFloat16, 2, DropoutGrad)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, MLFloat16, float, 2, DropoutGrad)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, MLFloat16, double, 2, DropoutGrad)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, float, MLFloat16, 2, DropoutGrad)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, float, float, 2, DropoutGrad)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, float, double, 2, DropoutGrad)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, double, MLFloat16, 2, DropoutGrad)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, double, float, 2, DropoutGrad)
|
||||
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, double, double, 2, DropoutGrad)
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
Loading…
Reference in a new issue