Update Dropout(12) forward kernel with training_mode input. (#3805)

* Update Dropout(12) forward and backward kernel with training_mode input.

* Revert deleted assert.

* clean up.

* PR feedback.
This commit is contained in:
M. Zeeshan Siddiqui 2020-05-04 20:05:42 -07:00 committed by GitHub
parent 111469728f
commit a24c71af40
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 192 additions and 202 deletions

View file

@ -34,7 +34,7 @@ const Tensor& FetchTensor(const OrtValue& ort_value) {
}
void RunDropoutTest(const char* op, const bool use_mask, const std::vector<int64_t>& input_shape, float ratio = -1,
bool use_float16_ratio = false) {
bool training_mode = true, bool use_float16_ratio = false) {
OpTester t{op, k_dropout_opset_version, kOnnxDomain};
const auto input_size = std::accumulate(
@ -47,12 +47,17 @@ void RunDropoutTest(const char* op, const bool use_mask, const std::vector<int64
t.AddInput("data", input_shape, input);
if (ratio == -1) {
ratio = 0.5; // default.
t.AddInput("ratio", {}, {ratio});
} else if (use_float16_ratio) {
t.AddInput("ratio", {}, {MLFloat16(0)});
} else {
t.AddInput("ratio", {}, {ratio});
}
if (strcmp(op, "TrainableDropout") != 0 && training_mode) {
t.AddInput("training_mode", {}, {true});
}
t.AddOutput<float>("output", input_shape, input); // we'll do our own output verification
std::unique_ptr<bool[]> mask_buffer{};
@ -117,7 +122,7 @@ TEST(DropoutTest, Mask) {
}
TEST(DropoutTest, RatioLimit) {
RunDropoutTest("Dropout", true, {1000}, 0.0f);
RunDropoutTest("Dropout", true, {1000}, 0.0f, false);
}
TEST(DropoutTest, EmptyRatio) {
@ -125,7 +130,7 @@ TEST(DropoutTest, EmptyRatio) {
}
TEST(DropoutTest, Float16Ratio) {
RunDropoutTest("Dropout", true, {1000}, 0.0f, true);
RunDropoutTest("Dropout", true, {1000}, 0.0f, true, true);
}
TEST(TrainableDropoutTest, Basic) {
@ -137,15 +142,15 @@ TEST(TrainableDropoutTest, Mask) {
}
TEST(TrainableDropoutTest, RatioLimit) {
RunDropoutTest("TrainableDropout", true, {1000}, 0.0f);
RunDropoutTest("TrainableDropout", true, {1000}, 0.0f, false);
}
TEST(TrainableDropoutTest, EmptyRatio) {
RunDropoutTest("TrainableDropout", true, {1000});
RunDropoutTest("TrainableDropout", true, {1000}, -1);
}
TEST(TrainableDropoutTest, Float16Ratio) {
RunDropoutTest("TrainableDropout", true, {1000}, 0.0f, true);
RunDropoutTest("TrainableDropout", true, {1000}, 0.0f, true, true);
}
namespace {

View file

@ -44,6 +44,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, double_MLFloat16, TrainableDropout);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, double_float, TrainableDropout);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, double_double, TrainableDropout);
// REVIEW(mzs): ConstEigenVectorArrayMap.cast<MLFLoat16) does not seem to be supported.
// However these types work on GPU implementation.
//class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MLFloat16_MLFloat16, TrainableDropoutGrad);

View file

@ -28,10 +28,10 @@ float GetRatioOrDefault(const Tensor* ratio_tensor) {
} // namespace
// Dropout
#define REGISTER_KERNEL_TYPED(OpName, Domain, VER, T1, T2) \
#define REGISTER_KERNEL_TYPED(OpName, VER, T1, T2, Trainable) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
OpName, \
Domain, \
kOnnxDomain, \
VER, \
T1##_##T2, \
kCpuExecutionProvider, \
@ -39,7 +39,16 @@ float GetRatioOrDefault(const Tensor* ratio_tensor) {
.TypeConstraint("T", DataTypeImpl::GetTensorType<T1>()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T2>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>()), \
OpName<T1, T2>);
Dropout<T1, T2, Trainable>);
// Temporary for backward compatibility, will eventually get rid of TrainableDropout when PyTorch exporter will move to
// opset-12.
REGISTER_KERNEL_TYPED(TrainableDropout, 9, float, MLFloat16, true)
REGISTER_KERNEL_TYPED(TrainableDropout, 9, float, float, true)
REGISTER_KERNEL_TYPED(TrainableDropout, 9, float, double, true)
REGISTER_KERNEL_TYPED(TrainableDropout, 9, double, MLFloat16, true)
REGISTER_KERNEL_TYPED(TrainableDropout, 9, double, float, true)
REGISTER_KERNEL_TYPED(TrainableDropout, 9, double, double, true)
// REVIEW(mzs): ConstEigenVectorArrayMap.cast<MLFLoat16) does not seem to be supported.
// However these types work on GPU implementation.
@ -47,26 +56,22 @@ float GetRatioOrDefault(const Tensor* ratio_tensor) {
// REGISTER_KERNEL_TYPED(MLFloat16, float)
// REGISTER_KERNEL_TYPED(MLFloat16, double)
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, float, MLFloat16)
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, float, float)
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, float, double)
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, double, MLFloat16)
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, double, float)
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, double, double)
REGISTER_KERNEL_TYPED(Dropout, 12, float, MLFloat16, false)
REGISTER_KERNEL_TYPED(Dropout, 12, float, float, false)
REGISTER_KERNEL_TYPED(Dropout, 12, float, double, false)
REGISTER_KERNEL_TYPED(Dropout, 12, double, MLFloat16, false)
REGISTER_KERNEL_TYPED(Dropout, 12, double, float, false)
REGISTER_KERNEL_TYPED(Dropout, 12, double, double, false)
template <typename T1, typename T2>
Status Dropout<T1, T2>::Compute(OpKernelContext* context) const {
template <typename T1, typename T2, bool trainable_dropout>
Status Dropout<T1, T2, trainable_dropout>::Compute(OpKernelContext* context) const {
const Tensor* X = context->Input<Tensor>(0);
auto X_span = X->DataAsSpan<T1>();
const Tensor* ratio = context->Input<Tensor>(1); // optional
const float ratio_value = GetRatioOrDefault<T2>(ratio);
const auto& X_shape = X->Shape();
Tensor* Y = context->Output(0, X_shape);
auto Y_span = Y->MutableDataAsSpan<T1>();
Tensor* mask = context->Output(1, X_shape); // optional
std::unique_ptr<bool[]> temp_mask_buffer{}; // temporary buffer to use if mask input is not provided
auto mask_span = [&X_shape, mask, &temp_mask_buffer]() {
@ -75,16 +80,21 @@ Status Dropout<T1, T2>::Compute(OpKernelContext* context) const {
return gsl::make_span(temp_mask_buffer.get(), X_shape.Size());
}();
ORT_ENFORCE(Y->Shape() == X_shape, "X and Y should have the same shape");
ORT_ENFORCE(!mask || mask->Shape() == X_shape, "X and mask should have the same shape");
if (ratio_value == 0.0f) {
const Tensor* training_mode = context->Input<Tensor>(2);
if ((0 == ratio_value /*Backward compat with TrainableDropout*/) ||
!trainable_dropout && (training_mode == nullptr || *(training_mode->Data<bool>()) == false)) {
// drop none
if (X_span.data() != Y_span.data()) {
std::copy(X_span.begin(), X_span.end(), Y_span.begin());
}
std::fill(mask_span.begin(), mask_span.end(), true);
} else if (ratio_value < 1.0f) {
if (mask != nullptr) {
std::fill(mask_span.begin(), mask_span.end(), true);
}
} else {
// drop some
ConstEigenVectorArrayMap<T1> X_arr(X_span.data(), X_span.size());
EigenVectorArrayMap<T1> Y_arr(Y_span.data(), Y_span.size());
@ -106,33 +116,51 @@ Status Dropout<T1, T2>::Compute(OpKernelContext* context) const {
return Status::OK();
}
#define REGISTER_GRADIENT_KERNEL_TYPED(OpName, T1, T2) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
OpName, \
kMSDomain, \
1, \
T1##_##T2, \
kCpuExecutionProvider, \
KernelDefBuilder() \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T1>()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T2>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>()), \
DropoutGrad<T1, T2>);
// DropoutGrad
// REVIEW(mzs): ConstEigenVectorArrayMap.cast<MLFLoat16) does not seem to be supported.
// REVIEW(codemzs): ConstEigenVectorArrayMap.cast<MLFLoat16) does not seem to be supported.
// However these types work on GPU implementation.
// REGISTER_GRADIENT_KERNEL_TYPED(MLFloat16, MLFloat16)
// REGISTER_GRADIENT_KERNEL_TYPED(MLFloat16, float)
// REGISTER_GRADIENT_KERNEL_TYPED(MLFloat16, double)
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, float, MLFloat16)
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, float, float)
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, float, double)
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, double, MLFloat16)
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, double, float)
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, double, double)
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, float, MLFloat16)
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, float, float)
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, float, double)
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, double, MLFloat16)
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, double, float)
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, double, double)
// Temporary for backward compatibility, will eventually get rid of TrainableDropout when PyTorch exporter will move to
// opset-12.
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, float, MLFloat16)
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, float, float)
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, float, double)
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, double, MLFloat16)
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, double, float)
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, double, double)
template <typename T1, typename T2>
Status DropoutGrad<T1, T2>::Compute(OpKernelContext* context) const {
const Tensor* dY = context->Input<Tensor>(0);
auto dY_span = dY->DataAsSpan<T1>();
const Tensor* mask = context->Input<Tensor>(1);
auto mask_span = mask->DataAsSpan<bool>();
const Tensor* ratio = context->Input<Tensor>(2); // optional
const float ratio_value = GetRatioOrDefault<T2>(ratio);
const auto& dY_shape = dY->Shape();
Tensor* dX = context->Output(0, dY_shape);
auto dX_span = dX->MutableDataAsSpan<T1>();

View file

@ -9,8 +9,8 @@
namespace onnxruntime {
namespace contrib {
template <typename T1, typename T2>
class Dropout final : public OpKernel {
template <typename T1, typename T2, bool trainable_dropout>
class Dropout final: public OpKernel {
public:
Dropout(const OpKernelInfo& info) : OpKernel{info} {
int64_t seed = 0;
@ -28,7 +28,8 @@ class Dropout final : public OpKernel {
template <typename T1, typename T2>
class DropoutGrad final : public OpKernel {
public:
DropoutGrad(const OpKernelInfo& info) : OpKernel{info} {}
DropoutGrad(const OpKernelInfo& info) : OpKernel{info} {
}
Status Compute(OpKernelContext* context) const override;
};

View file

@ -1,58 +0,0 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "orttraining/training_ops/cpu/nn/dropout_op.h"
#include <chrono>
#include "core/util/math_cpuonly.h"
namespace onnxruntime {
namespace contrib {
// TrainableDropout is the same as Dropout V12.
// Registering the operator for the sake of backward compatibility.
// Give notice to the users to use Dropout V12 and then deprecate this kernel.
// TrainableDropout
#define REGISTER_KERNEL_TYPED(OpName, Domain, VER, T1, T2, ClassName) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
OpName, \
Domain, \
VER, \
T1##_##T2, \
kCpuExecutionProvider, \
KernelDefBuilder() \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T1>()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T2>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>()), \
ClassName<T1, T2>);
// REVIEW(mzs): ConstEigenVectorArrayMap.cast<MLFLoat16) does not seem to be supported.
// However these types work on GPU implementation.
// REGISTER_KERNEL_TYPED(MLFloat16, MLFloat16)
// REGISTER_KERNEL_TYPED(MLFloat16, float)
// REGISTER_KERNEL_TYPED(MLFloat16, double)
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, float, MLFloat16, Dropout)
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, float, float, Dropout)
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, float, double, Dropout)
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, double, MLFloat16, Dropout)
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, double, float, Dropout)
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, double, double, Dropout)
// TrainableDropoutGrad
// REVIEW(mzs): ConstEigenVectorArrayMap.cast<MLFLoat16) does not seem to be supported.
// However these types work on GPU implementation.
// REGISTER_GRADIENT_KERNEL_TYPED(MLFloat16, MLFloat16)
// REGISTER_GRADIENT_KERNEL_TYPED(MLFloat16, float)
// REGISTER_GRADIENT_KERNEL_TYPED(MLFloat16, double)
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, float, MLFloat16, DropoutGrad)
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, float, float, DropoutGrad)
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, float, double, DropoutGrad)
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, double, MLFloat16, DropoutGrad)
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, double, float, DropoutGrad)
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, double, double, DropoutGrad)
} // namespace contrib
} // namespace onnxruntime

View file

@ -9,33 +9,58 @@
namespace onnxruntime {
namespace cuda {
#define REGISTER_KERNEL_TYPED(OpName, Domain, VER, T1, T2, MemIndex) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
OpName, \
Domain, \
VER, \
T1##_##T2, \
kCudaExecutionProvider, \
KernelDefBuilder() \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T1>()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T2>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>()) \
.InputMemoryType<OrtMemTypeCPUInput>(MemIndex), \
OpName<T1, T2>);
#define REGISTER_KERNEL_TYPED(T1, T2) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
Dropout, \
kOnnxDomain, \
12, \
T1##_##T2, \
kCudaExecutionProvider, \
KernelDefBuilder() \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T1>()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T2>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>()) \
.InputMemoryType<OrtMemTypeCPUInput>(1) \
.InputMemoryType<OrtMemTypeCPUInput>(2), \
Dropout<T1, T2, false>);
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, MLFloat16, MLFloat16, 1)
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, MLFloat16, float, 1)
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, MLFloat16, double, 1)
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, float, MLFloat16, 1)
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, float, float, 1)
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, float, double, 1)
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, double, MLFloat16, 1)
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, double, float, 1)
REGISTER_KERNEL_TYPED(Dropout, kOnnxDomain, 12, double, double, 1)
REGISTER_KERNEL_TYPED(MLFloat16, MLFloat16)
REGISTER_KERNEL_TYPED(MLFloat16, float)
REGISTER_KERNEL_TYPED(MLFloat16, double)
REGISTER_KERNEL_TYPED(float, MLFloat16)
REGISTER_KERNEL_TYPED(float, float)
REGISTER_KERNEL_TYPED(float, double)
REGISTER_KERNEL_TYPED(double, MLFloat16)
REGISTER_KERNEL_TYPED(double, float)
REGISTER_KERNEL_TYPED(double, double)
#define REGISTER_TRAINABLE_KERNEL_TYPED(T1, T2) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
TrainableDropout, \
kOnnxDomain, \
9, \
T1##_##T2, \
kCudaExecutionProvider, \
KernelDefBuilder() \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T1>()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T2>()) \
.InputMemoryType<OrtMemTypeCPUInput>(1), \
Dropout<T1, T2, true>);
template <typename T1, typename T2>
Status Dropout<T1, T2>::ComputeInternal(OpKernelContext* context) const {
// Temporary for backward compatibility, will eventually get rid of TrainableDropout when PyTorch exporter will move to
// opset-12.
REGISTER_TRAINABLE_KERNEL_TYPED(MLFloat16, MLFloat16)
REGISTER_TRAINABLE_KERNEL_TYPED(MLFloat16, float)
REGISTER_TRAINABLE_KERNEL_TYPED(MLFloat16, double)
REGISTER_TRAINABLE_KERNEL_TYPED(float, MLFloat16)
REGISTER_TRAINABLE_KERNEL_TYPED(float, float)
REGISTER_TRAINABLE_KERNEL_TYPED(float, double)
REGISTER_TRAINABLE_KERNEL_TYPED(double, MLFloat16)
REGISTER_TRAINABLE_KERNEL_TYPED(double, float)
REGISTER_TRAINABLE_KERNEL_TYPED(double, double)
template <typename T1, typename T2, bool trainable_dropout>
Status Dropout<T1, T2, trainable_dropout>::ComputeInternal(OpKernelContext* context) const {
typedef typename ToCudaType<T1>::MappedType CudaT;
//Get X_data
@ -52,12 +77,6 @@ Status Dropout<T1, T2>::ComputeInternal(OpKernelContext* context) const {
//Get mask_data
auto mask = context->Output(1, shape);
ORT_ENFORCE(!mask || mask->Shape().Size() == N);
IAllocatorUniquePtr<bool> temp_mask_buffer{}; // buffer to use if mask is not provided
bool* const mask_data = [this, N, mask, &temp_mask_buffer]() {
if (mask) return mask->MutableData<bool>();
temp_mask_buffer = GetScratchBuffer<bool>(N);
return temp_mask_buffer.get();
}();
//Get the ratio_data
float ratio_data;
@ -73,21 +92,70 @@ Status Dropout<T1, T2>::ComputeInternal(OpKernelContext* context) const {
}
ORT_ENFORCE(ratio_data >= 0.0f && ratio_data < 1.0f);
const Tensor* training_mode = context->Input<Tensor>(2);
//Check for inference mode.
if ((0 == ratio_data /*Backward compat with TrainableDropout*/) ||
(!trainable_dropout && (training_mode == nullptr || *(training_mode->Data<bool>()) == false))) {
if (Y_data != X_data) {
CUDA_CALL_THROW(cudaMemcpyAsync(Y_data, X_data, N * sizeof(T1), cudaMemcpyDeviceToDevice));
}
// If mask is requested, return all 1s.
if (mask != nullptr) {
ORT_ENFORCE(cudaMemset(mask->MutableData<bool>(), true, N * sizeof(bool)) == cudaSuccess);
}
return Status::OK();
}
IAllocatorUniquePtr<bool> temp_mask_buffer{}; // buffer to use if mask is not provided
bool* const mask_data = [this, N, mask, &temp_mask_buffer]() {
if (mask) return mask->MutableData<bool>();
temp_mask_buffer = GetScratchBuffer<bool>(N);
return temp_mask_buffer.get();
}();
PhiloxGenerator& generator = generator_ != nullptr ? *generator_.get() : PhiloxGenerator::Default();
DropoutKernelImpl(GetDeviceProp(), N, ratio_data, generator, X_data, Y_data, mask_data);
return Status::OK();
}
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, MLFloat16, MLFloat16, 2)
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, MLFloat16, float, 2)
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, MLFloat16, double, 2)
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, float, MLFloat16, 2)
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, float, float, 2)
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, float, double, 2)
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, double, MLFloat16, 2)
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, double, float, 2)
REGISTER_KERNEL_TYPED(DropoutGrad, kMSDomain, 1, double, double, 2)
#define REGISTER_GRADIENT_KERNEL_TYPED(OpName, T1, T2) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
OpName, \
kMSDomain, \
1, \
T1##_##T2, \
kCudaExecutionProvider, \
KernelDefBuilder() \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T1>()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T2>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>()) \
.InputMemoryType<OrtMemTypeCPUInput>(2), \
DropoutGrad<T1, T2>);
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, MLFloat16, MLFloat16)
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, MLFloat16, float)
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, MLFloat16, double)
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, float, MLFloat16)
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, float, float)
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, float, double)
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, double, MLFloat16)
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, double, float)
REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, double, double)
// Temporary for backward compatibility, will eventually get rid of TrainableDropout when PyTorch exporter will move to
// opset-12.
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, MLFloat16, MLFloat16)
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, MLFloat16, float)
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, MLFloat16, double)
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, float, MLFloat16)
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, float, float)
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, float, double)
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, double, MLFloat16)
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, double, float)
REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, double, double)
template <typename T1, typename T2>
Status DropoutGrad<T1, T2>::ComputeInternal(OpKernelContext* context) const {
@ -103,7 +171,6 @@ Status DropoutGrad<T1, T2>::ComputeInternal(OpKernelContext* context) const {
auto dX = context->Output(0, shape);
auto dX_data = reinterpret_cast<CudaT*>(dX->template MutableData<T1>());
float ratio_data;
auto ratio = context->Input<Tensor>(2);

View file

@ -9,7 +9,7 @@
namespace onnxruntime {
namespace cuda {
template <typename T1, typename T2>
template <typename T1, typename T2, bool trainable_dropout>
class Dropout final : public CudaKernel {
public:
Dropout(const OpKernelInfo& info) : CudaKernel(info), default_ratio_(0.5) {
@ -29,7 +29,9 @@ class Dropout final : public CudaKernel {
template <typename T1, typename T2>
class DropoutGrad final : public CudaKernel {
public:
DropoutGrad(const OpKernelInfo& info) : CudaKernel(info), default_ratio_(0.5) {}
DropoutGrad(const OpKernelInfo& info) : CudaKernel(info), default_ratio_(0.5) {
}
Status ComputeInternal(OpKernelContext* context) const override;
private:

View file

@ -21,9 +21,6 @@
#include <curand_kernel.h>
#include <algorithm>
#include "thrust/device_ptr.h"
#include "thrust/fill.h"
namespace onnxruntime {
namespace cuda {
@ -77,22 +74,15 @@ void DropoutKernelImpl(
const T* X_data,
T* Y_data,
bool* mask_data) {
if (ratio == 0.0f) {
if (Y_data != X_data) {
CUDA_CALL_THROW(cudaMemcpyAsync(Y_data, X_data, N * sizeof(T), cudaMemcpyDeviceToDevice));
}
thrust::fill_n(thrust::device_pointer_cast(mask_data), N, true);
} else {
const int block_size = 256;
const int blocks_per_sm = prop.maxThreadsPerMultiProcessor / block_size;
const int grid_size = std::min(prop.multiProcessorCount * blocks_per_sm, static_cast<int>(CeilDiv(N, block_size)));
const int block_size = 256;
const int blocks_per_sm = prop.maxThreadsPerMultiProcessor / block_size;
const int grid_size = std::min(prop.multiProcessorCount * blocks_per_sm, static_cast<int>(CeilDiv(N, block_size)));
// Compute the number of random numbers generated by each thread, and increment philox generator offset by that amount.
const uint64_t counter_offset = static_cast<uint64_t>(((N - 1) / (block_size * grid_size * UNROLL) + 1) * UNROLL);
auto seeds = generator.NextPhiloxSeeds(counter_offset);
// Compute the number of random numbers generated by each thread, and increment philox generator offset by that amount.
const uint64_t counter_offset = static_cast<uint64_t>(((N - 1) / (block_size * grid_size * UNROLL) + 1) * UNROLL);
auto seeds = generator.NextPhiloxSeeds(counter_offset);
DropoutKernel<T><<<grid_size, block_size, 0>>>(N, ratio, seeds, X_data, Y_data, mask_data);
}
DropoutKernel<T><<<grid_size, block_size, 0>>>(N, ratio, seeds, X_data, Y_data, mask_data);
}
#define SPECIALIZED_DROPOUT_IMPL(T) \

View file

@ -1,46 +0,0 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "orttraining/training_ops/cuda/nn/dropout.h"
#include "core/providers/common.h"
namespace onnxruntime {
namespace cuda {
#define REGISTER_KERNEL_TYPED(OpName, Domain, VER, T1, T2, MemIndex, ClassName) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
OpName, \
Domain, \
VER, \
T1##_##T2, \
kCudaExecutionProvider, \
KernelDefBuilder() \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T1>()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T2>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>()) \
.InputMemoryType<OrtMemTypeCPUInput>(MemIndex), \
ClassName<T1, T2>);
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, MLFloat16, MLFloat16, 1, Dropout)
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, MLFloat16, float, 1, Dropout)
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, MLFloat16, double, 1, Dropout)
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, float, MLFloat16, 1, Dropout)
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, float, float, 1, Dropout)
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, float, double, 1, Dropout)
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, double, MLFloat16, 1, Dropout)
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, double, float, 1, Dropout)
REGISTER_KERNEL_TYPED(TrainableDropout, kOnnxDomain, 9, double, double, 1, Dropout)
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, MLFloat16, MLFloat16, 2, DropoutGrad)
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, MLFloat16, float, 2, DropoutGrad)
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, MLFloat16, double, 2, DropoutGrad)
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, float, MLFloat16, 2, DropoutGrad)
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, float, float, 2, DropoutGrad)
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, float, double, 2, DropoutGrad)
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, double, MLFloat16, 2, DropoutGrad)
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, double, float, 2, DropoutGrad)
REGISTER_KERNEL_TYPED(TrainableDropoutGrad, kMSDomain, 1, double, double, 2, DropoutGrad)
} // namespace cuda
} // namespace onnxruntime