From 6eb5549cb976e34efd75e32c2e20e21beddf01c8 Mon Sep 17 00:00:00 2001 From: "M. Zeeshan Siddiqui" Date: Fri, 17 Jul 2020 13:43:25 -0700 Subject: [PATCH] Deprecate TrainableDropout (#4501) * Deprecate TrainableDropout. * Add Dropout(12) back into Megatron transformer. * Remove TrainableDropout from front-end test models. * Update baseline for front-end tests after converting test models to opset-12. * Update baseline for front-end tests after converting test models to opset-12. --- .../core/providers/cpu/nn/dropout_op.cc | 12 +-- .../core/providers/cpu/nn/dropout_op.h | 9 +-- onnxruntime/core/providers/cuda/nn/dropout.cc | 2 +- onnxruntime/core/providers/cuda/nn/dropout.h | 8 +- .../python/onnxruntime_test_ort_trainer.py | 8 +- .../test/testdata/bert_toy_postprocessed.onnx | Bin 20642195 -> 20642654 bytes .../core/framework/gradient_graph_builder.h | 1 - .../core/graph/gradient_builder.cc | 12 --- .../orttraining/core/graph/gradient_builder.h | 1 - .../core/graph/gradient_builder_registry.cc | 1 - .../core/graph/mixed_precision_transformer.cc | 4 - .../core/graph/training_op_defs.cc | 71 ----------------- .../core/optimizer/bias_dropout_fusion.cc | 21 +---- .../core/optimizer/megatron_transformer.cc | 6 +- .../core/session/training_session.cc | 13 +--- .../test/optimizer/graph_transform_test.cc | 2 - .../training_ops/cpu/nn/dropout_op_test.cc | 73 ++++-------------- .../training_ops/cpu/cpu_training_kernels.cc | 47 ----------- .../training_ops/cpu/nn/dropout_op.cc | 18 ----- .../cuda/cuda_training_kernels.cc | 4 - .../training_ops/cuda/nn/dropout.cc | 27 +++---- 21 files changed, 48 insertions(+), 292 deletions(-) diff --git a/onnxruntime/core/providers/cpu/nn/dropout_op.cc b/onnxruntime/core/providers/cpu/nn/dropout_op.cc index 224638afa4..56e583bbf6 100644 --- a/onnxruntime/core/providers/cpu/nn/dropout_op.cc +++ b/onnxruntime/core/providers/cpu/nn/dropout_op.cc @@ -6,7 +6,7 @@ namespace onnxruntime { // Dropout -#define REGISTER_KERNEL_TYPED(OpName, VER, T1, T2, Trainable) \ +#define REGISTER_KERNEL_TYPED(OpName, VER, T1, T2) \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ OpName, \ kOnnxDomain, \ @@ -17,7 +17,7 @@ namespace onnxruntime { .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ - Dropout); + Dropout); // REVIEW(mzs): ConstEigenVectorArrayMap.cast +template class Dropout final: public OpKernel { public: Dropout(const OpKernelInfo& info) : OpKernel{info} { @@ -45,8 +45,8 @@ float GetRatioOrDefault(const Tensor* ratio_tensor) { } } // namespace -template -Status Dropout::Compute(OpKernelContext* context) const { +template +Status Dropout::Compute(OpKernelContext* context) const { const Tensor* X = context->Input(0); auto X_span = X->DataAsSpan(); const Tensor* ratio = context->Input(1); // optional @@ -65,8 +65,7 @@ Status Dropout::Compute(OpKernelContext* context) con ORT_ENFORCE(!mask || mask->Shape() == X_shape, "X and mask should have the same shape"); const Tensor* training_mode = context->Input(2); - if ((0 == ratio_value /*Backward compat with TrainableDropout*/) || - !trainable_dropout && (training_mode == nullptr || *(training_mode->Data()) == false)) { + if ((0 == ratio_value) || (training_mode == nullptr || *(training_mode->Data()) == false)) { // drop none if (X_span.data() != Y_span.data()) { std::copy(X_span.begin(), X_span.end(), Y_span.begin()); diff --git a/onnxruntime/core/providers/cuda/nn/dropout.cc b/onnxruntime/core/providers/cuda/nn/dropout.cc index 56b22e3593..8ef19593b9 100644 --- a/onnxruntime/core/providers/cuda/nn/dropout.cc +++ b/onnxruntime/core/providers/cuda/nn/dropout.cc @@ -17,7 +17,7 @@ ONNX_OPERATOR_KERNEL_EX( .TypeConstraint("T2", DataTypeImpl::GetTensorType()) .InputMemoryType(1) .InputMemoryType(2), - Dropout); + Dropout); } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/nn/dropout.h b/onnxruntime/core/providers/cuda/nn/dropout.h index 0ef8456f44..c331e1ac86 100644 --- a/onnxruntime/core/providers/cuda/nn/dropout.h +++ b/onnxruntime/core/providers/cuda/nn/dropout.h @@ -5,7 +5,6 @@ #include "core/providers/cuda/cuda_common.h" #include "core/providers/cuda/nn/dropout_impl.h" -#include "core/providers/cuda/nn/dropout.h" #include "core/providers/common.h" #include "core/framework/random_seed.h" @@ -38,7 +37,6 @@ struct DropoutComputeImpl { } }; -template class Dropout final : public CudaKernel { public: Dropout(const OpKernelInfo& info) : CudaKernel(info) { @@ -55,8 +53,7 @@ class Dropout final : public CudaKernel { static constexpr float default_ratio_ = 0.5f; }; -template -Status Dropout::ComputeInternal(OpKernelContext* context) const { +Status Dropout::ComputeInternal(OpKernelContext* context) const { //Get X_data const Tensor* X = context->Input(0); if (X == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "X Input is not available."); @@ -80,8 +77,7 @@ Status Dropout::ComputeInternal(OpKernelContext* context) con const Tensor* training_mode = context->Input(2); //Check for inference mode. - if ((0 == ratio_data /*Backward compat with TrainableDropout*/) || - (!trainable_dropout && (training_mode == nullptr || *(training_mode->Data()) == false))) { + if ((0 == ratio_data) ||(training_mode == nullptr || *(training_mode->Data()) == false)) { const void* X_data = X->DataRaw(); void* Y_data = Y->MutableDataRaw(); if (Y_data != X_data) { diff --git a/onnxruntime/test/python/onnxruntime_test_ort_trainer.py b/onnxruntime/test/python/onnxruntime_test_ort_trainer.py index 8566c31577..de60953a60 100644 --- a/onnxruntime/test/python/onnxruntime_test_ort_trainer.py +++ b/onnxruntime/test/python/onnxruntime_test_ort_trainer.py @@ -655,8 +655,8 @@ class TestOrtTrainer(unittest.TestCase): assert np.array_equal(state_dict[key], loaded_state_dict[key]) def testBertTrainingBasic(self): - expected_losses = [11.034271, 11.125311, 11.006095, 11.046938, 11.027476, 11.015745, 11.060884, 10.971851] - expected_eval_loss = [10.95898914] + expected_losses = [11.027887, 11.108191, 11.055356, 11.040912, 10.960277, 11.02691, 11.082471, 10.920979] + expected_eval_loss = [10.976489] actual_losses, actual_eval_loss = runBertTrainingTest( gradient_accumulation_steps=1, use_mixed_precision=False, allreduce_post_accumulation=False) @@ -672,8 +672,8 @@ class TestOrtTrainer(unittest.TestCase): assert_allclose(expected_eval_loss, actual_eval_loss, rtol=rtol, err_msg="evaluation loss mismatch") def testBertTrainingGradientAccumulation(self): - expected_losses = [11.034271, 11.125311, 11.006093, 11.046929, 11.027471, 11.015731, 11.060894, 10.971855] - expected_eval_loss = [10.959011] + expected_losses = [11.027887, 11.108191, 11.055354, 11.040904, 10.960266, 11.026897, 11.082475, 10.920998] + expected_eval_loss = [10.976518] actual_losses, actual_eval_loss = runBertTrainingTest( gradient_accumulation_steps=4, use_mixed_precision=False, allreduce_post_accumulation=False) diff --git a/onnxruntime/test/testdata/bert_toy_postprocessed.onnx b/onnxruntime/test/testdata/bert_toy_postprocessed.onnx index 28d50361c4e2432639f4d5a9795f87400de6571c..bb0f72efd2ea14539c718de8910581dca87d3332 100644 GIT binary patch delta 2451 zcmbW%cXU%_7=ZC}law}>mX@^8lD1F^g%Y6TCTY@uh?cr4;zYp}V;dn#lbWU=t{{l0 zfT(~9IO+noB5qM!xG3&Da1W>(ts8va(41aMQaSwbdrnTy&3C{1p6_PKwl?$T_BL~F zhRKrNI4c^C)J?M*wF<}R4ee`ktIF4|vjvSYb&_iE*wyU%NVqZF9Q8%~(O}qTSMw8J zL?ixSC>WaRYY5i|e454J5e}=lrsEg(()5mxHnVDK3Iys~6oWd^(sfB|USi3e_Xx>) zF?(so_;9Ex>JLS2g=*Ri|Mcd7Ejvf4PK#evYueP>I<`eIsdbiATe21<7T$HQ@f&Em z^-B`#=jE8Iv*H)W5>HmEC~}v{s!?-lUB}@}H4c}B%N-JD=Rll$HV~%_B&MHwe8W!N zhWBnDu5Jh7(yd>j#yTe6Igum-sod>+xKjc;_-GpAzrm0`61No45KO@t@DU@6BCI=yM1ZANhjjfZMfTs z1S~0cWi5{Hw}w|f*;}K3E&q2WbIU5`msKb*zz7r4&;#knfEg-!A`@BYg>2-YH!R3S z9{Qj!@=<_(u%Zw}C`Nw_KnVt-6oW7rWf+2?D913^FdQQ=60+}6utS3bPPkABH$13< z7o#x-`(P~g#W?JT{ZWnasKEg^5C`F49D-ULiU~Lj6LC0>z>zo#N8=bw!m&6GlQ9Ly z;{=?DlW;P8I0b&3iaOLIfYUG)(-6ezI0MtsfDpoH#F>bo2~jj-24>2k+uNypIp?AwI&#*o;r` zDL%vJ_yS+zD}0S_@GZ7rE86fKzQ+&v5kKK){DN)x6~Cb!zvB=5iNEkS{=vW4jvWhl z_ZLHAkQgNCNJdKHPdQ3H*(I99A#qAvl1hnN S;*nHIyvln1IFyEsO6GrR{R{{I delta 1998 zcma*ncW{$+9LMp?C`})rl(ZBGEtIlKfs!X7BrTxqDTr)E3@HszDM<}!rJ@F?itN1v z6p@WGL`6jfluc2V?CFMr7K)$<>idgw$8itPKVI+4Jx?z8$uIZh^tHl>>BWT+(I%rg zJUcJw4-6P=F{mlFv>R8}MZ1zh?G?S#s>Eaj{Mr7Tpxfuq@VWz@V5Z+~HS3)!Y!-8m zfG5-ENzd}O{{L;(#Hf3(DP7E(pJLIxnrLd}_YDhrLcU;ftP+;%$;$C2D<)l(uA^~p zK|!;HI-}A_F{pM`bDL`BmQ7;USf`eFof^`dr{0p*wF|MG=>u{q#_MX&=k$^2p&aqv_MO=LTj`^ zTeL%abU;UR!ei)+E_fVW@dUb|J9?ledZ9P^;7L4%zIYn_@C@AO4-e8Y02%ONAO>MD zGVv^iAPYm`gCE&=4gm~95IM-j^B9giynqpS5ij9ogfJ4LFdF%I1+U^YjKS*|i*Xo_ z37CjUn2afyifNdR8JLM#n2kA@i#IS2^RWPL;w>!1+gOBmuoz3Q6w9z2E3gu)@GjoN zYOKLpti$_Qj}P!6KEeikj8E_>HsUjUjxVqYo3RC7;wx;$Hf+ZZ?8GkY#vT-4FTTb% z*oXZ%fP*-MZ*dq$a1_UI94BxRr*Il)a2DtA9lpm8_z^$hXZ(WmxPU@j#3lTS-%y0h zxPoF_#qYR=>-YnI;s*Z0P29rY_-8!N{$xn>5`)Aj36qqPgiB142#F#oEh!@@D~Xhp zlSE0%OU#mJNsOd|Bvw*UQb|%-Vv$sl#7W{MRVCFV36koPL`e-vO-U_DZAl$TlBBLA sSyE3@U(!I5BDwult@28h*d%sIBZ))OSmKnpB&oVh{Bh`p?$Ig#0=jZ<`2YX_ diff --git a/orttraining/orttraining/core/framework/gradient_graph_builder.h b/orttraining/orttraining/core/framework/gradient_graph_builder.h index 403d543613..1d5c239b0c 100644 --- a/orttraining/orttraining/core/framework/gradient_graph_builder.h +++ b/orttraining/orttraining/core/framework/gradient_graph_builder.h @@ -31,7 +31,6 @@ static std::unordered_map> {"Gather", {1}}, {"Reshape", {1}}, {"Expand", {1}}, - {"TrainableDropout", {1}}, {"Dropout", {1}}, {"Slice", {1, 2, 3, 4}}, {"SparseSoftmaxCrossEntropy", {1, 2}}, diff --git a/orttraining/orttraining/core/graph/gradient_builder.cc b/orttraining/orttraining/core/graph/gradient_builder.cc index a823d46ba9..4d75a5f6bb 100644 --- a/orttraining/orttraining/core/graph/gradient_builder.cc +++ b/orttraining/orttraining/core/graph/gradient_builder.cc @@ -521,18 +521,6 @@ IMPLEMENT_GRADIENT_BUILDER(GetDropoutGradient) { {SrcNodeAttributes()})}; } -IMPLEMENT_GRADIENT_BUILDER(GetTrainableDropoutGradient) { - std::vector inputs{GO(0), O(1)}; - for (int i = 1; i < GetSrcNodeInputSize(); i++) { - inputs.push_back(I(i)); - } - return std::vector{ - NodeDef(OpDef{"TrainableDropoutGrad", kMSDomain, 1}, - inputs, - {GI(0)}, - {SrcNodeAttributes()})}; -} - IMPLEMENT_GRADIENT_BUILDER(GetConvGradient) { std::vector outputs; for (int i = 0; i < 3; i++) { diff --git a/orttraining/orttraining/core/graph/gradient_builder.h b/orttraining/orttraining/core/graph/gradient_builder.h index 9a32e421bf..83f21d898b 100644 --- a/orttraining/orttraining/core/graph/gradient_builder.h +++ b/orttraining/orttraining/core/graph/gradient_builder.h @@ -43,7 +43,6 @@ DECLARE_GRADIENT_BUILDER(GetSoftmaxCrossEntropyLossGradient) DECLARE_GRADIENT_BUILDER(GetGlobalAveragePoolGradient) DECLARE_GRADIENT_BUILDER(GetGemmGradient) DECLARE_GRADIENT_BUILDER(GetDropoutGradient) -DECLARE_GRADIENT_BUILDER(GetTrainableDropoutGradient) DECLARE_GRADIENT_BUILDER(GetGatherNDGradient) DECLARE_GRADIENT_BUILDER(GetGatherElementsGradient) DECLARE_GRADIENT_BUILDER(GetGeluGradient) diff --git a/orttraining/orttraining/core/graph/gradient_builder_registry.cc b/orttraining/orttraining/core/graph/gradient_builder_registry.cc index 94b4e1e096..71f62325f0 100644 --- a/orttraining/orttraining/core/graph/gradient_builder_registry.cc +++ b/orttraining/orttraining/core/graph/gradient_builder_registry.cc @@ -71,7 +71,6 @@ void GradientBuilderRegistry::RegisterGradientBuilders() { REGISTER_GRADIENT_BUILDER("GlobalAveragePool", GetGlobalAveragePoolGradient); REGISTER_GRADIENT_BUILDER("AveragePool", GetAveragePoolGradient); REGISTER_GRADIENT_BUILDER("Dropout", GetDropoutGradient) - REGISTER_GRADIENT_BUILDER("TrainableDropout", GetTrainableDropoutGradient) REGISTER_GRADIENT_BUILDER("GatherND", GetGatherNDGradient) REGISTER_GRADIENT_BUILDER("GatherElements", GetGatherElementsGradient) REGISTER_GRADIENT_BUILDER("Gelu", GetGeluGradient) diff --git a/orttraining/orttraining/core/graph/mixed_precision_transformer.cc b/orttraining/orttraining/core/graph/mixed_precision_transformer.cc index 52ce76d8c8..feff23c9db 100644 --- a/orttraining/orttraining/core/graph/mixed_precision_transformer.cc +++ b/orttraining/orttraining/core/graph/mixed_precision_transformer.cc @@ -41,8 +41,6 @@ bool IsFP32Node(const Node* node) { // At present, we use these table to identify which input needs to be keep in FP32 static const std::unordered_map> stage1_fp32_node_args = { - {"TrainableDropout", {1}}, - {"TrainableDropoutGrad", {2}}, {"Dropout", {1}}, {"DropoutGrad", {2}}, }; @@ -50,8 +48,6 @@ static const std::unordered_map> stage1_fp32_node_ // Currently the list here is same as stage1 above due to empty FP32_Nodes. // It's possibile we will have more FP32 nodes added, this map will also be extended. static const std::unordered_map> stage2_fp32_node_args = { - {"TrainableDropout", {1}}, - {"TrainableDropoutGrad", {2}}, {"Dropout", {1}}, {"DropoutGrad", {2}}, }; diff --git a/orttraining/orttraining/core/graph/training_op_defs.cc b/orttraining/orttraining/core/graph/training_op_defs.cc index 23d99cf071..7b4478cd38 100644 --- a/orttraining/orttraining/core/graph/training_op_defs.cc +++ b/orttraining/orttraining/core/graph/training_op_defs.cc @@ -1058,77 +1058,6 @@ Example 4: } }); - ONNX_CONTRIB_OPERATOR_SCHEMA(TrainableDropout) - .SetDomain(kOnnxDomain) - .SinceVersion(9) - .SetSupportLevel(OpSchema::SupportType::EXPERIMENTAL) - .SetDoc("TrainableDropout") - .Attr("seed", "(Optional) Seed to the random generator, if not specified we will auto generate one.", AttributeProto::INT, OPTIONAL_VALUE) - .AllowUncheckedAttributes() - .Input(0, "data", "The input data as Tensor.", "T") - .Input(1, "ratio", - "The ratio of random dropout, with value in [0, 1). If this input was not set, " - "or if it was set to 0, the output would be a simple copy of the input. " - "If it's non-zero, output will be a random dropout of input, which is typically " - "the case during training.", - "T1", - OpSchema::Optional) - .Output(0, "output", "The output.", "T") - .Output(1, "mask", "The output mask.", "T2", OpSchema::Optional) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") - .TypeConstraint( - "T1", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input 'ratio' types to float tensors.") - .TypeConstraint( - "T2", - {"tensor(bool)"}, - "Constrain output 'mask' types to boolean tensors.") - .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { - propagateShapeAndTypeFromFirstInput(ctx); - if (ctx.getNumOutputs() == 2) { - updateOutputElemType(ctx, 1, ONNX_NAMESPACE::TensorProto::BOOL); - if (hasNInputShapes(ctx, 1)) { - propagateShapeFromInputToOutput(ctx, 0, 1); - } - } - }); - - ONNX_CONTRIB_OPERATOR_SCHEMA(TrainableDropoutGrad) - .SetDomain(kMSDomain) - .SinceVersion(1) - .SetDoc("TrainableDropoutGrad") - .AllowUncheckedAttributes() - .Input(0, "dy", "The gradient tensor from output.", "T") - .Input(1, "mask", - "The mask tensor of the dropout. ", "T2") - .Input(2, "ratio", - "The ratio of random dropout, with value in [0, 1). If this input was not set, " - "or if it was set to 0, the output would be a simple copy of the input. " - "If it's non-zero, output will be a random dropout of input, which is typically " - "the case during training.", - "T1", - OpSchema::Optional) - .Output(0, "dx", "Gradient of the input.", "T") - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") - .TypeConstraint( - "T1", - {"tensor(float)"}, - "Constrain input 'ratio' types to float tensors.") - .TypeConstraint( - "T2", - {"tensor(bool)"}, - "Constrain 'mask' types to boolean tensors.") - .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { - propagateShapeAndTypeFromFirstInput(ctx); - }); - ONNX_CONTRIB_OPERATOR_SCHEMA(DropoutGrad) .SetDomain(kMSDomain) .SinceVersion(1) diff --git a/orttraining/orttraining/core/optimizer/bias_dropout_fusion.cc b/orttraining/orttraining/core/optimizer/bias_dropout_fusion.cc index d0e467c35a..5aa773efe6 100644 --- a/orttraining/orttraining/core/optimizer/bias_dropout_fusion.cc +++ b/orttraining/orttraining/core/optimizer/bias_dropout_fusion.cc @@ -127,8 +127,7 @@ Status BiasDropoutFusion::ApplyImpl(Graph& graph, bool& modified, int graph_leve } const Node& next_node = (*next_node_itr); - if (!(graph_utils::IsSupportedOptypeVersionAndDomain(next_node, "Dropout", {12}, kOnnxDomain) || - graph_utils::IsSupportedOptypeVersionAndDomain(next_node, "TrainableDropout", {9}, kOnnxDomain)) || + if (!(graph_utils::IsSupportedOptypeVersionAndDomain(next_node, "Dropout", {12}, kOnnxDomain)) || next_node.GetExecutionProviderType() != node.GetExecutionProviderType()) { continue; } @@ -149,22 +148,8 @@ Status BiasDropoutFusion::ApplyImpl(Graph& graph, bool& modified, int graph_leve dropout_input.push_back(dropout_node.MutableInputDefs()[1]); // ratio } - // populate training_mode - bool is_trainable_dropout = (dropout_node.OpType() == "TrainableDropout"); - if (is_trainable_dropout) { - // Create training_mode initializer - ONNX_NAMESPACE::TensorProto training_mode_initializer; - training_mode_initializer.set_name(graph.GenerateNodeArgName("training_mode")); - training_mode_initializer.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_BOOL); - const bool data = true; - training_mode_initializer.set_raw_data(&data, sizeof(bool)); - - NodeArg& training_mode_node_arg = graph_utils::AddInitializer(graph, training_mode_initializer); - dropout_input.push_back(&training_mode_node_arg); - } else { - if (dropout_node.InputDefs().size() > 2) { - dropout_input.push_back(dropout_node.MutableInputDefs()[2]); - } + if (dropout_node.InputDefs().size() > 2) { + dropout_input.push_back(dropout_node.MutableInputDefs()[2]); } const std::string op_type = "BiasDropout"; diff --git a/orttraining/orttraining/core/optimizer/megatron_transformer.cc b/orttraining/orttraining/core/optimizer/megatron_transformer.cc index 999ab0fbea..d4fc0c61ec 100644 --- a/orttraining/orttraining/core/optimizer/megatron_transformer.cc +++ b/orttraining/orttraining/core/optimizer/megatron_transformer.cc @@ -44,7 +44,6 @@ const OpInfo div_info = OpInfo("Div", opset_v7); const OpInfo mul_info = OpInfo("Mul", opset_v7); const OpInfo sub_info = OpInfo("Sub", opset_v7); const OpInfo softmax_info = OpInfo("Softmax", opset_v1_11); -const OpInfo trainable_dropout_info = OpInfo("TrainableDropout", opset_v9, kOnnxDomain); const OpInfo dropout_info = OpInfo("Dropout", opset_v12); struct NodeInfo { @@ -392,7 +391,7 @@ Status MegatronTransformer::TransformSelfAttention(Graph& graph, bool& modified, NodeInfo({mul_info}), NodeInfo({sub_info}), NodeInfo({softmax_info}), - NodeInfo({trainable_dropout_info, dropout_info}, false), // -6 + NodeInfo({dropout_info}, false), // -6 NodeInfo({matmul_info}), NodeInfo({transpose_info}), NodeInfo({reshape_info}), @@ -603,8 +602,7 @@ Status MegatronTransformer::TransformDropout(Graph& graph, bool& modified, int g continue; } - if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Dropout", opset_v12) && - !graph_utils::IsSupportedOptypeVersionAndDomain(node, "TrainableDropout", opset_v9, kOnnxDomain)) { + if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Dropout", opset_v12)) { continue; } diff --git a/orttraining/orttraining/core/session/training_session.cc b/orttraining/orttraining/core/session/training_session.cc index be9f47ab4e..1bf97ce472 100644 --- a/orttraining/orttraining/core/session/training_session.cc +++ b/orttraining/orttraining/core/session/training_session.cc @@ -868,8 +868,6 @@ common::Status TrainingSession::Run(const RunOptions& run_options, IOBinding& io } static const std::unordered_set Nodes_Need_Eval_Feeds = { - // TODO remove this once ONNX TrainableDropout is completely deprecated. - "TrainableDropout", "Dropout", }; Status TrainingSession::SetEvalFeedNames() { @@ -881,16 +879,7 @@ Status TrainingSession::SetEvalFeedNames() { auto it = Nodes_Need_Eval_Feeds.find(node.OpType()); if(it != Nodes_Need_Eval_Feeds.cend()) { // The opset is < 12, add each ratio input to graph inputs for overriding. - // Needs to be removed when TrainableDropout is deprecated. - if(it->compare("TrainableDropout") == 0) { - auto& ratio_name = node.InputDefs()[1]->Name(); - dropout_eval_feeds_.insert(ratio_name); - ORT_ENFORCE(model_->MainGraph().GetProducerNode(ratio_name) == nullptr, - "Input: " + ratio_name + " should not have any producer node."); - defs.AddGraphInputs({ratio_name}); - } - // Found an opset-12 dropout node, replace initializer name. - else if(node.InputArgCount().size() > 2) { + if(node.InputArgCount().size() > 2) { auto& mode_input = node.MutableInputDefs()[2]; const ONNX_NAMESPACE::TensorProto* mode_initializer = nullptr; if (!graph.GetInitializedTensor(training_mode_string_, mode_initializer)) { diff --git a/orttraining/orttraining/test/optimizer/graph_transform_test.cc b/orttraining/orttraining/test/optimizer/graph_transform_test.cc index 873c7ade7e..79255fcfcc 100644 --- a/orttraining/orttraining/test/optimizer/graph_transform_test.cc +++ b/orttraining/orttraining/test/optimizer/graph_transform_test.cc @@ -61,7 +61,6 @@ static void TestBiasDropoutFusion(const PathString& file_path, const logging::Lo ASSERT_EQ(op_to_count["Add"], add_count); ASSERT_EQ(op_to_count["Dropout"], 0); - ASSERT_EQ(op_to_count["TrainableDropout"], 0); ASSERT_EQ(op_to_count["BiasDropout"], 1); } @@ -71,7 +70,6 @@ TEST_F(GraphTransformationTests, BiasDropoutFusionTest) { TestBiasDropoutFusion(MODEL_FOLDER "fusion/bias_dropout_residual_fusion1.onnx", *logger_); TestBiasDropoutFusion(MODEL_FOLDER "fusion/bias_dropout_residual_fusion2.onnx", *logger_); TestBiasDropoutFusion(MODEL_FOLDER "fusion/bias_dropout_residual_fusion_mismatch.onnx", *logger_, 1); - TestBiasDropoutFusion(MODEL_FOLDER "fusion/bias_trainabledropout_residual_fusion.onnx", *logger_); } Node* GetNodeByName(Graph& graph, std::string node_name) { diff --git a/orttraining/orttraining/test/training_ops/cpu/nn/dropout_op_test.cc b/orttraining/orttraining/test/training_ops/cpu/nn/dropout_op_test.cc index 70fff78a5d..1f08f1eb54 100644 --- a/orttraining/orttraining/test/training_ops/cpu/nn/dropout_op_test.cc +++ b/orttraining/orttraining/test/training_ops/cpu/nn/dropout_op_test.cc @@ -34,9 +34,9 @@ const Tensor& FetchTensor(const OrtValue& ort_value) { return ort_value.Get(); } -void RunDropoutTest(const char* op, const bool use_mask, const std::vector& input_shape, float ratio = -1.0f, +void RunDropoutTest(const bool use_mask, const std::vector& input_shape, float ratio = -1.0f, bool training_mode = true, bool use_float16_ratio = false) { - OpTester t{op, k_dropout_opset_version, kOnnxDomain}; + OpTester t{"Dropout", k_dropout_opset_version, kOnnxDomain}; const auto input_size = std::accumulate( input_shape.begin(), input_shape.end(), static_cast(1), std::multiplies<>{}); @@ -63,12 +63,10 @@ void RunDropoutTest(const char* op, const bool use_mask, const std::vector("output", input_shape, input); // we'll do our own output verification - std::unique_ptr mask_buffer{}; if (use_mask) { mask_buffer = onnxruntime::make_unique(input_size); @@ -124,35 +122,19 @@ void RunDropoutTest(const char* op, const bool use_mask, const std::vector& input_dims, bool default_ratio = true) { +void RunDropoutGradTest(float ratio, const std::vector& input_dims, bool default_ratio = true) { const auto input_shape = TensorShape(input_dims); - OpTester test(op, 1, kMSDomain); + OpTester test("DropoutGrad", 1, kMSDomain); if (default_ratio) { ratio = 0.5f; } @@ -312,13 +294,9 @@ void RunDropoutGradTest(const char* op, float ratio, const std::vector& } else { test.AddMissingOptionalInput(); } - - if (strcmp(op, "TrainableDropoutGrad") != 0) { - test.AddInput("training_mode", {}, {true}); - } - + + test.AddInput("training_mode", {}, {true}); test.AddOutput("dx", input_shape.GetDims(), dx_data); - test.Run(); } } // namespace @@ -327,38 +305,19 @@ void RunDropoutGradTest(const char* op, float ratio, const std::vector& TEST(DropoutGradTest, Basic) { //Ratio 0.2, 1D - RunDropoutGradTest("DropoutGrad", 0.2f, {16}, false); + RunDropoutGradTest(0.2f, {16}, false); //Ratio 0.3, 2D - RunDropoutGradTest("DropoutGrad", 0.3f, {8, 2}, false); + RunDropoutGradTest(0.3f, {8, 2}, false); //Ratio 0.4, 3D - RunDropoutGradTest("DropoutGrad", 0.4f, {2, 4, 2}, false); + RunDropoutGradTest(0.4f, {2, 4, 2}, false); //default Ratio, 3D - RunDropoutGradTest("DropoutGrad", 0.5f, {2, 4, 2}); + RunDropoutGradTest(0.5f, {2, 4, 2}); } - TEST(DropoutGradTest, RatioLimit) { - RunDropoutGradTest("DropoutGrad", 0.0f, {16}, false); -} - -TEST(TrainableDropoutGradTest, Basic) { - //Ratio 0.2, 1D - RunDropoutGradTest("TrainableDropoutGrad", 0.2f, {16}, false); - - //Ratio 0.3, 2D - RunDropoutGradTest("TrainableDropoutGrad", 0.3f, {8, 2}, false); - - //Ratio 0.4, 3D - RunDropoutGradTest("TrainableDropoutGrad", 0.4f, {2, 4, 2}, false); - - //default Ratio, 3D - RunDropoutGradTest("TrainableDropoutGrad", 0.5f, {2, 4, 2}); -} - -TEST(TrainableDropoutGradTest, RatioLimit) { - RunDropoutGradTest("TrainableDropoutGrad", 0.0f, {16}, false); + RunDropoutGradTest(0.0f, {16}, false); } } // namespace test diff --git a/orttraining/orttraining/training_ops/cpu/cpu_training_kernels.cc b/orttraining/orttraining/training_ops/cpu/cpu_training_kernels.cc index a302ad0f52..04f3bf8428 100644 --- a/orttraining/orttraining/training_ops/cpu/cpu_training_kernels.cc +++ b/orttraining/orttraining/training_ops/cpu/cpu_training_kernels.cc @@ -33,31 +33,6 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, Ave class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, MaxPoolGrad); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, GatherGrad); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, GeluGrad); -// REVIEW(mzs): ConstEigenVectorArrayMap.cast, // REVIEW(mzs): ConstEigenVectorArrayMap.cast, - //BuildKernelCreateInfo, - //BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // REVIEW(mzs): ConstEigenVectorArrayMap.cast, - //BuildKernelCreateInfo, - //BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // REVIEW(mzs): ConstEigenVectorArrayMap.cast, //BuildKernelCreateInfo, //BuildKernelCreateInfo, diff --git a/orttraining/orttraining/training_ops/cpu/nn/dropout_op.cc b/orttraining/orttraining/training_ops/cpu/nn/dropout_op.cc index 0ad839a819..52f3ba9b0e 100644 --- a/orttraining/orttraining/training_ops/cpu/nn/dropout_op.cc +++ b/orttraining/orttraining/training_ops/cpu/nn/dropout_op.cc @@ -41,15 +41,6 @@ float GetRatioOrDefault(const Tensor* ratio_tensor) { .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ onnxruntime::Dropout); -// Temporary for backward compatibility, will eventually get rid of TrainableDropout when PyTorch exporter will move to -// opset-12. -REGISTER_KERNEL_TYPED(TrainableDropout, 9, float, MLFloat16, true) -REGISTER_KERNEL_TYPED(TrainableDropout, 9, float, float, true) -REGISTER_KERNEL_TYPED(TrainableDropout, 9, float, double, true) -REGISTER_KERNEL_TYPED(TrainableDropout, 9, double, MLFloat16, true) -REGISTER_KERNEL_TYPED(TrainableDropout, 9, double, float, true) -REGISTER_KERNEL_TYPED(TrainableDropout, 9, double, double, true) - #define REGISTER_GRADIENT_KERNEL_TYPED(OpName, T1, T2) \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ OpName, \ @@ -77,15 +68,6 @@ REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, double, MLFloat16) REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, double, float) REGISTER_GRADIENT_KERNEL_TYPED(DropoutGrad, double, double) -// Temporary for backward compatibility, will eventually get rid of TrainableDropout when PyTorch exporter will move to -// opset-12. -REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, float, MLFloat16) -REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, float, float) -REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, float, double) -REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, double, MLFloat16) -REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, double, float) -REGISTER_GRADIENT_KERNEL_TYPED(TrainableDropoutGrad, double, double) - template Status DropoutGrad::Compute(OpKernelContext* context) const { const Tensor* dY = context->Input(0); diff --git a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc index e3a3e87662..7ee7e79756 100644 --- a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc +++ b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc @@ -53,8 +53,6 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, BatchNormalizationGrad); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, GatherGrad); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BiasDropout); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, TrainableDropout); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, TrainableDropoutGrad); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, DropoutGrad); // TODO: decprecate GatherND-1 after updating training models to opset-12 @@ -155,8 +153,6 @@ Status RegisterCudaTrainingKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, BuildKernelCreateInfo, // TODO: decprecate GatherND-1 after updating training models to opset-12 diff --git a/orttraining/orttraining/training_ops/cuda/nn/dropout.cc b/orttraining/orttraining/training_ops/cuda/nn/dropout.cc index 425e68827b..08b521d413 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/dropout.cc +++ b/orttraining/orttraining/training_ops/cuda/nn/dropout.cc @@ -3,26 +3,12 @@ #include "core/framework/random_seed.h" #include "orttraining/training_ops/cuda/nn/dropout.h" -#include "core/providers/cuda/nn/dropout.h" #include "core/providers/cuda/cuda_common.h" #include "core/providers/common.h" namespace onnxruntime { namespace cuda { -// Temporary for backward compatibility, will eventually get rid of TrainableDropout when PyTorch exporter will move to -// opset-12. -ONNX_OPERATOR_KERNEL_EX( - TrainableDropout, - kOnnxDomain, - 9, - kCudaExecutionProvider, - KernelDefBuilder() - .TypeConstraint("T", DataTypeImpl::AllIEEEFloatTensorTypes()) - .TypeConstraint("T1", DataTypeImpl::AllIEEEFloatTensorTypes()) - .InputMemoryType(1), - Dropout); - #define REGISTER_GRADIENT_KERNEL(OpName) \ ONNX_OPERATOR_KERNEL_EX( \ OpName, \ @@ -38,10 +24,6 @@ ONNX_OPERATOR_KERNEL_EX( REGISTER_GRADIENT_KERNEL(DropoutGrad) -// Temporary for backward compatibility, will eventually get rid of TrainableDropout when PyTorch exporter will move to -// opset-12. -REGISTER_GRADIENT_KERNEL(TrainableDropoutGrad) - template struct DropoutGradComputeImpl { void operator()(const int64_t N, @@ -57,6 +39,15 @@ struct DropoutGradComputeImpl { } }; +// REVIEW(codemzs): Common out this structure because it is also used in Dropout forward op. +template +struct GetRatioDataImpl { + void operator()(const Tensor* ratio, float& ratio_data) const { + ratio_data = static_cast(*(ratio->template Data())); + ORT_ENFORCE(ratio_data >= 0.0f && ratio_data < 1.0f, "ratio_data is outside range [0, 1)"); + } +}; + Status DropoutGrad::ComputeInternal(OpKernelContext* context) const { auto dY = context->Input(0); const TensorShape& shape = dY->Shape();