From 694a4d6413eb2b9450835ccdb5bd2dc4c5ce2238 Mon Sep 17 00:00:00 2001 From: Sherlock Date: Mon, 26 Oct 2020 15:15:52 -0700 Subject: [PATCH] Add more loggings for GradientBuilder (#5556) * Add more loggings for GradientBuilder Co-authored-by: Sherlock Huang --- .../core/graph/gradient_builder.cc | 35 ++++++++++--------- .../core/graph/gradient_builder_base.cc | 34 ++++++++++++++---- .../core/graph/gradient_builder_base.h | 9 +++-- 3 files changed, 51 insertions(+), 27 deletions(-) diff --git a/orttraining/orttraining/core/graph/gradient_builder.cc b/orttraining/orttraining/core/graph/gradient_builder.cc index 1a04100a9e..9ba5655493 100644 --- a/orttraining/orttraining/core/graph/gradient_builder.cc +++ b/orttraining/orttraining/core/graph/gradient_builder.cc @@ -246,7 +246,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetMatMulGradient) { output_shape.push_back(B_shape[B_shape.size() - 2]); std::vector A_axes; - ComputeBroadcastBackwardAxes(A_shape, output_shape, &A_axes, nullptr); + ComputeBroadcastBackwardAxes(A_shape, output_shape, &A_axes, nullptr, NodeName()); result.push_back( NodeDef("Transpose", @@ -299,7 +299,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetMatMulGradient) { output_shape.push_back(Y_shape[Y_shape.size() - 1]); std::vector B_axes; - ComputeBroadcastBackwardAxes(B_shape, output_shape, &B_axes, nullptr); + ComputeBroadcastBackwardAxes(B_shape, output_shape, &B_axes, nullptr, NodeName()); result.push_back( NodeDef("Transpose", @@ -473,7 +473,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetGemmGradient) { std::vector C_shape, dY_shape; if (GetShape(C, C_shape).IsOK() && GetShape(dY, dY_shape).IsOK()) { std::vector C_axes, dY_axes; - ComputeBroadcastBackwardAxes(C_shape, dY_shape, &C_axes, &dY_axes); + ComputeBroadcastBackwardAxes(C_shape, dY_shape, &C_axes, &dY_axes, NodeName()); if (C_axes.size() > 0) { HandleBroadcasting(dY, C, IA("dC_reduced"), C_axes, result); @@ -861,7 +861,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetAddSubGradient) { std::vector a_shape, b_shape; if (GetShape(a, a_shape).IsOK() && GetShape(b, b_shape).IsOK()) { std::vector a_axes, b_axes; - ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes); + ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes, NodeName()); if (IsGradientRequiredForSrcNodeInput(0)) { if (a_axes.size() > 0) { HandleBroadcasting(GO(0), a, GI(0), a_axes, output); @@ -932,7 +932,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetMulGradient) { std::vector a_shape, b_shape; if (GetShape(a, a_shape).IsOK() && GetShape(b, b_shape).IsOK()) { std::vector a_axes, b_axes; - ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes); + ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes, NodeName()); if (IsGradientRequiredForSrcNodeInput(0)) { output.push_back( @@ -1008,7 +1008,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetDivGradient) { std::vector a_shape, b_shape; if (GetShape(a, a_shape).IsOK() && GetShape(b, b_shape).IsOK()) { std::vector a_axes, b_axes; - ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes); + ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes, NodeName()); ArgDef tmp_grad = IA("PreReduceGrad0", OType(0)); output.push_back(NodeDef("Div", {GO(0), I(1)}, {tmp_grad})); @@ -1066,7 +1066,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetReduceMeanGradient) { result.push_back(NodeDef("Size", {I(0)}, {IA("Sized_X")})); result.push_back(NodeDef("Size", {GO(0)}, {IA("Sized_Grad")})); - result.push_back(NodeDef("Div",{IA("Sized_X"), IA("Sized_Grad")}, {IA("Scale")})); + result.push_back(NodeDef("Div", {IA("Sized_X"), IA("Sized_Grad")}, {IA("Scale")})); result.push_back(NodeDef(OpDef{"Scale", kMSDomain, 1}, {grad, IA("Scale")}, {IA("Scaled_Grad")}, @@ -1234,16 +1234,17 @@ IMPLEMENT_GRADIENT_BUILDER(GetGeluGradient) { namespace { std::vector GetBiasGeluGradNodes( bool use_approximation, - const ArgDef& dY, const ArgDef& X, const ArgDef& B, // inputs - const ArgDef& dX, const ArgDef& dB, // outputs - const ArgDef& b_axes, const ArgDef& b_shape, const ArgDef& x_shape) { //intermediate args + const ArgDef& dY, const ArgDef& X, const ArgDef& B, // inputs + const ArgDef& dX, const ArgDef& dB, // outputs + const ArgDef& b_axes, const ArgDef& b_shape, const ArgDef& x_shape, //intermediate args + const std::string& node_name) { std::vector B_shape, X_shape; if (GetShape(B, B_shape).IsOK() && GetShape(X, X_shape).IsOK()) { ORT_ENFORCE(B_shape.size() == 1, "B must have exactly one dimension."); - const std::vector B_axes = [&B_shape, &X_shape]() { + const std::vector B_axes = [&B_shape, &X_shape, &node_name]() { std::vector result{}; - ComputeBroadcastBackwardAxes(B_shape, X_shape, &result, nullptr); + ComputeBroadcastBackwardAxes(B_shape, X_shape, &result, nullptr, node_name); return result; }(); return std::vector{ @@ -1279,7 +1280,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetBiasGeluGradient) { ArgDef b_axes = IA("ReduceAxes_" + B.name); ArgDef b_shape = IA("Shape_" + B.name); ArgDef x_shape = IA("Shape_" + X.name); - return GetBiasGeluGradNodes(false, dY, X, B, dX, dB, b_axes, b_shape, x_shape); + return GetBiasGeluGradNodes(false, dY, X, B, dX, dB, b_axes, b_shape, x_shape, NodeName()); } IMPLEMENT_GRADIENT_BUILDER(GetFastGeluGradient) { @@ -1293,8 +1294,9 @@ IMPLEMENT_GRADIENT_BUILDER(GetFastGeluGradient) { ArgDef b_axes = IA("ReduceAxes_" + B.name); ArgDef b_shape = IA("Shape_" + B.name); ArgDef x_shape = IA("Shape_" + X.name); - return GetBiasGeluGradNodes(true, dY, X, B, dX, dB, b_axes, b_shape, x_shape); + return GetBiasGeluGradNodes(true, dY, X, B, dX, dB, b_axes, b_shape, x_shape, NodeName()); } + if (num_src_node_inputs == 1) { // without bias return std::vector{ NodeDef(OpDef{"FastGeluGrad", kMSDomain, 1}, @@ -1432,7 +1434,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetExpandGradient) { std::vector a_shape, y_shape; if (GetShape(a, a_shape).IsOK() && GetShape(y, y_shape).IsOK()) { std::vector a_axes; - ComputeBroadcastBackwardAxes(a_shape, y_shape, &a_axes, nullptr); + ComputeBroadcastBackwardAxes(a_shape, y_shape, &a_axes, nullptr, NodeName()); if (a_axes.size() > 0) { HandleBroadcasting(GO(0), a, GI(0), a_axes, output); @@ -1466,8 +1468,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetExpGradient) { IMPLEMENT_GRADIENT_BUILDER(GetFlattenGradient) { return std::vector{ NodeDef("Shape", {I(0)}, {IA("input_shape")}), - NodeDef("Reshape", {GO(0), IA("input_shape")}, {GI(0)}) - }; + NodeDef("Reshape", {GO(0), IA("input_shape")}, {GI(0)})}; } IMPLEMENT_GRADIENT_BUILDER(GetTopKGradient) { diff --git a/orttraining/orttraining/core/graph/gradient_builder_base.cc b/orttraining/orttraining/core/graph/gradient_builder_base.cc index 5c5b7b3443..0c2b1b0f9e 100644 --- a/orttraining/orttraining/core/graph/gradient_builder_base.cc +++ b/orttraining/orttraining/core/graph/gradient_builder_base.cc @@ -9,11 +9,31 @@ namespace onnxruntime { namespace training { +std::string ToString(const std::vector& dims) { + std::stringstream output; + output << "["; + if (!dims.empty()) { + for (auto& dim : dims) { + if (dim.has_dim_value()) { + output << dim.dim_value() << ","; + } + if (dim.has_dim_param()) { + output << dim.dim_param() << ","; + } + } + output.seekp(-1, output.cur); + } + output << "]"; + + return output.str(); +} + void ComputeBroadcastBackwardAxes( const std::vector& A_dims, const std::vector& B_dims, std::vector* A_axes, - std::vector* B_axes) { + std::vector* B_axes, + const std::string& node_name) { if (A_axes) A_axes->clear(); if (B_axes) B_axes->clear(); @@ -39,16 +59,16 @@ void ComputeBroadcastBackwardAxes( auto A_dim = A_dims[i].dim_param(), B_dim = B_dims[j].dim_param(); if (A_dim != B_dim) { - ORT_THROW("Error: symbolic dimension doesn't match. Expect the same symbolic but got \"", - A_dim, "\" and \"", B_dim, "\"."); + ORT_THROW("Gradient building error for node ", node_name, ": symbolic dimension doesn't match. ", + "A_dims:", ToString(A_dims), ", B_dims:", ToString(B_dims)); } } else if (A_dims[i].has_dim_param() && B_dims[j].has_dim_value()) { auto A_dim = A_dims[i].dim_param(); auto B_dim = B_dims[j].dim_value(); if (B_dim != 1) { - ORT_THROW("Error: symbolic broadcasting requires the corresponding dimension to be 1. ", - "Actually got ", B_dim); + ORT_THROW("Gradient building error for node ", node_name, ": symbolic broadcasting requires the B_dimension to be 1. ", + "A_dims:", ToString(A_dims), ", B_dims:", ToString(B_dims)); } if (B_axes) { B_axes->push_back(gsl::narrow_cast(k)); @@ -58,8 +78,8 @@ void ComputeBroadcastBackwardAxes( auto B_dim = B_dims[i].dim_param(); if (A_dim != 1) { - ORT_THROW("Error: symbolic broadcasting requires the corresponding dimension to be 1. ", - "Actually got ", A_dim); + ORT_THROW("Gradient building error for node ", node_name, ": symbolic broadcasting requires the A_dimension to be 1. ", + "A_dims:", ToString(A_dims), ", B_dims:", ToString(B_dims)); } if (A_axes) { A_axes->push_back(gsl::narrow_cast(k)); diff --git a/orttraining/orttraining/core/graph/gradient_builder_base.h b/orttraining/orttraining/core/graph/gradient_builder_base.h index cf8a496f69..10eab0412a 100644 --- a/orttraining/orttraining/core/graph/gradient_builder_base.h +++ b/orttraining/orttraining/core/graph/gradient_builder_base.h @@ -22,7 +22,8 @@ void ComputeBroadcastBackwardAxes( const std::vector& A_dims, const std::vector& B_dims, std::vector* A_axes, - std::vector* B_axes); + std::vector* B_axes, + const std::string& node_name = ""); void ComputeBroadcastBackwardAxesDynamic(const ArgDef& a, const ArgDef& b, @@ -211,11 +212,11 @@ class GradientBuilderBase { if (elem_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) { return ConstantScalarNode(MLFloat16(math::floatToHalf(value)), {1}, arg_name); } - + if (elem_type == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) { return ConstantScalarNode(BFloat16(value), {1}, arg_name); } - + return ConstantScalarNode(value, {1}, arg_name); } @@ -244,6 +245,8 @@ class GradientBuilderBase { const ArgDef& reduce_axes, std::vector& output) const; + const std::string& NodeName() const { return node_->Name(); } + private: friend class GradientGraphBuilder;