From 694a4d6413eb2b9450835ccdb5bd2dc4c5ce2238 Mon Sep 17 00:00:00 2001
From: Sherlock <baihan.huang@gmail.com>
Date: Mon, 26 Oct 2020 15:15:52 -0700
Subject: [PATCH] Add more logging for GradientBuilder (#5556)

* Add more logging for GradientBuilder: broadcast-axes errors now report the node name and both input shapes

Co-authored-by: Sherlock Huang <bahuang@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
---
 .../core/graph/gradient_builder.cc            | 35 ++++++++++---------
 .../core/graph/gradient_builder_base.cc       | 34 ++++++++++++++----
 .../core/graph/gradient_builder_base.h        |  9 +++--
 3 files changed, 51 insertions(+), 27 deletions(-)
diff --git a/orttraining/orttraining/core/graph/gradient_builder.cc b/orttraining/orttraining/core/graph/gradient_builder.cc
index 1a04100a9e..9ba5655493 100644
--- a/orttraining/orttraining/core/graph/gradient_builder.cc
+++ b/orttraining/orttraining/core/graph/gradient_builder.cc
@@ -246,7 +246,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetMatMulGradient) {
         output_shape.push_back(B_shape[B_shape.size() - 2]);
 
         std::vector<int64_t> A_axes;
-        ComputeBroadcastBackwardAxes(A_shape, output_shape, &A_axes, nullptr);
+        ComputeBroadcastBackwardAxes(A_shape, output_shape, &A_axes, nullptr, NodeName());
 
         result.push_back(
             NodeDef("Transpose",
@@ -299,7 +299,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetMatMulGradient) {
           output_shape.push_back(Y_shape[Y_shape.size() - 1]);
 
           std::vector<int64_t> B_axes;
-          ComputeBroadcastBackwardAxes(B_shape, output_shape, &B_axes, nullptr);
+          ComputeBroadcastBackwardAxes(B_shape, output_shape, &B_axes, nullptr, NodeName());
 
           result.push_back(
               NodeDef("Transpose",
@@ -473,7 +473,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetGemmGradient) {
     std::vector<Dimension> C_shape, dY_shape;
     if (GetShape(C, C_shape).IsOK() && GetShape(dY, dY_shape).IsOK()) {
       std::vector<int64_t> C_axes, dY_axes;
-      ComputeBroadcastBackwardAxes(C_shape, dY_shape, &C_axes, &dY_axes);
+      ComputeBroadcastBackwardAxes(C_shape, dY_shape, &C_axes, &dY_axes, NodeName());
 
       if (C_axes.size() > 0) {
         HandleBroadcasting(dY, C, IA("dC_reduced"), C_axes, result);
@@ -861,7 +861,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetAddSubGradient) {
   std::vector<Dimension> a_shape, b_shape;
   if (GetShape(a, a_shape).IsOK() && GetShape(b, b_shape).IsOK()) {
     std::vector<int64_t> a_axes, b_axes;
-    ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes);
+    ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes, NodeName());
     if (IsGradientRequiredForSrcNodeInput(0)) {
       if (a_axes.size() > 0) {
         HandleBroadcasting(GO(0), a, GI(0), a_axes, output);
@@ -932,7 +932,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetMulGradient) {
   std::vector<Dimension> a_shape, b_shape;
   if (GetShape(a, a_shape).IsOK() && GetShape(b, b_shape).IsOK()) {
     std::vector<int64_t> a_axes, b_axes;
-    ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes);
+    ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes, NodeName());
 
     if (IsGradientRequiredForSrcNodeInput(0)) {
       output.push_back(
@@ -1008,7 +1008,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetDivGradient) {
     std::vector<Dimension> a_shape, b_shape;
     if (GetShape(a, a_shape).IsOK() && GetShape(b, b_shape).IsOK()) {
       std::vector<int64_t> a_axes, b_axes;
-      ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes);
+      ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes, NodeName());
 
       ArgDef tmp_grad = IA("PreReduceGrad0", OType(0));
       output.push_back(NodeDef("Div", {GO(0), I(1)}, {tmp_grad}));
@@ -1066,7 +1066,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetReduceMeanGradient) {
 
   result.push_back(NodeDef("Size", {I(0)}, {IA("Sized_X")}));
   result.push_back(NodeDef("Size", {GO(0)}, {IA("Sized_Grad")}));
-  result.push_back(NodeDef("Div",{IA("Sized_X"), IA("Sized_Grad")}, {IA("Scale")}));
+  result.push_back(NodeDef("Div", {IA("Sized_X"), IA("Sized_Grad")}, {IA("Scale")}));
   result.push_back(NodeDef(OpDef{"Scale", kMSDomain, 1},
                            {grad, IA("Scale")},
                            {IA("Scaled_Grad")},
@@ -1234,16 +1234,17 @@ IMPLEMENT_GRADIENT_BUILDER(GetGeluGradient) {
 namespace {
 std::vector<NodeDef> GetBiasGeluGradNodes(
     bool use_approximation,
-    const ArgDef& dY, const ArgDef& X, const ArgDef& B,                    // inputs
-    const ArgDef& dX, const ArgDef& dB,                                    // outputs
-    const ArgDef& b_axes, const ArgDef& b_shape, const ArgDef& x_shape) {  //intermediate args
+    const ArgDef& dY, const ArgDef& X, const ArgDef& B,  // inputs
+    const ArgDef& dX, const ArgDef& dB,                  // outputs
+    const ArgDef& b_axes, const ArgDef& b_shape, const ArgDef& x_shape,  //intermediate args
+    const std::string& node_name) {
   std::vector<Dimension> B_shape, X_shape;
   if (GetShape(B, B_shape).IsOK() && GetShape(X, X_shape).IsOK()) {
     ORT_ENFORCE(B_shape.size() == 1, "B must have exactly one dimension.");
 
-    const std::vector<int64_t> B_axes = [&B_shape, &X_shape]() {
+    const std::vector<int64_t> B_axes = [&B_shape, &X_shape, &node_name]() {
       std::vector<int64_t> result{};
-      ComputeBroadcastBackwardAxes(B_shape, X_shape, &result, nullptr);
+      ComputeBroadcastBackwardAxes(B_shape, X_shape, &result, nullptr, node_name);
       return result;
     }();
     return std::vector<NodeDef>{
@@ -1279,7 +1280,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetBiasGeluGradient) {
   ArgDef b_axes = IA("ReduceAxes_" + B.name);
   ArgDef b_shape = IA("Shape_" + B.name);
   ArgDef x_shape = IA("Shape_" + X.name);
-  return GetBiasGeluGradNodes(false, dY, X, B, dX, dB, b_axes, b_shape, x_shape);
+  return GetBiasGeluGradNodes(false, dY, X, B, dX, dB, b_axes, b_shape, x_shape, NodeName());
 }
 
 IMPLEMENT_GRADIENT_BUILDER(GetFastGeluGradient) {
@@ -1293,8 +1294,9 @@ IMPLEMENT_GRADIENT_BUILDER(GetFastGeluGradient) {
     ArgDef b_axes = IA("ReduceAxes_" + B.name);
     ArgDef b_shape = IA("Shape_" + B.name);
     ArgDef x_shape = IA("Shape_" + X.name);
-    return GetBiasGeluGradNodes(true, dY, X, B, dX, dB, b_axes, b_shape, x_shape);
+    return GetBiasGeluGradNodes(true, dY, X, B, dX, dB, b_axes, b_shape, x_shape, NodeName());
   }
+  
   if (num_src_node_inputs == 1) {  // without bias
     return std::vector<NodeDef>{
         NodeDef(OpDef{"FastGeluGrad", kMSDomain, 1},
@@ -1432,7 +1434,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetExpandGradient) {
   std::vector<Dimension> a_shape, y_shape;
   if (GetShape(a, a_shape).IsOK() && GetShape(y, y_shape).IsOK()) {
     std::vector<int64_t> a_axes;
-    ComputeBroadcastBackwardAxes(a_shape, y_shape, &a_axes, nullptr);
+    ComputeBroadcastBackwardAxes(a_shape, y_shape, &a_axes, nullptr, NodeName());
 
     if (a_axes.size() > 0) {
       HandleBroadcasting(GO(0), a, GI(0), a_axes, output);
@@ -1466,8 +1468,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetExpGradient) {
 IMPLEMENT_GRADIENT_BUILDER(GetFlattenGradient) {
   return std::vector<NodeDef>{
       NodeDef("Shape", {I(0)}, {IA("input_shape")}),
-      NodeDef("Reshape", {GO(0), IA("input_shape")}, {GI(0)})
-  };
+      NodeDef("Reshape", {GO(0), IA("input_shape")}, {GI(0)})};  // reshape GO(0) with the "Shape" output of I(0) to produce GI(0)
 }
 
 IMPLEMENT_GRADIENT_BUILDER(GetTopKGradient) {
diff --git a/orttraining/orttraining/core/graph/gradient_builder_base.cc b/orttraining/orttraining/core/graph/gradient_builder_base.cc
index 5c5b7b3443..0c2b1b0f9e 100644
--- a/orttraining/orttraining/core/graph/gradient_builder_base.cc
+++ b/orttraining/orttraining/core/graph/gradient_builder_base.cc
@@ -9,11 +9,31 @@
 namespace onnxruntime {
 namespace training {
 
+// Formats dims as "[d0,d1,...]" for error messages; a dim with neither a value nor a symbol prints as "?".
+std::string ToString(const std::vector<Dimension>& dims) {
+  std::stringstream output;
+  output << "[";
+  for (size_t i = 0; i < dims.size(); ++i) {
+    if (i > 0) output << ",";  // separator goes before every entry but the first, so nothing needs trimming
+    if (dims[i].has_dim_value()) {
+      output << dims[i].dim_value();
+    } else if (dims[i].has_dim_param()) {
+      output << dims[i].dim_param();
+    } else {
+      // Keep a placeholder so the printed rank still matches the real rank.
+      output << "?";
+    }
+  }
+  output << "]";
+  return output.str();
+}
+
 void ComputeBroadcastBackwardAxes(
     const std::vector<Dimension>& A_dims,
     const std::vector<Dimension>& B_dims,
     std::vector<int64_t>* A_axes,
-    std::vector<int64_t>* B_axes) {
+    std::vector<int64_t>* B_axes,
+    const std::string& node_name) {
   if (A_axes) A_axes->clear();
   if (B_axes) B_axes->clear();
 
@@ -39,16 +59,16 @@ void ComputeBroadcastBackwardAxes(
       auto A_dim = A_dims[i].dim_param(),
            B_dim = B_dims[j].dim_param();
       if (A_dim != B_dim) {
-        ORT_THROW("Error: symbolic dimension doesn't match. Expect the same symbolic but got \"",
-                  A_dim, "\" and \"", B_dim, "\".");
+        ORT_THROW("Gradient building error for node ", node_name, ": symbolic dimension doesn't match. ",
+                  "A_dims:", ToString(A_dims), ", B_dims:", ToString(B_dims));
       }
     } else if (A_dims[i].has_dim_param() && B_dims[j].has_dim_value()) {
       auto A_dim = A_dims[i].dim_param();
       auto B_dim = B_dims[j].dim_value();
 
       if (B_dim != 1) {
-        ORT_THROW("Error: symbolic broadcasting requires the corresponding dimension to be 1. ",
-                  "Actually got ", B_dim);
+        ORT_THROW("Gradient building error for node ", node_name, ": symbolic broadcasting requires the B_dimension to be 1. ",
+                  "A_dims:", ToString(A_dims), ", B_dims:", ToString(B_dims));
       }
       if (B_axes) {
         B_axes->push_back(gsl::narrow_cast<int64_t>(k));
@@ -58,8 +78,8 @@ void ComputeBroadcastBackwardAxes(
       auto B_dim = B_dims[i].dim_param();
 
       if (A_dim != 1) {
-        ORT_THROW("Error: symbolic broadcasting requires the corresponding dimension to be 1. ",
-                  "Actually got ", A_dim);
+        ORT_THROW("Gradient building error for node ", node_name, ": symbolic broadcasting requires the A_dimension to be 1. ",
+                  "A_dims:", ToString(A_dims), ", B_dims:", ToString(B_dims));
       }
       if (A_axes) {
         A_axes->push_back(gsl::narrow_cast<int64_t>(k));
diff --git a/orttraining/orttraining/core/graph/gradient_builder_base.h b/orttraining/orttraining/core/graph/gradient_builder_base.h
index cf8a496f69..10eab0412a 100644
--- a/orttraining/orttraining/core/graph/gradient_builder_base.h
+++ b/orttraining/orttraining/core/graph/gradient_builder_base.h
@@ -22,7 +22,8 @@ void ComputeBroadcastBackwardAxes(
     const std::vector<Dimension>& A_dims,
     const std::vector<Dimension>& B_dims,
     std::vector<int64_t>* A_axes,
-    std::vector<int64_t>* B_axes);
+    std::vector<int64_t>* B_axes,
+    const std::string& node_name = "");
 
 void ComputeBroadcastBackwardAxesDynamic(const ArgDef& a,
                                          const ArgDef& b,
@@ -211,11 +212,11 @@ class GradientBuilderBase {
     if (elem_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
       return ConstantScalarNode(MLFloat16(math::floatToHalf(value)), {1}, arg_name);
     }
-    
+
     if (elem_type == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) {
       return ConstantScalarNode(BFloat16(value), {1}, arg_name);
     }
-    
+
     return ConstantScalarNode(value, {1}, arg_name);
   }
 
@@ -244,6 +245,8 @@ class GradientBuilderBase {
                                  const ArgDef& reduce_axes,
                                  std::vector<NodeDef>& output) const;
 
+  const std::string& NodeName() const { return node_->Name(); }  // Returns node_->Name(); gradient builders pass it on to tag error messages.
+
  private:
   friend class GradientGraphBuilder;