Add more logging for GradientBuilder (#5556)

* Add more logging for GradientBuilder

Co-authored-by: Sherlock Huang <bahuang@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
This commit is contained in:
Sherlock 2020-10-26 15:15:52 -07:00 committed by GitHub
parent 68fe722691
commit 694a4d6413
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 51 additions and 27 deletions

View file

@ -246,7 +246,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetMatMulGradient) {
output_shape.push_back(B_shape[B_shape.size() - 2]);
std::vector<int64_t> A_axes;
ComputeBroadcastBackwardAxes(A_shape, output_shape, &A_axes, nullptr);
ComputeBroadcastBackwardAxes(A_shape, output_shape, &A_axes, nullptr, NodeName());
result.push_back(
NodeDef("Transpose",
@ -299,7 +299,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetMatMulGradient) {
output_shape.push_back(Y_shape[Y_shape.size() - 1]);
std::vector<int64_t> B_axes;
ComputeBroadcastBackwardAxes(B_shape, output_shape, &B_axes, nullptr);
ComputeBroadcastBackwardAxes(B_shape, output_shape, &B_axes, nullptr, NodeName());
result.push_back(
NodeDef("Transpose",
@ -473,7 +473,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetGemmGradient) {
std::vector<Dimension> C_shape, dY_shape;
if (GetShape(C, C_shape).IsOK() && GetShape(dY, dY_shape).IsOK()) {
std::vector<int64_t> C_axes, dY_axes;
ComputeBroadcastBackwardAxes(C_shape, dY_shape, &C_axes, &dY_axes);
ComputeBroadcastBackwardAxes(C_shape, dY_shape, &C_axes, &dY_axes, NodeName());
if (C_axes.size() > 0) {
HandleBroadcasting(dY, C, IA("dC_reduced"), C_axes, result);
@ -861,7 +861,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetAddSubGradient) {
std::vector<Dimension> a_shape, b_shape;
if (GetShape(a, a_shape).IsOK() && GetShape(b, b_shape).IsOK()) {
std::vector<int64_t> a_axes, b_axes;
ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes);
ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes, NodeName());
if (IsGradientRequiredForSrcNodeInput(0)) {
if (a_axes.size() > 0) {
HandleBroadcasting(GO(0), a, GI(0), a_axes, output);
@ -932,7 +932,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetMulGradient) {
std::vector<Dimension> a_shape, b_shape;
if (GetShape(a, a_shape).IsOK() && GetShape(b, b_shape).IsOK()) {
std::vector<int64_t> a_axes, b_axes;
ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes);
ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes, NodeName());
if (IsGradientRequiredForSrcNodeInput(0)) {
output.push_back(
@ -1008,7 +1008,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetDivGradient) {
std::vector<Dimension> a_shape, b_shape;
if (GetShape(a, a_shape).IsOK() && GetShape(b, b_shape).IsOK()) {
std::vector<int64_t> a_axes, b_axes;
ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes);
ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes, NodeName());
ArgDef tmp_grad = IA("PreReduceGrad0", OType(0));
output.push_back(NodeDef("Div", {GO(0), I(1)}, {tmp_grad}));
@ -1066,7 +1066,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetReduceMeanGradient) {
result.push_back(NodeDef("Size", {I(0)}, {IA("Sized_X")}));
result.push_back(NodeDef("Size", {GO(0)}, {IA("Sized_Grad")}));
result.push_back(NodeDef("Div",{IA("Sized_X"), IA("Sized_Grad")}, {IA("Scale")}));
result.push_back(NodeDef("Div", {IA("Sized_X"), IA("Sized_Grad")}, {IA("Scale")}));
result.push_back(NodeDef(OpDef{"Scale", kMSDomain, 1},
{grad, IA("Scale")},
{IA("Scaled_Grad")},
@ -1234,16 +1234,17 @@ IMPLEMENT_GRADIENT_BUILDER(GetGeluGradient) {
namespace {
std::vector<NodeDef> GetBiasGeluGradNodes(
bool use_approximation,
const ArgDef& dY, const ArgDef& X, const ArgDef& B, // inputs
const ArgDef& dX, const ArgDef& dB, // outputs
const ArgDef& b_axes, const ArgDef& b_shape, const ArgDef& x_shape) { //intermediate args
const ArgDef& dY, const ArgDef& X, const ArgDef& B, // inputs
const ArgDef& dX, const ArgDef& dB, // outputs
const ArgDef& b_axes, const ArgDef& b_shape, const ArgDef& x_shape, //intermediate args
const std::string& node_name) {
std::vector<Dimension> B_shape, X_shape;
if (GetShape(B, B_shape).IsOK() && GetShape(X, X_shape).IsOK()) {
ORT_ENFORCE(B_shape.size() == 1, "B must have exactly one dimension.");
const std::vector<int64_t> B_axes = [&B_shape, &X_shape]() {
const std::vector<int64_t> B_axes = [&B_shape, &X_shape, &node_name]() {
std::vector<int64_t> result{};
ComputeBroadcastBackwardAxes(B_shape, X_shape, &result, nullptr);
ComputeBroadcastBackwardAxes(B_shape, X_shape, &result, nullptr, node_name);
return result;
}();
return std::vector<NodeDef>{
@ -1279,7 +1280,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetBiasGeluGradient) {
ArgDef b_axes = IA("ReduceAxes_" + B.name);
ArgDef b_shape = IA("Shape_" + B.name);
ArgDef x_shape = IA("Shape_" + X.name);
return GetBiasGeluGradNodes(false, dY, X, B, dX, dB, b_axes, b_shape, x_shape);
return GetBiasGeluGradNodes(false, dY, X, B, dX, dB, b_axes, b_shape, x_shape, NodeName());
}
IMPLEMENT_GRADIENT_BUILDER(GetFastGeluGradient) {
@ -1293,8 +1294,9 @@ IMPLEMENT_GRADIENT_BUILDER(GetFastGeluGradient) {
ArgDef b_axes = IA("ReduceAxes_" + B.name);
ArgDef b_shape = IA("Shape_" + B.name);
ArgDef x_shape = IA("Shape_" + X.name);
return GetBiasGeluGradNodes(true, dY, X, B, dX, dB, b_axes, b_shape, x_shape);
return GetBiasGeluGradNodes(true, dY, X, B, dX, dB, b_axes, b_shape, x_shape, NodeName());
}
if (num_src_node_inputs == 1) { // without bias
return std::vector<NodeDef>{
NodeDef(OpDef{"FastGeluGrad", kMSDomain, 1},
@ -1432,7 +1434,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetExpandGradient) {
std::vector<Dimension> a_shape, y_shape;
if (GetShape(a, a_shape).IsOK() && GetShape(y, y_shape).IsOK()) {
std::vector<int64_t> a_axes;
ComputeBroadcastBackwardAxes(a_shape, y_shape, &a_axes, nullptr);
ComputeBroadcastBackwardAxes(a_shape, y_shape, &a_axes, nullptr, NodeName());
if (a_axes.size() > 0) {
HandleBroadcasting(GO(0), a, GI(0), a_axes, output);
@ -1466,8 +1468,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetExpGradient) {
IMPLEMENT_GRADIENT_BUILDER(GetFlattenGradient) {
return std::vector<NodeDef>{
NodeDef("Shape", {I(0)}, {IA("input_shape")}),
NodeDef("Reshape", {GO(0), IA("input_shape")}, {GI(0)})
};
NodeDef("Reshape", {GO(0), IA("input_shape")}, {GI(0)})};
}
IMPLEMENT_GRADIENT_BUILDER(GetTopKGradient) {

View file

@ -9,11 +9,31 @@
namespace onnxruntime {
namespace training {
// Formats a list of shape dimensions as a bracketed, comma-separated string
// for use in diagnostic messages, e.g. "[batch,128,?]".
//
// Each dimension is rendered as:
//   - its concrete value when has_dim_value() is true,
//   - otherwise its symbolic name when has_dim_param() is true,
//   - otherwise "?" so that a dimension carrying neither is still visible and
//     the printed rank always equals dims.size().
//
// An empty list yields "[]".
//
// NOTE(review): assumes a dimension never reports both a value and a param at
// the same time (value wins here if it ever does) - confirm against the
// Dimension type's definition.
std::string ToString(const std::vector<Dimension>& dims) {
  std::stringstream output;
  output << "[";
  // The separator is written *before* every element except the first. This
  // avoids the previous "emit trailing comma, then seek back one character"
  // approach, which overwrote the opening '[' whenever no dimension produced
  // any text.
  const char* separator = "";
  for (const auto& dim : dims) {
    output << separator;
    if (dim.has_dim_value()) {
      output << dim.dim_value();
    } else if (dim.has_dim_param()) {
      output << dim.dim_param();
    } else {
      output << "?";
    }
    separator = ",";
  }
  output << "]";
  return output.str();
}
void ComputeBroadcastBackwardAxes(
const std::vector<Dimension>& A_dims,
const std::vector<Dimension>& B_dims,
std::vector<int64_t>* A_axes,
std::vector<int64_t>* B_axes) {
std::vector<int64_t>* B_axes,
const std::string& node_name) {
if (A_axes) A_axes->clear();
if (B_axes) B_axes->clear();
@ -39,16 +59,16 @@ void ComputeBroadcastBackwardAxes(
auto A_dim = A_dims[i].dim_param(),
B_dim = B_dims[j].dim_param();
if (A_dim != B_dim) {
ORT_THROW("Error: symbolic dimension doesn't match. Expect the same symbolic but got \"",
A_dim, "\" and \"", B_dim, "\".");
ORT_THROW("Gradient building error for node ", node_name, ": symbolic dimension doesn't match. ",
"A_dims:", ToString(A_dims), ", B_dims:", ToString(B_dims));
}
} else if (A_dims[i].has_dim_param() && B_dims[j].has_dim_value()) {
auto A_dim = A_dims[i].dim_param();
auto B_dim = B_dims[j].dim_value();
if (B_dim != 1) {
ORT_THROW("Error: symbolic broadcasting requires the corresponding dimension to be 1. ",
"Actually got ", B_dim);
ORT_THROW("Gradient building error for node ", node_name, ": symbolic broadcasting requires the B_dimension to be 1. ",
"A_dims:", ToString(A_dims), ", B_dims:", ToString(B_dims));
}
if (B_axes) {
B_axes->push_back(gsl::narrow_cast<int64_t>(k));
@ -58,8 +78,8 @@ void ComputeBroadcastBackwardAxes(
auto B_dim = B_dims[i].dim_param();
if (A_dim != 1) {
ORT_THROW("Error: symbolic broadcasting requires the corresponding dimension to be 1. ",
"Actually got ", A_dim);
ORT_THROW("Gradient building error for node ", node_name, ": symbolic broadcasting requires the A_dimension to be 1. ",
"A_dims:", ToString(A_dims), ", B_dims:", ToString(B_dims));
}
if (A_axes) {
A_axes->push_back(gsl::narrow_cast<int64_t>(k));

View file

@ -22,7 +22,8 @@ void ComputeBroadcastBackwardAxes(
const std::vector<Dimension>& A_dims,
const std::vector<Dimension>& B_dims,
std::vector<int64_t>* A_axes,
std::vector<int64_t>* B_axes);
std::vector<int64_t>* B_axes,
const std::string& node_name = "");
void ComputeBroadcastBackwardAxesDynamic(const ArgDef& a,
const ArgDef& b,
@ -211,11 +212,11 @@ class GradientBuilderBase {
if (elem_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
return ConstantScalarNode(MLFloat16(math::floatToHalf(value)), {1}, arg_name);
}
if (elem_type == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) {
return ConstantScalarNode(BFloat16(value), {1}, arg_name);
}
return ConstantScalarNode(value, {1}, arg_name);
}
@ -244,6 +245,8 @@ class GradientBuilderBase {
const ArgDef& reduce_axes,
std::vector<NodeDef>& output) const;
// Returns the name of the node held in node_ (forwards node_->Name()).
// Gradient builders pass this to helpers such as ComputeBroadcastBackwardAxes
// so that error messages can identify which node failed.
const std::string& NodeName() const { return node_->Name(); }
private:
friend class GradientGraphBuilder;