mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-18 21:21:17 +00:00
Add more loggings for GradientBuilder (#5556)
* Add more loggings for GradientBuilder Co-authored-by: Sherlock Huang <bahuang@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
This commit is contained in:
parent
68fe722691
commit
694a4d6413
3 changed files with 51 additions and 27 deletions
|
|
@ -246,7 +246,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetMatMulGradient) {
|
|||
output_shape.push_back(B_shape[B_shape.size() - 2]);
|
||||
|
||||
std::vector<int64_t> A_axes;
|
||||
ComputeBroadcastBackwardAxes(A_shape, output_shape, &A_axes, nullptr);
|
||||
ComputeBroadcastBackwardAxes(A_shape, output_shape, &A_axes, nullptr, NodeName());
|
||||
|
||||
result.push_back(
|
||||
NodeDef("Transpose",
|
||||
|
|
@ -299,7 +299,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetMatMulGradient) {
|
|||
output_shape.push_back(Y_shape[Y_shape.size() - 1]);
|
||||
|
||||
std::vector<int64_t> B_axes;
|
||||
ComputeBroadcastBackwardAxes(B_shape, output_shape, &B_axes, nullptr);
|
||||
ComputeBroadcastBackwardAxes(B_shape, output_shape, &B_axes, nullptr, NodeName());
|
||||
|
||||
result.push_back(
|
||||
NodeDef("Transpose",
|
||||
|
|
@ -473,7 +473,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetGemmGradient) {
|
|||
std::vector<Dimension> C_shape, dY_shape;
|
||||
if (GetShape(C, C_shape).IsOK() && GetShape(dY, dY_shape).IsOK()) {
|
||||
std::vector<int64_t> C_axes, dY_axes;
|
||||
ComputeBroadcastBackwardAxes(C_shape, dY_shape, &C_axes, &dY_axes);
|
||||
ComputeBroadcastBackwardAxes(C_shape, dY_shape, &C_axes, &dY_axes, NodeName());
|
||||
|
||||
if (C_axes.size() > 0) {
|
||||
HandleBroadcasting(dY, C, IA("dC_reduced"), C_axes, result);
|
||||
|
|
@ -861,7 +861,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetAddSubGradient) {
|
|||
std::vector<Dimension> a_shape, b_shape;
|
||||
if (GetShape(a, a_shape).IsOK() && GetShape(b, b_shape).IsOK()) {
|
||||
std::vector<int64_t> a_axes, b_axes;
|
||||
ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes);
|
||||
ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes, NodeName());
|
||||
if (IsGradientRequiredForSrcNodeInput(0)) {
|
||||
if (a_axes.size() > 0) {
|
||||
HandleBroadcasting(GO(0), a, GI(0), a_axes, output);
|
||||
|
|
@ -932,7 +932,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetMulGradient) {
|
|||
std::vector<Dimension> a_shape, b_shape;
|
||||
if (GetShape(a, a_shape).IsOK() && GetShape(b, b_shape).IsOK()) {
|
||||
std::vector<int64_t> a_axes, b_axes;
|
||||
ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes);
|
||||
ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes, NodeName());
|
||||
|
||||
if (IsGradientRequiredForSrcNodeInput(0)) {
|
||||
output.push_back(
|
||||
|
|
@ -1008,7 +1008,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetDivGradient) {
|
|||
std::vector<Dimension> a_shape, b_shape;
|
||||
if (GetShape(a, a_shape).IsOK() && GetShape(b, b_shape).IsOK()) {
|
||||
std::vector<int64_t> a_axes, b_axes;
|
||||
ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes);
|
||||
ComputeBroadcastBackwardAxes(a_shape, b_shape, &a_axes, &b_axes, NodeName());
|
||||
|
||||
ArgDef tmp_grad = IA("PreReduceGrad0", OType(0));
|
||||
output.push_back(NodeDef("Div", {GO(0), I(1)}, {tmp_grad}));
|
||||
|
|
@ -1066,7 +1066,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetReduceMeanGradient) {
|
|||
|
||||
result.push_back(NodeDef("Size", {I(0)}, {IA("Sized_X")}));
|
||||
result.push_back(NodeDef("Size", {GO(0)}, {IA("Sized_Grad")}));
|
||||
result.push_back(NodeDef("Div",{IA("Sized_X"), IA("Sized_Grad")}, {IA("Scale")}));
|
||||
result.push_back(NodeDef("Div", {IA("Sized_X"), IA("Sized_Grad")}, {IA("Scale")}));
|
||||
result.push_back(NodeDef(OpDef{"Scale", kMSDomain, 1},
|
||||
{grad, IA("Scale")},
|
||||
{IA("Scaled_Grad")},
|
||||
|
|
@ -1234,16 +1234,17 @@ IMPLEMENT_GRADIENT_BUILDER(GetGeluGradient) {
|
|||
namespace {
|
||||
std::vector<NodeDef> GetBiasGeluGradNodes(
|
||||
bool use_approximation,
|
||||
const ArgDef& dY, const ArgDef& X, const ArgDef& B, // inputs
|
||||
const ArgDef& dX, const ArgDef& dB, // outputs
|
||||
const ArgDef& b_axes, const ArgDef& b_shape, const ArgDef& x_shape) { //intermediate args
|
||||
const ArgDef& dY, const ArgDef& X, const ArgDef& B, // inputs
|
||||
const ArgDef& dX, const ArgDef& dB, // outputs
|
||||
const ArgDef& b_axes, const ArgDef& b_shape, const ArgDef& x_shape, //intermediate args
|
||||
const std::string& node_name) {
|
||||
std::vector<Dimension> B_shape, X_shape;
|
||||
if (GetShape(B, B_shape).IsOK() && GetShape(X, X_shape).IsOK()) {
|
||||
ORT_ENFORCE(B_shape.size() == 1, "B must have exactly one dimension.");
|
||||
|
||||
const std::vector<int64_t> B_axes = [&B_shape, &X_shape]() {
|
||||
const std::vector<int64_t> B_axes = [&B_shape, &X_shape, &node_name]() {
|
||||
std::vector<int64_t> result{};
|
||||
ComputeBroadcastBackwardAxes(B_shape, X_shape, &result, nullptr);
|
||||
ComputeBroadcastBackwardAxes(B_shape, X_shape, &result, nullptr, node_name);
|
||||
return result;
|
||||
}();
|
||||
return std::vector<NodeDef>{
|
||||
|
|
@ -1279,7 +1280,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetBiasGeluGradient) {
|
|||
ArgDef b_axes = IA("ReduceAxes_" + B.name);
|
||||
ArgDef b_shape = IA("Shape_" + B.name);
|
||||
ArgDef x_shape = IA("Shape_" + X.name);
|
||||
return GetBiasGeluGradNodes(false, dY, X, B, dX, dB, b_axes, b_shape, x_shape);
|
||||
return GetBiasGeluGradNodes(false, dY, X, B, dX, dB, b_axes, b_shape, x_shape, NodeName());
|
||||
}
|
||||
|
||||
IMPLEMENT_GRADIENT_BUILDER(GetFastGeluGradient) {
|
||||
|
|
@ -1293,8 +1294,9 @@ IMPLEMENT_GRADIENT_BUILDER(GetFastGeluGradient) {
|
|||
ArgDef b_axes = IA("ReduceAxes_" + B.name);
|
||||
ArgDef b_shape = IA("Shape_" + B.name);
|
||||
ArgDef x_shape = IA("Shape_" + X.name);
|
||||
return GetBiasGeluGradNodes(true, dY, X, B, dX, dB, b_axes, b_shape, x_shape);
|
||||
return GetBiasGeluGradNodes(true, dY, X, B, dX, dB, b_axes, b_shape, x_shape, NodeName());
|
||||
}
|
||||
|
||||
if (num_src_node_inputs == 1) { // without bias
|
||||
return std::vector<NodeDef>{
|
||||
NodeDef(OpDef{"FastGeluGrad", kMSDomain, 1},
|
||||
|
|
@ -1432,7 +1434,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetExpandGradient) {
|
|||
std::vector<Dimension> a_shape, y_shape;
|
||||
if (GetShape(a, a_shape).IsOK() && GetShape(y, y_shape).IsOK()) {
|
||||
std::vector<int64_t> a_axes;
|
||||
ComputeBroadcastBackwardAxes(a_shape, y_shape, &a_axes, nullptr);
|
||||
ComputeBroadcastBackwardAxes(a_shape, y_shape, &a_axes, nullptr, NodeName());
|
||||
|
||||
if (a_axes.size() > 0) {
|
||||
HandleBroadcasting(GO(0), a, GI(0), a_axes, output);
|
||||
|
|
@ -1466,8 +1468,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetExpGradient) {
|
|||
IMPLEMENT_GRADIENT_BUILDER(GetFlattenGradient) {
|
||||
return std::vector<NodeDef>{
|
||||
NodeDef("Shape", {I(0)}, {IA("input_shape")}),
|
||||
NodeDef("Reshape", {GO(0), IA("input_shape")}, {GI(0)})
|
||||
};
|
||||
NodeDef("Reshape", {GO(0), IA("input_shape")}, {GI(0)})};
|
||||
}
|
||||
|
||||
IMPLEMENT_GRADIENT_BUILDER(GetTopKGradient) {
|
||||
|
|
|
|||
|
|
@ -9,11 +9,31 @@
|
|||
namespace onnxruntime {
|
||||
namespace training {
|
||||
|
||||
std::string ToString(const std::vector<Dimension>& dims) {
|
||||
std::stringstream output;
|
||||
output << "[";
|
||||
if (!dims.empty()) {
|
||||
for (auto& dim : dims) {
|
||||
if (dim.has_dim_value()) {
|
||||
output << dim.dim_value() << ",";
|
||||
}
|
||||
if (dim.has_dim_param()) {
|
||||
output << dim.dim_param() << ",";
|
||||
}
|
||||
}
|
||||
output.seekp(-1, output.cur);
|
||||
}
|
||||
output << "]";
|
||||
|
||||
return output.str();
|
||||
}
|
||||
|
||||
void ComputeBroadcastBackwardAxes(
|
||||
const std::vector<Dimension>& A_dims,
|
||||
const std::vector<Dimension>& B_dims,
|
||||
std::vector<int64_t>* A_axes,
|
||||
std::vector<int64_t>* B_axes) {
|
||||
std::vector<int64_t>* B_axes,
|
||||
const std::string& node_name) {
|
||||
if (A_axes) A_axes->clear();
|
||||
if (B_axes) B_axes->clear();
|
||||
|
||||
|
|
@ -39,16 +59,16 @@ void ComputeBroadcastBackwardAxes(
|
|||
auto A_dim = A_dims[i].dim_param(),
|
||||
B_dim = B_dims[j].dim_param();
|
||||
if (A_dim != B_dim) {
|
||||
ORT_THROW("Error: symbolic dimension doesn't match. Expect the same symbolic but got \"",
|
||||
A_dim, "\" and \"", B_dim, "\".");
|
||||
ORT_THROW("Gradient building error for node ", node_name, ": symbolic dimension doesn't match. ",
|
||||
"A_dims:", ToString(A_dims), ", B_dims:", ToString(B_dims));
|
||||
}
|
||||
} else if (A_dims[i].has_dim_param() && B_dims[j].has_dim_value()) {
|
||||
auto A_dim = A_dims[i].dim_param();
|
||||
auto B_dim = B_dims[j].dim_value();
|
||||
|
||||
if (B_dim != 1) {
|
||||
ORT_THROW("Error: symbolic broadcasting requires the corresponding dimension to be 1. ",
|
||||
"Actually got ", B_dim);
|
||||
ORT_THROW("Gradient building error for node ", node_name, ": symbolic broadcasting requires the B_dimension to be 1. ",
|
||||
"A_dims:", ToString(A_dims), ", B_dims:", ToString(B_dims));
|
||||
}
|
||||
if (B_axes) {
|
||||
B_axes->push_back(gsl::narrow_cast<int64_t>(k));
|
||||
|
|
@ -58,8 +78,8 @@ void ComputeBroadcastBackwardAxes(
|
|||
auto B_dim = B_dims[i].dim_param();
|
||||
|
||||
if (A_dim != 1) {
|
||||
ORT_THROW("Error: symbolic broadcasting requires the corresponding dimension to be 1. ",
|
||||
"Actually got ", A_dim);
|
||||
ORT_THROW("Gradient building error for node ", node_name, ": symbolic broadcasting requires the A_dimension to be 1. ",
|
||||
"A_dims:", ToString(A_dims), ", B_dims:", ToString(B_dims));
|
||||
}
|
||||
if (A_axes) {
|
||||
A_axes->push_back(gsl::narrow_cast<int64_t>(k));
|
||||
|
|
|
|||
|
|
@ -22,7 +22,8 @@ void ComputeBroadcastBackwardAxes(
|
|||
const std::vector<Dimension>& A_dims,
|
||||
const std::vector<Dimension>& B_dims,
|
||||
std::vector<int64_t>* A_axes,
|
||||
std::vector<int64_t>* B_axes);
|
||||
std::vector<int64_t>* B_axes,
|
||||
const std::string& node_name = "");
|
||||
|
||||
void ComputeBroadcastBackwardAxesDynamic(const ArgDef& a,
|
||||
const ArgDef& b,
|
||||
|
|
@ -211,11 +212,11 @@ class GradientBuilderBase {
|
|||
if (elem_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
|
||||
return ConstantScalarNode(MLFloat16(math::floatToHalf(value)), {1}, arg_name);
|
||||
}
|
||||
|
||||
|
||||
if (elem_type == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) {
|
||||
return ConstantScalarNode(BFloat16(value), {1}, arg_name);
|
||||
}
|
||||
|
||||
|
||||
return ConstantScalarNode(value, {1}, arg_name);
|
||||
}
|
||||
|
||||
|
|
@ -244,6 +245,8 @@ class GradientBuilderBase {
|
|||
const ArgDef& reduce_axes,
|
||||
std::vector<NodeDef>& output) const;
|
||||
|
||||
// Returns the name reported by node_ (node_->Name()). The gradient builders
// pass it to ComputeBroadcastBackwardAxes so error messages identify the node.
// NOTE(review): dereferences node_ without a null check - presumably node_ is
// always set for a constructed builder; confirm where node_ is initialized.
const std::string& NodeName() const { return node_->Name(); }
|
||||
|
||||
private:
|
||||
friend class GradientGraphBuilder;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue