Speed Up GradientChecker Running (#11579)

* fix gradient tester

* test size adjust

* fix win build
This commit is contained in:
Vincent Wang 2022-05-27 15:14:53 +08:00 committed by GitHub
parent 6a45f9f059
commit eadb1a3128
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 739 additions and 917 deletions

View file

@ -15,18 +15,38 @@ limitations under the License.
/* Modifications Copyright (c) Microsoft. */
#include "gradient_checker.h"
#include "gradient_op_test_utils.h"
#include "orttraining/test/gradient/gradient_checker.h"
#include <random>
#include "orttraining/test/gradient/gradient_op_test_utils.h"
#include "orttraining/core/framework/gradient_graph_builder.h"
#include "orttraining/core/graph/gradient_config.h"
#include "test/util/include/test_random_seed.h"
#include <random>
#include "test/util/include/default_providers.h"
namespace onnxruntime {
namespace test {
using ONNX_NAMESPACE::AttributeProto;
using training::OpDef;
namespace {

/// Builds the list of execution providers used to run the gradient checker's graphs.
///
/// The CPU provider is always the first entry. Unless `cpu_only` is set, the CUDA and/or ROCm
/// providers are appended when the build defines USE_CUDA / USE_ROCM respectively.
///
/// @param cpu_only  When true, return only the CPU execution provider.
/// @return The providers in the order CPU, then CUDA (if built in), then ROCm (if built in).
std::vector<std::unique_ptr<IExecutionProvider>> GetExecutionProviders(bool cpu_only = false) {
  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
  execution_providers.push_back(DefaultCpuExecutionProvider());
  if (cpu_only) {
    return execution_providers;
  }

#ifdef USE_CUDA
  execution_providers.push_back(DefaultCudaExecutionProvider());
#endif
#ifdef USE_ROCM
  execution_providers.push_back(DefaultRocmExecutionProvider());
#endif
  return execution_providers;
}

}  // namespace
// The jacobian transpose matrix is laid out as follows
// Say there are three inputs each of size M X N, N X K, K X J
@ -37,131 +57,118 @@ using training::OpDef;
// | N X K | | |
// | K X J | | |
// V
std::pair<int, int> inline CalculateJacobianTransposeIndex(const std::vector<TensorInfo>& x_infos,
int x_input_index,
int x_flattened_index,
const std::vector<TensorInfo>& y_infos,
int y_output_index,
int y_flattened_index) {
int64_t elems_in_prev_output_tensors = 0;
for (int i = 0; i < y_output_index; i++) {
elems_in_prev_output_tensors += y_infos[i].shape.Size();
// The Jacobian is always a real-valued matrix.
// Given y = f(x) for tensors y and x, it contains the derivatives dy_i/dx_j for
// every pair y_i in y and x_j in x. Note that the Jacobian is defined directly
// over the elements of tensors y and x, and doesn't depend on their shapes.
//
// If x = (x_1, x_2, ..., x_m) and y = (y_1, y_2, ..., y_n), the matrix evaluated
// is actually the Jacobian transpose, defined as this m x n matrix:
// dy_1/dx_1 dy_2/dx_1 ... dy_n/dx_1
// dy_1/dx_2 dy_2/dx_2 ... dy_n/dx_2
// .
// .
// .
// dy_1/dx_m dy_2/dx_m ... dy_n/dx_m
template <typename X_T, typename Y_T, typename JAC_T>
inline void GradientChecker<X_T, Y_T, JAC_T>::InitJacobians(size_t row_count, size_t col_count,
std::vector<std::vector<JAC_T>>* jacobians) {
// the number of rows is equal to total number of scalar input values in all of input vectors
jacobians->resize(row_count);
// the number of cols is equal to total number of scalar output values in all of output vectors
for (size_t i = 0; i < row_count; ++i) {
(*jacobians)[i] = std::vector<JAC_T>(col_count, 0);
}
int64_t col = elems_in_prev_output_tensors + y_flattened_index;
int64_t elems_in_prev_input_tensors = 0;
for (int i = 0; i < x_input_index; i++) {
elems_in_prev_input_tensors += x_infos[i].shape.Size();
}
int64_t row = elems_in_prev_input_tensors + x_flattened_index;
return {gsl::narrow_cast<int>(row), gsl::narrow_cast<int>(col)};
}
template <typename X_T, typename Y_T, typename JAC_T>
inline std::vector<OrtValue> GradientChecker<X_T, Y_T, JAC_T>::EvaluateFunctionAtInput(
OpTester& op_session,
const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos,
std::vector<std::vector<X_T>>* x_datas,
std::vector<std::vector<Y_T>>* y_datas) {
// clear OpTester input/output/initializer_index
op_session.ClearData();
OpTester& op_session, const std::vector<TensorInfo>& x_infos, const std::vector<TensorInfo>& y_infos,
std::vector<std::vector<X_T>>* x_datas, std::vector<std::vector<Y_T>>* y_datas) {
AddDatas(op_session, x_infos, y_infos, x_datas, y_datas);
for (size_t data_index = 0; data_index < x_datas->size(); data_index++) {
// If no EPs are specified, OpTester runs on every available EP and keeps only the outputs of the last run
// as the actual output data, which wastes time. The numeric Jacobian only needs the forward graph outputs,
// so running on the CPU EP alone is sufficient.
std::vector<std::unique_ptr<IExecutionProvider>> execution_providers = GetExecutionProviders(true);
op_session.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
return op_session.GetFetches();
}
template <typename X_T, typename Y_T, typename JAC_T>
inline void GradientChecker<X_T, Y_T, JAC_T>::AddDatas(OpTester& op_session, const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos,
std::vector<std::vector<X_T>>* x_datas,
std::vector<std::vector<Y_T>>* y_datas) {
op_session.ClearData();
for (size_t data_index = 0; data_index < x_datas->size(); ++data_index) {
std::string name = "input" + std::to_string(data_index);
const std::vector<X_T>& data = (*x_datas)[data_index];
if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType<int64_t>()) {
std::vector<int64_t> int64_data(data.size());
std::transform(data.begin(), data.end(), int64_data.begin(), [](X_T x) { return static_cast<int64_t>(x); });
op_session.AddInput<int64_t>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), int64_data);
op_session.AddInput<int64_t>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), int64_data, false,
&x_infos[data_index].dim_params);
} else if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType<int32_t>()) {
std::vector<int32_t> int32_data(data.size());
std::transform(data.begin(), data.end(), int32_data.begin(), [](X_T x) { return static_cast<int32_t>(x); });
op_session.AddInput<int32_t>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), int32_data);
op_session.AddInput<int32_t>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), int32_data, false,
&x_infos[data_index].dim_params);
} else if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType<bool>()) {
std::unique_ptr<bool[]> p_data(new bool[data.size()]);
for (size_t i = 0; i < data.size(); ++i) {
p_data[i] = static_cast<bool>(data[i]);
}
op_session.AddInput<bool>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), p_data.get(), data.size());
op_session.AddInput<bool>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), p_data.get(), data.size(),
false, &x_infos[data_index].dim_params);
} else {
op_session.AddInput<X_T>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), data);
op_session.AddInput<X_T>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), data, false,
&x_infos[data_index].dim_params);
}
}
for (size_t data_index = 0; data_index < y_infos.size(); data_index++) {
for (size_t data_index = 0; data_index < y_infos.size(); ++data_index) {
std::string name = "output" + std::to_string(data_index);
op_session.AddOutput<Y_T>(name.c_str(), y_infos[data_index].shape.AsShapeVector(), (*y_datas)[data_index]);
const std::vector<Y_T>& data = (*y_datas)[data_index];
if (y_infos[data_index].data_type == DataTypeImpl::GetTensorType<int64_t>()) {
std::vector<int64_t> int64_data(data.size());
std::transform(data.begin(), data.end(), int64_data.begin(), [](Y_T x) { return static_cast<int64_t>(x); });
op_session.AddOutput<int64_t>(name.c_str(), y_infos[data_index].shape.AsShapeVector(), int64_data);
} else {
op_session.AddOutput<Y_T>(name.c_str(), y_infos[data_index].shape.AsShapeVector(), data);
}
}
op_session.Run();
return op_session.GetFetches();
}
template <typename X_T, typename Y_T, typename JAC_T>
inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeTheoreticalJacobianTranspose(
const OpDef& op_def,
const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos,
std::vector<std::vector<X_T>>* x_datas,
std::vector<std::vector<Y_T>>* y_datas,
std::vector<std::vector<JAC_T>>* jacobian_ts,
const std::vector<AttributeProto>& attributes,
bool add_shape,
const OpDef& op_def, const std::vector<TensorInfo>& x_infos, const std::vector<TensorInfo>& y_infos,
std::vector<std::vector<X_T>>* x_datas, std::vector<std::vector<Y_T>>* y_datas,
std::vector<std::vector<JAC_T>>* jacobian_ts, const std::vector<size_t>& row_strides,
const std::vector<size_t>& col_strides, const std::vector<AttributeProto>& attributes, bool add_shape,
std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers /* nullptr*/) {
size_t y_num = y_infos.size();
size_t x_num = x_infos.size();
// build the graph once and reuse it later in the looping logic
GradientOpTester op_session(op_def.type.c_str(), x_infos, y_infos, op_def.opset_version, op_def.domain.c_str(), false);
GradientOpTester op_session(op_def.type.c_str(), x_infos, y_infos, op_def.opset_version, op_def.domain.c_str(),
false);
op_session.AddShapeToTensorData(add_shape);
ORT_RETURN_IF_ERROR(InitOpTesterWithGradGraph(op_session, x_infos, y_infos, x_datas, y_datas, attributes));
// currently only supported scalar valued fns - and complex types are not supported
for (int y_idx = 0; y_idx < static_cast<int>(y_num); y_idx++) { // for each dy input
for (size_t y_idx = 0; y_idx < y_num; y_idx++) { // for each dy input
if (!y_infos[y_idx].has_gradient) {
continue;
}
const size_t dy_size = y_infos[y_idx].shape.Size();
const size_t dy_size = static_cast<size_t>(y_infos[y_idx].shape.Size());
// Compute the theoretical Jacobians one row at a time by back propagating
// '1.0' for each element of 'dy', while holding all other elements of 'dy' at zero.
for (size_t c = 0; c < dy_size; ++c) { // for each value in the dy input vector
// clear OpTester input/output/initializer
op_session.ClearData();
for (size_t data_index = 0; data_index < x_num; data_index++) {
std::string name = "input" + std::to_string(data_index);
const std::vector<X_T>& data = (*x_datas)[data_index];
if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType<int64_t>()) {
std::vector<int64_t> int64_data(data.size());
std::transform(data.begin(), data.end(), int64_data.begin(), [](X_T x) { return static_cast<int64_t>(x); });
op_session.AddInput<int64_t>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), int64_data);
} else if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType<int32_t>()) {
std::vector<int32_t> int32_data(data.size());
std::transform(data.begin(), data.end(), int32_data.begin(), [](X_T x) { return static_cast<int32_t>(x); });
op_session.AddInput<int32_t>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), int32_data);
} else if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType<bool>()) {
std::unique_ptr<bool[]> p_data(new bool[data.size()]);
for (size_t i = 0; i < data.size(); ++i) {
p_data[i] = static_cast<bool>(data[i]);
}
op_session.AddInput<bool>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), p_data.get(), data.size());
} else {
op_session.AddInput<X_T>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), data);
}
}
for (size_t data_index = 0; data_index < y_num; data_index++) {
std::string name = "output" + std::to_string(data_index);
op_session.AddOutput<Y_T>(name.c_str(), y_infos[data_index].shape.AsShapeVector(), (*y_datas)[data_index]);
}
AddDatas(op_session, x_infos, y_infos, x_datas, y_datas);
// While calculating the theoretical Jacobian transpose, we compute the gradient by
// back-propagating one element of dY at a time and setting everything else to zero
@ -169,98 +176,45 @@ inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeTheoreticalJacobianTransp
// inputs is treated as a vector of vectors. The parameters of the function call below, y_idx and c
// correspond to which input (dy1, dy2, etc.) and which value of that input (dy_flattened_vector[c])
// to perturb to 1.
op_session.Run(y_idx, static_cast<int>(c), OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, execution_providers);
if (execution_providers) {
op_session.Run(static_cast<int>(y_idx), static_cast<int>(c), OpTester::ExpectResult::kExpectSuccess, "", {},
nullptr, execution_providers);
} else {
// If no EPs are specified, OpTester runs on every available EP and keeps only the outputs of the last run
// as the actual output data, which wastes time. So when the caller doesn't pass in any EPs, we use the
// default EPs for the current build environment.
std::vector<std::unique_ptr<IExecutionProvider>> default_eps = GetExecutionProviders();
op_session.Run(static_cast<int>(y_idx), static_cast<int>(c), OpTester::ExpectResult::kExpectSuccess, "", {},
nullptr, &default_eps);
}
auto gradients = op_session.GetFetches();
for (int x_idx = 0, grad_idx = 0; x_idx < static_cast<int>(x_num); x_idx++) {
for (size_t x_idx = 0, grad_idx = 0; x_idx < x_num; x_idx++) {
if (!x_infos[x_idx].has_gradient) {
continue;
}
const int64_t x_size = x_infos[x_idx].shape.Size();
const size_t x_size = static_cast<size_t>(x_infos[x_idx].shape.Size());
auto dx_flat = gradients[grad_idx].Get<Tensor>().Data<X_T>();
grad_idx++;
for (int r = 0; r < static_cast<int>(x_size); ++r) {
auto calc_index = CalculateJacobianTransposeIndex(
x_infos,
x_idx,
r,
y_infos,
y_idx,
static_cast<int>(c));
(*jacobian_ts)[calc_index.first][calc_index.second] = dx_flat[r];
for (size_t r = 0; r < x_size; ++r) {
(*jacobian_ts)[row_strides[x_idx] + r][col_strides[y_idx] + c] = dx_flat[r];
}
}
}
}
return Status::OK();
}
template <typename X_T, typename Y_T, typename JAC_T>
inline Status GradientChecker<X_T, Y_T, JAC_T>::InitOpTesterWithGraph(
OpTester& op_session,
const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos,
std::vector<std::vector<X_T>>* x_datas,
std::vector<std::vector<Y_T>>* y_datas,
OpTester& op_session, const std::vector<TensorInfo>& x_infos, const std::vector<TensorInfo>& y_infos,
std::vector<std::vector<X_T>>* x_datas, std::vector<std::vector<Y_T>>* y_datas,
const std::vector<AttributeProto>& attributes,
const std::unordered_map<std::string, int>& extra_domain_to_version) {
for (size_t data_index = 0; data_index < x_datas->size(); data_index++) {
std::string name = "input" + std::to_string(data_index);
const std::vector<X_T>& data = (*x_datas)[data_index];
if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType<int64_t>()) {
std::vector<int64_t> int64_data(data.size());
std::transform(data.begin(), data.end(), int64_data.begin(), [](X_T x) { return static_cast<int64_t>(x); });
op_session.AddInput<int64_t>(name.c_str(),
x_infos[data_index].shape.AsShapeVector(),
int64_data,
false,
&x_infos[data_index].dim_params);
} else if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType<int32_t>()) {
std::vector<int32_t> int32_data(data.size());
std::transform(data.begin(), data.end(), int32_data.begin(), [](X_T x) { return static_cast<int32_t>(x); });
op_session.AddInput<int32_t>(name.c_str(),
x_infos[data_index].shape.AsShapeVector(),
int32_data,
false,
&x_infos[data_index].dim_params);
} else if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType<bool>()) {
std::unique_ptr<bool[]> p_data(new bool[data.size()]);
for (size_t i = 0; i < data.size(); ++i) {
p_data[i] = static_cast<bool>(data[i]);
}
op_session.AddInput<bool>(name.c_str(),
x_infos[data_index].shape.AsShapeVector(),
p_data.get(),
data.size(),
false,
&x_infos[data_index].dim_params);
} else {
op_session.AddInput<X_T>(name.c_str(),
x_infos[data_index].shape.AsShapeVector(),
data,
false,
&x_infos[data_index].dim_params);
}
}
for (size_t data_index = 0; data_index < y_infos.size(); data_index++) {
std::string name = "output" + std::to_string(data_index);
const std::vector<Y_T>& data = (*y_datas)[data_index];
if (y_infos[data_index].data_type == DataTypeImpl::GetTensorType<int64_t>()) {
std::vector<int64_t> int64_data(data.size());
std::transform(data.begin(), data.end(), int64_data.begin(), [](Y_T x) { return static_cast<int64_t>(x); });
op_session.AddOutput<int64_t>(name.c_str(),
y_infos[data_index].shape.AsShapeVector(),
int64_data);
} else {
op_session.AddOutput<Y_T>(name.c_str(), y_infos[data_index].shape.AsShapeVector(), data);
}
}
AddDatas(op_session, x_infos, y_infos, x_datas, y_datas);
// Currently only allows setting int attributes to zero. TODO: Expand this
for (auto attr : attributes) {
op_session.AddAttributeProto(attr);
@ -291,15 +245,12 @@ inline Status GradientChecker<X_T, Y_T, JAC_T>::InitOpTesterWithGraph(
template <typename X_T, typename Y_T, typename JAC_T>
inline Status GradientChecker<X_T, Y_T, JAC_T>::InitOpTesterWithGradGraph(
OpTester& op_session,
const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos,
std::vector<std::vector<X_T>>* x_datas,
std::vector<std::vector<Y_T>>* y_datas,
OpTester& op_session, const std::vector<TensorInfo>& x_infos, const std::vector<TensorInfo>& y_infos,
std::vector<std::vector<X_T>>* x_datas, std::vector<std::vector<Y_T>>* y_datas,
const std::vector<AttributeProto>& attributes) {
std::unordered_map<std::string, int> extra_domain_to_version{{kMSDomain, 1}, {kOnnxDomain, 9}};
ORT_RETURN_IF_ERROR(InitOpTesterWithGraph(op_session, x_infos, y_infos, x_datas, y_datas, attributes,
extra_domain_to_version));
ORT_RETURN_IF_ERROR(
InitOpTesterWithGraph(op_session, x_infos, y_infos, x_datas, y_datas, attributes, extra_domain_to_version));
// build grad graph
auto p_model = op_session.GetModelCache();
auto& graph = p_model->MainGraph();
@ -320,11 +271,7 @@ inline Status GradientChecker<X_T, Y_T, JAC_T>::InitOpTesterWithGradGraph(
training::GradientGraphConfiguration gradient_graph_config;
gradient_graph_config.set_gradients_as_graph_outputs = true;
training::GradientGraphBuilder grad_graph_builder(&graph,
dy_values,
weights_to_train,
"",
gradient_graph_config,
training::GradientGraphBuilder grad_graph_builder(&graph, dy_values, weights_to_train, "", gradient_graph_config,
logging::LoggingManager::DefaultLogger());
Status status = grad_graph_builder.Build();
EXPECT_TRUE(status.IsOK()) << status.ErrorMessage();
@ -334,15 +281,10 @@ inline Status GradientChecker<X_T, Y_T, JAC_T>::InitOpTesterWithGradGraph(
template <typename X_T, typename Y_T, typename JAC_T>
inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeNumericJacobianTranspose(
const OpDef& op_def,
const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos,
const JAC_T delta,
std::vector<std::vector<X_T>>* x_datas,
std::vector<std::vector<Y_T>>* y_datas,
std::vector<std::vector<JAC_T>>* jacobian_ts,
const std::vector<AttributeProto>& attributes,
bool add_shape) {
const OpDef& op_def, const std::vector<TensorInfo>& x_infos, const std::vector<TensorInfo>& y_infos,
const JAC_T delta, std::vector<std::vector<X_T>>* x_datas, std::vector<std::vector<Y_T>>* y_datas,
std::vector<std::vector<JAC_T>>* jacobian_ts, const std::vector<size_t>& row_strides,
const std::vector<size_t>& col_strides, const std::vector<AttributeProto>& attributes, bool add_shape) {
size_t y_num = y_infos.size();
size_t x_num = x_infos.size();
X_T x_delta = static_cast<X_T>(delta);
@ -352,17 +294,17 @@ inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeNumericJacobianTranspose(
op_session.AddShapeToTensorData(add_shape);
ORT_RETURN_IF_ERROR(InitOpTesterWithGraph(op_session, x_infos, y_infos, x_datas, y_datas, attributes));
for (int x_idx = 0; x_idx < static_cast<int>(x_num); x_idx++) {
for (size_t x_idx = 0; x_idx < x_num; ++x_idx) {
if (!x_infos[x_idx].has_gradient) {
continue;
}
const int64_t x_size = x_infos[x_idx].shape.Size();
const size_t x_size = static_cast<size_t>(x_infos[x_idx].shape.Size());
// Compute the numeric Jacobian one column at a time by perturbing each
// element of 'x_data' (positively and negatively) by 'delta', and
// updating the jacobian with the centered difference
for (int r = 0; r < x_size; ++r) {
for (size_t r = 0; r < x_size; ++r) {
// Store current value of 'x' at 'r'.
X_T v = (*x_datas)[x_idx][r];
@ -374,89 +316,55 @@ inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeNumericJacobianTranspose(
(*x_datas)[x_idx][r] = v - x_delta;
std::vector<OrtValue> y_minus = EvaluateFunctionAtInput(op_session, x_infos, y_infos, x_datas, y_datas);
for (int y_idx = 0; y_idx < static_cast<int>(y_num); y_idx++) {
for (size_t y_idx = 0; y_idx < y_num; ++y_idx) {
if (!y_infos[y_idx].has_gradient) {
continue;
}
// Compute element-wise centered difference and store in each Jacobian.
auto y_plus_flat = y_plus[y_idx].Get<Tensor>().Data<Y_T>();
auto y_minus_flat = y_minus[y_idx].Get<Tensor>().Data<Y_T>();
const int64_t y_size = y_infos[y_idx].shape.Size();
const size_t y_size = static_cast<size_t>(y_infos[y_idx].shape.Size());
const Y_T scale = static_cast<Y_T>(2 * delta);
for (int c = 0; c < y_size; ++c) {
auto calc_index = CalculateJacobianTransposeIndex(
x_infos,
x_idx,
r,
y_infos,
y_idx,
c);
(*jacobian_ts)[calc_index.first][calc_index.second] = (y_plus_flat[c] - y_minus_flat[c]) / scale;
for (size_t c = 0; c < y_size; ++c) {
(*jacobian_ts)[row_strides[x_idx] + r][col_strides[y_idx] + c] = (y_plus_flat[c] - y_minus_flat[c]) / scale;
}
}
// Restore pre-perturbation value.
(*x_datas)[x_idx][r] = v;
}
}
return Status::OK();
}
//// The Jacobian is always a real-valued matrix.
//// Given y = f(x) for tensors y and x, it contains the derivatives dy_i/dx_j for
//// every pair y_i in y and x_j in x. Note that the Jacobian is defined directly
//// over the elements of tensors y and x, and doesn't depend on their shapes.
////
//// If x = (x_1, x_2, ..., x_m) and y = (y_1, y_2, .., y_n) the matrix evaluated
//// is actually the Jacobian transpose, defined as this mxn matrix:
//// dy_1/d_x1 dy_2/dx_1 ... dy_n/dx_1
//// dy_1/dx_2 dy_2/dx_2 ... dy_n/dx_2
//// .
//// .
//// .
//// dy_1/dx_m dy_2/dx_m ... dy_n/dx_m
/// Sizes `jacobians` as a zero-filled Jacobian-transpose matrix for the given inputs and outputs.
///
/// @param x_infos    Input tensor descriptions; the row count is the sum of their element counts.
/// @param y_infos    Output tensor descriptions; the column count is the sum of their element counts.
/// @param jacobians  Matrix to (re)initialize; any previous contents are discarded.
/// @return Status::OK() always.
template <typename X_T, typename Y_T, typename JAC_T>
inline Status GradientChecker<X_T, Y_T, JAC_T>::InitJacobians(
    const std::vector<TensorInfo>& x_infos,
    const std::vector<TensorInfo>& y_infos,
    std::vector<std::vector<JAC_T>>* jacobians) {
  // The number of rows is the total number of scalar input values across all input tensors.
  // Note: 'S'ize gives the total number of elements in all dims while 's'ize just gives num_dims.
  int64_t rows = 0;
  for (const TensorInfo& x_info : x_infos) {
    rows += x_info.shape.Size();
  }

  // The number of cols is the total number of scalar output values across all output tensors.
  int64_t cols = 0;
  for (const TensorInfo& y_info : y_infos) {
    cols += y_info.shape.Size();
  }

  // Narrow straight to size_t (the type the container expects) instead of going through int,
  // and build every zero-filled row in a single assign.
  jacobians->assign(gsl::narrow_cast<size_t>(rows),
                    std::vector<JAC_T>(gsl::narrow_cast<size_t>(cols), static_cast<JAC_T>(0)));

  // OK() is a static factory; call it on the type rather than on a temporary Status.
  return Status::OK();
}
template <typename X_T, typename Y_T, typename JAC_T>
inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeGradientErrorInternal(
const OpDef& op_def,
const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos,
std::vector<std::vector<X_T>>* x_datas,
std::vector<std::vector<Y_T>>* y_datas,
JAC_T* max_error,
const std::vector<AttributeProto>& attributes,
bool check_not_have_gradient,
bool check_not_have_shape_inferencing,
const OpDef& op_def, const std::vector<TensorInfo>& x_infos, const std::vector<TensorInfo>& y_infos,
std::vector<std::vector<X_T>>* x_datas, std::vector<std::vector<Y_T>>* y_datas, JAC_T* max_error,
const std::vector<AttributeProto>& attributes, bool check_not_have_gradient, bool check_not_have_shape_inferencing,
std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers /* nullptr */) {
std::vector<size_t> row_strides(x_infos.size());
std::vector<size_t> col_strides(y_infos.size());
size_t row_count = 0;
for (size_t i = 0; i < x_infos.size(); ++i) {
row_strides[i] = row_count;
row_count += static_cast<size_t>(x_infos[i].shape.Size());
}
size_t col_count = 0;
for (size_t i = 0; i < y_infos.size(); ++i) {
col_strides[i] = col_count;
col_count += static_cast<size_t>(y_infos[i].shape.Size());
}
// Initialize numeric Jacobian to zeros.
std::vector<std::vector<JAC_T>> jacobian_ns;
ORT_RETURN_IF_ERROR(InitJacobians(x_infos, y_infos, &jacobian_ns));
InitJacobians(row_count, col_count, &jacobian_ns);
// Compute numeric Jacobian.
ORT_RETURN_IF_ERROR(ComputeNumericJacobianTranspose(
op_def, x_infos, y_infos, JAC_T{1e-3f}, x_datas, y_datas, &jacobian_ns, attributes));
ORT_RETURN_IF_ERROR(ComputeNumericJacobianTranspose(op_def, x_infos, y_infos, JAC_T{1e-3f}, x_datas, y_datas,
&jacobian_ns, row_strides, col_strides, attributes));
// Compute the maximum error between theoretical and numeric Jacobians.
*max_error = 0.0;
@ -471,37 +379,42 @@ inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeGradientErrorInternal(
for (size_t x_gradient_variation = 0; x_gradient_variation < total_gradient_variations; x_gradient_variation++) {
// Initialize theoretical Jacobians to zeros.
std::vector<std::vector<JAC_T>> jacobian_ts;
ORT_RETURN_IF_ERROR(InitJacobians(x_infos, y_infos, &jacobian_ts));
InitJacobians(row_count, col_count, &jacobian_ts);
std::vector<TensorInfo> x_infos_gradient_variation = x_infos;
if (check_not_have_gradient && x_gradient_variation < x_infos.size())
if (check_not_have_gradient && x_gradient_variation < x_infos.size()) {
x_infos_gradient_variation[x_gradient_variation].has_gradient = false;
}
// a gradient node cannot get created without any has_gradient node.
if (std::all_of(x_infos_gradient_variation.cbegin(), x_infos_gradient_variation.cend(),
[](const TensorInfo& info) { return !info.has_gradient; }))
// a gradient node cannot get created without any has_gradient node.
[](const TensorInfo& info) { return !info.has_gradient; })) {
continue;
}
// Compute theoretical Jacobian.
ORT_RETURN_IF_ERROR(ComputeTheoreticalJacobianTranspose(
op_def, x_infos_gradient_variation, y_infos, x_datas, y_datas, &jacobian_ts, attributes, add_shape, execution_providers));
ORT_RETURN_IF_ERROR(ComputeTheoreticalJacobianTranspose(op_def, x_infos_gradient_variation, y_infos, x_datas,
y_datas, &jacobian_ts, row_strides, col_strides,
attributes, add_shape, execution_providers));
// We have numeric jacobians regardless of has_gradient (computed once).
// We only have theoretical jacobians for those has_gradient.
// Theoretical jacobians are 0 for those not has_gradient.
int64_t j = 0;
size_t j = 0;
for (auto& x_info : x_infos_gradient_variation) {
const size_t x_size = static_cast<size_t>(x_info.shape.Size());
if (!x_info.has_gradient) {
// TODO: These 4 tests fail at the following ORT_ENFORCE; investigate before enabling it.
//GradientCheckerTest.MatMulGrad
//GradientCheckerTest.GemmGrad
//GradientCheckerTest.GatherNDGrad_repeat_float_data
//GradientCheckerTest.GatherNDGrad_unique_float_data
//auto jac_t = jacobian_ts[j];
//ORT_ENFORCE(std::all_of(
// GradientCheckerTest.MatMulGrad
// GradientCheckerTest.GemmGrad
// GradientCheckerTest.GatherNDGrad_repeat_float_data
// GradientCheckerTest.GatherNDGrad_unique_float_data
// auto jac_t = jacobian_ts[j];
// ORT_ENFORCE(std::all_of(
// &jac_t[0], &jac_t[0] + x_info.shape.Size(), [](auto dx) { return dx == 0; }));
j += x_info.shape.Size();
j += x_size;
} else {
for (int r = 0; r < x_info.shape.Size(); j++, r++) {
for (size_t r = 0; r < x_size; j++, r++) {
auto jac_t = jacobian_ts[j];
auto jac_n = jacobian_ns[j];
for (size_t k = 0; k < jac_t.size(); k++) {
@ -520,20 +433,16 @@ inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeGradientErrorInternal(
}
}
}
return Status::OK();
}
template <typename X_T, typename Y_T, typename JAC_T>
inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeGradientError(
const OpDef& op_def,
const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos,
JAC_T* max_error,
const std::vector<AttributeProto>& attributes,
bool check_not_have_gradient, /* = true*/
const OpDef& op_def, const std::vector<TensorInfo>& x_infos, const std::vector<TensorInfo>& y_infos,
JAC_T* max_error, const std::vector<AttributeProto>& attributes, bool check_not_have_gradient, /* = true*/
bool check_not_have_shape_inferencing /* = false*/,
std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers /* = nullptr */) {
// TODO: Consider varying mean and variance
float scale = 5.f;
float mean = 0.f;
@ -544,7 +453,7 @@ inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeGradientError(
// Initialize 'x_datas' to random values.
std::vector<std::vector<X_T>> x_datas(x_infos.size());
for (size_t i = 0; i < x_infos.size(); i++) {
x_datas[i].resize(x_infos[i].shape.Size());
x_datas[i].resize(static_cast<size_t>(x_infos[i].shape.Size()));
if (x_infos[i].transformer) {
auto transformer = *x_infos[i].transformer;
@ -555,45 +464,34 @@ inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeGradientError(
}
}
// Generate dummy placeholders with zero for y_datas
std::vector<std::vector<Y_T>> y_datas(y_infos.size());
for (size_t i = 0; i < y_infos.size(); i++) {
y_datas[i].resize(y_infos[i].shape.Size(), 0);
}
// Compute gradient error.
return ComputeGradientErrorInternal(op_def, x_infos, y_infos, &x_datas, &y_datas, max_error,
attributes, check_not_have_gradient, check_not_have_shape_inferencing, execution_providers);
return ComputeGradientError(op_def, x_infos, y_infos, max_error, x_datas, attributes, check_not_have_gradient,
check_not_have_shape_inferencing, execution_providers);
}
template <typename X_T, typename Y_T, typename JAC_T>
inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeGradientError(
const OpDef& op_def,
const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos,
JAC_T* max_error,
std::vector<std::vector<X_T>> x_datas,
const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes,
bool check_not_have_gradient, /* = true*/
const OpDef& op_def, const std::vector<TensorInfo>& x_infos, const std::vector<TensorInfo>& y_infos,
JAC_T* max_error, std::vector<std::vector<X_T>> x_datas,
const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes, bool check_not_have_gradient, /* = true*/
bool check_not_have_shape_inferencing /* = false*/,
std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers /* = nullptr */) {
// Generate dummy placeholders with zero for y_datas
std::vector<std::vector<Y_T>> y_datas(y_infos.size());
for (size_t i = 0; i < y_infos.size(); i++) {
y_datas[i].resize(y_infos[i].shape.Size(), 0);
y_datas[i].resize(static_cast<size_t>(y_infos[i].shape.Size()), 0);
}
// Compute gradient error.
return ComputeGradientErrorInternal(op_def, x_infos, y_infos, &x_datas, &y_datas, max_error,
attributes, check_not_have_gradient, check_not_have_shape_inferencing, execution_providers);
return ComputeGradientErrorInternal(op_def, x_infos, y_infos, &x_datas, &y_datas, max_error, attributes,
check_not_have_gradient, check_not_have_shape_inferencing, execution_providers);
}
#define INSTANTIATE_GRAD_ERR_TYPE(X_T, Y_T, JAC_T) \
template class GradientChecker<X_T, Y_T, JAC_T>;
#define INSTANTIATE_GRAD_ERR_TYPE(X_T, Y_T, JAC_T) template class GradientChecker<X_T, Y_T, JAC_T>;
INSTANTIATE_GRAD_ERR_TYPE(float, float, float);
INSTANTIATE_GRAD_ERR_TYPE(double, double, double);
#undef INSTANTIATE_GRAD_ERR_TYPE
} // namespace test
} // namespace onnxruntime

View file

@ -23,8 +23,7 @@ namespace onnxruntime {
namespace test {
struct TensorInfo {
TensorInfo(std::initializer_list<int64_t> shape_init,
bool has_gradient = true,
TensorInfo(std::initializer_list<int64_t> shape_init, bool has_gradient = true,
std::function<float(float)>* transformer = nullptr,
MLDataType data_type = DataTypeImpl::GetTensorType<float>(),
const std::vector<std::string>& dim_params = std::vector<std::string>{})
@ -34,9 +33,7 @@ struct TensorInfo {
data_type(data_type),
dim_params(dim_params) {}
TensorInfo(const TensorShape& shape,
bool has_gradient = true,
std::function<float(float)>* transformer = nullptr,
TensorInfo(const TensorShape& shape, bool has_gradient = true, std::function<float(float)>* transformer = nullptr,
MLDataType data_type = DataTypeImpl::GetTensorType<float>())
: shape(shape), has_gradient(has_gradient), transformer(transformer), data_type(data_type) {}
@ -66,89 +63,71 @@ class GradientChecker {
///
/// if y = Square(x), where x (and so y) are DT_DOUBLE,
/// <X_T, Y_T, JAC_T> should be <double, double, double>
Status ComputeGradientError(
const training::OpDef& op_def,
const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos,
JAC_T* max_error,
const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes = {},
// TODO: Ideally the not-has_gradient cases should always be checked, but some tests
// fail because the gradient op does not handle that case. This flag exists so
// those tests can disable the check and still pass.
// Remove this flag when the gradient op is fixed.
bool check_not_have_gradient = true,
// When true, also check the op's gradient builder for cases where input shapes are not available.
bool check_not_have_shape_inferencing = false,
std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers = nullptr);
Status ComputeGradientError(const training::OpDef& op_def, const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos, JAC_T* max_error,
const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes = {},
// TODO: Ideally the not-has_gradient cases should always be checked, but some tests
// fail because the gradient op does not handle that case. This flag exists so
// those tests can disable the check and still pass.
// Remove this flag when the gradient op is fixed.
bool check_not_have_gradient = true,
// When true, also check the op's gradient builder for cases where input shapes are not available.
bool check_not_have_shape_inferencing = false,
std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers = nullptr);
// Overload that additionally takes the input values `x_datas` from the caller
// (passed by value, one inner vector per entry). All other parameters and
// defaults match the overload without `x_datas`.
// NOTE(review): presumably x_datas[i] must hold as many elements as x_infos[i].shape — confirm.
Status ComputeGradientError(
const training::OpDef& op_def,
const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos,
JAC_T* max_error,
std::vector<std::vector<X_T>> x_datas,
const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes = {},
// TODO: Ideally the not-has_gradient cases should always be checked, but some tests
// fail because the gradient op does not handle that case. This flag exists so
// those tests can disable the check and still pass.
// Remove this flag when the gradient op is fixed.
bool check_not_have_gradient = true,
// When true, also check the op's gradient builder for cases where input shapes are not available.
bool check_not_have_shape_inferencing = false,
std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers = nullptr);
Status ComputeGradientError(const training::OpDef& op_def, const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos, JAC_T* max_error,
std::vector<std::vector<X_T>> x_datas,
const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes = {},
// TODO: Ideally the not-has_gradient cases should always be checked, but some tests
// fail because the gradient op does not handle that case. This flag exists so
// those tests can disable the check and still pass.
// Remove this flag when the gradient op is fixed.
bool check_not_have_gradient = true,
// When true, also check the op's gradient builder for cases where input shapes are not available.
bool check_not_have_shape_inferencing = false,
std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers = nullptr);
private:
// Prepares the `jacobians` output container.
// Two spellings appear here: one takes x_infos/y_infos and returns a Status,
// the other takes an explicit row_count/col_count and returns nothing.
// NOTE(review): presumably sizes and zero-fills the matrices — confirm against the definition.
Status InitJacobians(const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos,
std::vector<std::vector<JAC_T>>* jacobians);
void InitJacobians(size_t row_count, size_t col_count, std::vector<std::vector<JAC_T>>* jacobians);
// EvaluateFunctionAtInput takes an OpTester plus the x/y tensor infos and data
// buffers and returns a std::vector<OrtValue>. AddDatas takes the same infos and
// data buffers together with an OpTester (named op_session) and returns nothing.
// NOTE(review): presumably AddDatas registers the buffers on the tester and
// EvaluateFunctionAtInput runs it and returns the fetched outputs — confirm
// against the definitions.
std::vector<OrtValue> EvaluateFunctionAtInput(OpTester& op_tester,
const std::vector<TensorInfo>& x_infos,
void AddDatas(OpTester& op_session, const std::vector<TensorInfo>& x_infos, const std::vector<TensorInfo>& y_infos,
std::vector<std::vector<X_T>>* x_datas, std::vector<std::vector<Y_T>>* y_datas);
std::vector<OrtValue> EvaluateFunctionAtInput(OpTester& op_tester, const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos,
std::vector<std::vector<X_T>>* x_datas,
std::vector<std::vector<Y_T>>* y_datas);
// Initializes `op_tester` from the tensor infos, data buffers and op attributes,
// reporting success or failure as a Status.
// `extra_domain_to_version` defaults to an empty map.
// NOTE(review): presumably maps additional opset domains to versions for the
// graph being built — confirm against the definition.
Status InitOpTesterWithGraph(OpTester& op_tester,
const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos,
std::vector<std::vector<X_T>>* x_datas,
Status InitOpTesterWithGraph(OpTester& op_tester, const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos, std::vector<std::vector<X_T>>* x_datas,
std::vector<std::vector<Y_T>>* y_datas,
const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes,
const std::unordered_map<std::string, int>& extra_domain_to_version = {});
// Counterpart of InitOpTesterWithGraph with the same leading parameters but no
// extra_domain_to_version argument; returns a Status.
// NOTE(review): presumably sets `op_tester` up with the gradient graph rather
// than the forward graph — confirm against the definition.
Status InitOpTesterWithGradGraph(OpTester& op_tester,
const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos,
std::vector<std::vector<X_T>>* x_datas,
Status InitOpTesterWithGradGraph(OpTester& op_tester, const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos, std::vector<std::vector<X_T>>* x_datas,
std::vector<std::vector<Y_T>>* y_datas,
const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes);
// Writes its result into `jacobian_ts` and reports success or failure as a Status.
// The second spelling below additionally takes `row_strides` and `col_strides`.
// Defaults: add_shape = true, execution_providers = nullptr.
// NOTE(review): presumably this is the Jacobian transpose obtained from the
// op's gradient graph, and the strides index into the flattened Jacobian —
// confirm against the definition.
Status ComputeTheoreticalJacobianTranspose(const training::OpDef& op_def,
const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos,
std::vector<std::vector<X_T>>* x_datas,
std::vector<std::vector<Y_T>>* y_datas,
std::vector<std::vector<JAC_T>>* jacobian_ts,
const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes,
bool add_shape = true,
std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers = nullptr);
Status ComputeTheoreticalJacobianTranspose(
const training::OpDef& op_def, const std::vector<TensorInfo>& x_infos, const std::vector<TensorInfo>& y_infos,
std::vector<std::vector<X_T>>* x_datas, std::vector<std::vector<Y_T>>* y_datas,
std::vector<std::vector<JAC_T>>* jacobian_ts, const std::vector<size_t>& row_strides,
const std::vector<size_t>& col_strides, const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes,
bool add_shape = true, std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers = nullptr);
// Writes its result into `jacobian_ts` and reports success or failure as a Status.
// Unlike ComputeTheoreticalJacobianTranspose it takes a `delta` value and has no
// execution_providers parameter; add_shape defaults to true.
// NOTE(review): presumably `delta` is the finite-difference perturbation applied
// to each input element — confirm against the definition.
Status ComputeNumericJacobianTranspose(const training::OpDef& op_def,
const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos,
const JAC_T delta,
std::vector<std::vector<X_T>>* x_datas,
std::vector<std::vector<Y_T>>* y_datas,
Status ComputeNumericJacobianTranspose(const training::OpDef& op_def, const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos, const JAC_T delta,
std::vector<std::vector<X_T>>* x_datas, std::vector<std::vector<Y_T>>* y_datas,
std::vector<std::vector<JAC_T>>* jacobian_ts,
const std::vector<size_t>& row_strides, const std::vector<size_t>& col_strides,
const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes,
bool add_shape = true);
Status ComputeGradientErrorInternal(const training::OpDef& op_name,
const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos,
std::vector<std::vector<X_T>>* x_datas,
std::vector<std::vector<Y_T>>* y_datas,
JAC_T* max_error,
Status ComputeGradientErrorInternal(const training::OpDef& op_name, const std::vector<TensorInfo>& x_infos,
const std::vector<TensorInfo>& y_infos, std::vector<std::vector<X_T>>* x_datas,
std::vector<std::vector<Y_T>>* y_datas, JAC_T* max_error,
const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes,
bool check_not_have_gradient = true,
bool check_not_have_shape_inferencing = false,

File diff suppressed because it is too large Load diff