diff --git a/orttraining/orttraining/test/gradient/gradient_checker.cc b/orttraining/orttraining/test/gradient/gradient_checker.cc
index 417fc2861e..6144bc80eb 100644
--- a/orttraining/orttraining/test/gradient/gradient_checker.cc
+++ b/orttraining/orttraining/test/gradient/gradient_checker.cc
@@ -15,18 +15,38 @@ limitations under the License.
 
 /* Modifications Copyright (c) Microsoft. */
 
-#include "gradient_checker.h"
-#include "gradient_op_test_utils.h"
+#include "orttraining/test/gradient/gradient_checker.h"
+
+#include <random>
+#include "orttraining/test/gradient/gradient_op_test_utils.h"
 #include "orttraining/core/framework/gradient_graph_builder.h"
 #include "orttraining/core/graph/gradient_config.h"
 #include "test/util/include/test_random_seed.h"
-#include <random>
+#include "test/util/include/default_providers.h"
+
 namespace onnxruntime {
 namespace test {
 
 using ONNX_NAMESPACE::AttributeProto;
 using training::OpDef;
 
+namespace {
+// Builds the EP list handed to OpTester::Run: CPU first, then CUDA/ROCm (when compiled in) unless cpu_only is set.
+std::vector<std::unique_ptr<IExecutionProvider>> GetExecutionProviders(bool cpu_only = false) {
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultCpuExecutionProvider());
+  if (cpu_only) return execution_providers;
+#ifdef USE_CUDA
+  execution_providers.push_back(DefaultCudaExecutionProvider());
+#endif
+#ifdef USE_ROCM
+  execution_providers.push_back(DefaultRocmExecutionProvider());
+#endif
+  return execution_providers;
+}
+
+}  // namespace
+
 // The jacobian transpose matrix is laid out as follows
 
 // Say there are three inputs each of size M X N, N X K, K X J
@@ -37,131 +57,118 @@ using training::OpDef;
 //       |                          N X K    |        |      |
 //       |                          K X J    |        |      |
 //       V
-
-std::pair<int, int> inline CalculateJacobianTransposeIndex(const std::vector<TensorInfo>& x_infos,
-                                                           int x_input_index,
-                                                           int x_flattened_index,
-                                                           const std::vector<TensorInfo>& y_infos,
-                                                           int y_output_index,
-                                                           int y_flattened_index) {
-  int64_t elems_in_prev_output_tensors = 0;
-  for (int i = 0; i < y_output_index; i++) {
-    elems_in_prev_output_tensors += y_infos[i].shape.Size();
+// The Jacobian is always a real-valued matrix.
+// Given y = f(x) for tensors y and x, it contains the derivatives dy_i/dx_j for
+// every pair y_i in y and x_j in x.  Note that the Jacobian is defined directly
+// over the elements of tensors y and x, and doesn't depend on their shapes.
+//
+// If x = (x_1, x_2, ..., x_m) and y = (y_1, y_2, ..., y_n) the matrix evaluated
+// is actually the Jacobian transpose, defined as this mxn matrix:
+// dy_1/dx_1 dy_2/dx_1 ... dy_n/dx_1
+// dy_1/dx_2 dy_2/dx_2 ... dy_n/dx_2
+//     .
+//     .
+//     .
+// dy_1/dx_m dy_2/dx_m ... dy_n/dx_m
+template <typename X_T, typename Y_T, typename JAC_T>
+inline void GradientChecker<X_T, Y_T, JAC_T>::InitJacobians(size_t row_count, size_t col_count,
+                                                            std::vector<std::vector<JAC_T>>* jacobians) {
+  // One row per scalar input element, summed over all input tensors.
+  jacobians->resize(row_count);
+  // One column per scalar output element, summed over all output tensors; every entry starts at zero.
+  for (size_t i = 0; i < row_count; ++i) {
+    (*jacobians)[i] = std::vector<JAC_T>(col_count, 0);
   }
-
-  int64_t col = elems_in_prev_output_tensors + y_flattened_index;
-
-  int64_t elems_in_prev_input_tensors = 0;
-  for (int i = 0; i < x_input_index; i++) {
-    elems_in_prev_input_tensors += x_infos[i].shape.Size();
-  }
-
-  int64_t row = elems_in_prev_input_tensors + x_flattened_index;
-
-  return {gsl::narrow_cast<int>(row), gsl::narrow_cast<int>(col)};
 }
 
 template <typename X_T, typename Y_T, typename JAC_T>
 inline std::vector<OrtValue> GradientChecker<X_T, Y_T, JAC_T>::EvaluateFunctionAtInput(
-    OpTester& op_session,
-    const std::vector<TensorInfo>& x_infos,
-    const std::vector<TensorInfo>& y_infos,
-    std::vector<std::vector<X_T>>* x_datas,
-    std::vector<std::vector<Y_T>>* y_datas) {
-  // clear OpTester input/output/initializer_index
-  op_session.ClearData();
+    OpTester& op_session, const std::vector<TensorInfo>& x_infos, const std::vector<TensorInfo>& y_infos,
+    std::vector<std::vector<X_T>>* x_datas, std::vector<std::vector<Y_T>>* y_datas) {
+  AddDatas(op_session, x_infos, y_infos, x_datas, y_datas);  // reload the tester with the current x/y data
 
-  for (size_t data_index = 0; data_index < x_datas->size(); data_index++) {
+  // Without an explicit EP list, OpTester runs on every available EP and keeps only the last run's outputs,
+  // which wastes time. Only the forward graph outputs are needed for the numeric Jacobian, so the CPU EP
+  // alone is enough.
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers = GetExecutionProviders(true);
+  op_session.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+  return op_session.GetFetches();
+}
+
+template <typename X_T, typename Y_T, typename JAC_T>
+inline void GradientChecker<X_T, Y_T, JAC_T>::AddDatas(OpTester& op_session, const std::vector<TensorInfo>& x_infos,
+                                                       const std::vector<TensorInfo>& y_infos,
+                                                       std::vector<std::vector<X_T>>* x_datas,
+                                                       std::vector<std::vector<Y_T>>* y_datas) {
+  op_session.ClearData();  // start from a clean tester: drop previously added inputs/outputs
+  for (size_t data_index = 0; data_index < x_datas->size(); ++data_index) {  // inputs are named "input<i>"
     std::string name = "input" + std::to_string(data_index);
     const std::vector<X_T>& data = (*x_datas)[data_index];
 
     if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType<int64_t>()) {
       std::vector<int64_t> int64_data(data.size());
       std::transform(data.begin(), data.end(), int64_data.begin(), [](X_T x) { return static_cast<int64_t>(x); });
-      op_session.AddInput<int64_t>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), int64_data);
+      op_session.AddInput<int64_t>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), int64_data, false,
+                                   &x_infos[data_index].dim_params);
     } else if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType<int32_t>()) {
       std::vector<int32_t> int32_data(data.size());
       std::transform(data.begin(), data.end(), int32_data.begin(), [](X_T x) { return static_cast<int32_t>(x); });
-      op_session.AddInput<int32_t>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), int32_data);
+      op_session.AddInput<int32_t>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), int32_data, false,
+                                   &x_infos[data_index].dim_params);
     } else if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType<bool>()) {
       std::unique_ptr<bool[]> p_data(new bool[data.size()]);
       for (size_t i = 0; i < data.size(); ++i) {
         p_data[i] = static_cast<bool>(data[i]);
       }
-      op_session.AddInput<bool>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), p_data.get(), data.size());
+      op_session.AddInput<bool>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), p_data.get(), data.size(),
+                                false, &x_infos[data_index].dim_params);
     } else {
-      op_session.AddInput<X_T>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), data);
+      op_session.AddInput<X_T>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), data, false,
+                               &x_infos[data_index].dim_params);
     }
   }
 
-  for (size_t data_index = 0; data_index < y_infos.size(); data_index++) {
+  for (size_t data_index = 0; data_index < y_infos.size(); ++data_index) {  // outputs are named "output<i>"
     std::string name = "output" + std::to_string(data_index);
-    op_session.AddOutput<Y_T>(name.c_str(), y_infos[data_index].shape.AsShapeVector(), (*y_datas)[data_index]);
+    const std::vector<Y_T>& data = (*y_datas)[data_index];
+
+    if (y_infos[data_index].data_type == DataTypeImpl::GetTensorType<int64_t>()) {
+      std::vector<int64_t> int64_data(data.size());  // int64 outputs are held as Y_T; convert before adding
+      std::transform(data.begin(), data.end(), int64_data.begin(), [](Y_T x) { return static_cast<int64_t>(x); });
+      op_session.AddOutput<int64_t>(name.c_str(), y_infos[data_index].shape.AsShapeVector(), int64_data);
+    } else {
+      op_session.AddOutput<Y_T>(name.c_str(), y_infos[data_index].shape.AsShapeVector(), data);
+    }
   }
-  op_session.Run();
-  return op_session.GetFetches();
 }
 
 template <typename X_T, typename Y_T, typename JAC_T>
 inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeTheoreticalJacobianTranspose(
-    const OpDef& op_def,
-    const std::vector<TensorInfo>& x_infos,
-    const std::vector<TensorInfo>& y_infos,
-    std::vector<std::vector<X_T>>* x_datas,
-    std::vector<std::vector<Y_T>>* y_datas,
-    std::vector<std::vector<JAC_T>>* jacobian_ts,
-    const std::vector<AttributeProto>& attributes,
-    bool add_shape,
+    const OpDef& op_def, const std::vector<TensorInfo>& x_infos, const std::vector<TensorInfo>& y_infos,
+    std::vector<std::vector<X_T>>* x_datas, std::vector<std::vector<Y_T>>* y_datas,
+    std::vector<std::vector<JAC_T>>* jacobian_ts, const std::vector<size_t>& row_strides,
+    const std::vector<size_t>& col_strides, const std::vector<AttributeProto>& attributes, bool add_shape,
     std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers /* nullptr*/) {
   size_t y_num = y_infos.size();
   size_t x_num = x_infos.size();
-
   // build the graph once and reuse it later in the looping logic
-  GradientOpTester op_session(op_def.type.c_str(), x_infos, y_infos, op_def.opset_version, op_def.domain.c_str(), false);
+  GradientOpTester op_session(op_def.type.c_str(), x_infos, y_infos, op_def.opset_version, op_def.domain.c_str(),
+                              false);
   op_session.AddShapeToTensorData(add_shape);
   ORT_RETURN_IF_ERROR(InitOpTesterWithGradGraph(op_session, x_infos, y_infos, x_datas, y_datas, attributes));
 
   // currently only supported scalar valued fns - and complex types are not supported
-  for (int y_idx = 0; y_idx < static_cast<int>(y_num); y_idx++) {  // for each dy input
+  for (size_t y_idx = 0; y_idx < y_num; y_idx++) {  // for each dy input
     if (!y_infos[y_idx].has_gradient) {
       continue;
     }
 
-    const size_t dy_size = y_infos[y_idx].shape.Size();
+    const size_t dy_size = static_cast<size_t>(y_infos[y_idx].shape.Size());
 
     // Compute the theoretical Jacobians one row at a time by back propagating
     // '1.0' for each element of 'dy', while holding all other elements of 'dy' at zero.
     for (size_t c = 0; c < dy_size; ++c) {  // for each value in the dy input vector
-      // clear OpTester input/output/initializer
-      op_session.ClearData();
-
-      for (size_t data_index = 0; data_index < x_num; data_index++) {
-        std::string name = "input" + std::to_string(data_index);
-        const std::vector<X_T>& data = (*x_datas)[data_index];
-
-        if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType<int64_t>()) {
-          std::vector<int64_t> int64_data(data.size());
-          std::transform(data.begin(), data.end(), int64_data.begin(), [](X_T x) { return static_cast<int64_t>(x); });
-          op_session.AddInput<int64_t>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), int64_data);
-        } else if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType<int32_t>()) {
-          std::vector<int32_t> int32_data(data.size());
-          std::transform(data.begin(), data.end(), int32_data.begin(), [](X_T x) { return static_cast<int32_t>(x); });
-          op_session.AddInput<int32_t>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), int32_data);
-        } else if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType<bool>()) {
-          std::unique_ptr<bool[]> p_data(new bool[data.size()]);
-          for (size_t i = 0; i < data.size(); ++i) {
-            p_data[i] = static_cast<bool>(data[i]);
-          }
-          op_session.AddInput<bool>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), p_data.get(), data.size());
-        } else {
-          op_session.AddInput<X_T>(name.c_str(), x_infos[data_index].shape.AsShapeVector(), data);
-        }
-      }
-
-      for (size_t data_index = 0; data_index < y_num; data_index++) {
-        std::string name = "output" + std::to_string(data_index);
-        op_session.AddOutput<Y_T>(name.c_str(), y_infos[data_index].shape.AsShapeVector(), (*y_datas)[data_index]);
-      }
+      AddDatas(op_session, x_infos, y_infos, x_datas, y_datas);
 
       // While calculating theoritical jacobian transpose we calculate the gradient by
       // setting back propogating one element of dY at a time and setting everything else to zero
@@ -169,98 +176,45 @@ inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeTheoreticalJacobianTransp
       // inputs is treated as a vector of vectors. The parameters of the function call below, y_idx and c
       // corresponding to which input (dy1, dy2..etc) and which value of the input (dy_flattened_vector[c]]
       // to pertrub to 1.
-
-      op_session.Run(y_idx, static_cast<int>(c), OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, execution_providers);
+      if (execution_providers) {
+        op_session.Run(static_cast<int>(y_idx), static_cast<int>(c), OpTester::ExpectResult::kExpectSuccess, "", {},
+                       nullptr, execution_providers);
+      } else {
+        // Without an explicit EP list, OpTester runs on every available EP and keeps only the last run's outputs,
+        // which wastes time. So when the caller passes no EPs, fall back to the default EPs for this build
+        // (CPU, plus CUDA/ROCm when enabled).
+        std::vector<std::unique_ptr<IExecutionProvider>> default_eps = GetExecutionProviders();
+        op_session.Run(static_cast<int>(y_idx), static_cast<int>(c), OpTester::ExpectResult::kExpectSuccess, "", {},
+                       nullptr, &default_eps);
+      }
       auto gradients = op_session.GetFetches();
 
-      for (int x_idx = 0, grad_idx = 0; x_idx < static_cast<int>(x_num); x_idx++) {
+      for (size_t x_idx = 0, grad_idx = 0; x_idx < x_num; x_idx++) {
         if (!x_infos[x_idx].has_gradient) {
           continue;
         }
 
-        const int64_t x_size = x_infos[x_idx].shape.Size();
+        const size_t x_size = static_cast<size_t>(x_infos[x_idx].shape.Size());
         auto dx_flat = gradients[grad_idx].Get<Tensor>().Data<X_T>();
         grad_idx++;
 
-        for (int r = 0; r < static_cast<int>(x_size); ++r) {
-          auto calc_index = CalculateJacobianTransposeIndex(
-              x_infos,
-              x_idx,
-              r,
-              y_infos,
-              y_idx,
-              static_cast<int>(c));
-          (*jacobian_ts)[calc_index.first][calc_index.second] = dx_flat[r];
+        for (size_t r = 0; r < x_size; ++r) {
+          (*jacobian_ts)[row_strides[x_idx] + r][col_strides[y_idx] + c] = dx_flat[r];
         }
       }
     }
   }
+
   return Status::OK();
 }
 
 template <typename X_T, typename Y_T, typename JAC_T>
 inline Status GradientChecker<X_T, Y_T, JAC_T>::InitOpTesterWithGraph(
-    OpTester& op_session,
-    const std::vector<TensorInfo>& x_infos,
-    const std::vector<TensorInfo>& y_infos,
-    std::vector<std::vector<X_T>>* x_datas,
-    std::vector<std::vector<Y_T>>* y_datas,
+    OpTester& op_session, const std::vector<TensorInfo>& x_infos, const std::vector<TensorInfo>& y_infos,
+    std::vector<std::vector<X_T>>* x_datas, std::vector<std::vector<Y_T>>* y_datas,
     const std::vector<AttributeProto>& attributes,
     const std::unordered_map<std::string, int>& extra_domain_to_version) {
-  for (size_t data_index = 0; data_index < x_datas->size(); data_index++) {
-    std::string name = "input" + std::to_string(data_index);
-    const std::vector<X_T>& data = (*x_datas)[data_index];
-
-    if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType<int64_t>()) {
-      std::vector<int64_t> int64_data(data.size());
-      std::transform(data.begin(), data.end(), int64_data.begin(), [](X_T x) { return static_cast<int64_t>(x); });
-      op_session.AddInput<int64_t>(name.c_str(),
-                                   x_infos[data_index].shape.AsShapeVector(),
-                                   int64_data,
-                                   false,
-                                   &x_infos[data_index].dim_params);
-    } else if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType<int32_t>()) {
-      std::vector<int32_t> int32_data(data.size());
-      std::transform(data.begin(), data.end(), int32_data.begin(), [](X_T x) { return static_cast<int32_t>(x); });
-      op_session.AddInput<int32_t>(name.c_str(),
-                                   x_infos[data_index].shape.AsShapeVector(),
-                                   int32_data,
-                                   false,
-                                   &x_infos[data_index].dim_params);
-    } else if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType<bool>()) {
-      std::unique_ptr<bool[]> p_data(new bool[data.size()]);
-      for (size_t i = 0; i < data.size(); ++i) {
-        p_data[i] = static_cast<bool>(data[i]);
-      }
-      op_session.AddInput<bool>(name.c_str(),
-                                x_infos[data_index].shape.AsShapeVector(),
-                                p_data.get(),
-                                data.size(),
-                                false,
-                                &x_infos[data_index].dim_params);
-    } else {
-      op_session.AddInput<X_T>(name.c_str(),
-                               x_infos[data_index].shape.AsShapeVector(),
-                               data,
-                               false,
-                               &x_infos[data_index].dim_params);
-    }
-  }
-
-  for (size_t data_index = 0; data_index < y_infos.size(); data_index++) {
-    std::string name = "output" + std::to_string(data_index);
-    const std::vector<Y_T>& data = (*y_datas)[data_index];
-
-    if (y_infos[data_index].data_type == DataTypeImpl::GetTensorType<int64_t>()) {
-      std::vector<int64_t> int64_data(data.size());
-      std::transform(data.begin(), data.end(), int64_data.begin(), [](Y_T x) { return static_cast<int64_t>(x); });
-      op_session.AddOutput<int64_t>(name.c_str(),
-                                    y_infos[data_index].shape.AsShapeVector(),
-                                    int64_data);
-    } else {
-      op_session.AddOutput<Y_T>(name.c_str(), y_infos[data_index].shape.AsShapeVector(), data);
-    }
-  }
+  AddDatas(op_session, x_infos, y_infos, x_datas, y_datas);
   // Currently only allows setting int attributes to zero. TODO: Expand this
   for (auto attr : attributes) {
     op_session.AddAttributeProto(attr);
@@ -291,15 +245,12 @@ inline Status GradientChecker<X_T, Y_T, JAC_T>::InitOpTesterWithGraph(
 
 template <typename X_T, typename Y_T, typename JAC_T>
 inline Status GradientChecker<X_T, Y_T, JAC_T>::InitOpTesterWithGradGraph(
-    OpTester& op_session,
-    const std::vector<TensorInfo>& x_infos,
-    const std::vector<TensorInfo>& y_infos,
-    std::vector<std::vector<X_T>>* x_datas,
-    std::vector<std::vector<Y_T>>* y_datas,
+    OpTester& op_session, const std::vector<TensorInfo>& x_infos, const std::vector<TensorInfo>& y_infos,
+    std::vector<std::vector<X_T>>* x_datas, std::vector<std::vector<Y_T>>* y_datas,
     const std::vector<AttributeProto>& attributes) {
   std::unordered_map<std::string, int> extra_domain_to_version{{kMSDomain, 1}, {kOnnxDomain, 9}};
-  ORT_RETURN_IF_ERROR(InitOpTesterWithGraph(op_session, x_infos, y_infos, x_datas, y_datas, attributes,
-                                            extra_domain_to_version));
+  ORT_RETURN_IF_ERROR(
+      InitOpTesterWithGraph(op_session, x_infos, y_infos, x_datas, y_datas, attributes, extra_domain_to_version));
   // build grad graph
   auto p_model = op_session.GetModelCache();
   auto& graph = p_model->MainGraph();
@@ -320,11 +271,7 @@ inline Status GradientChecker<X_T, Y_T, JAC_T>::InitOpTesterWithGradGraph(
 
   training::GradientGraphConfiguration gradient_graph_config;
   gradient_graph_config.set_gradients_as_graph_outputs = true;
-  training::GradientGraphBuilder grad_graph_builder(&graph,
-                                                    dy_values,
-                                                    weights_to_train,
-                                                    "",
-                                                    gradient_graph_config,
+  training::GradientGraphBuilder grad_graph_builder(&graph, dy_values, weights_to_train, "", gradient_graph_config,
                                                     logging::LoggingManager::DefaultLogger());
   Status status = grad_graph_builder.Build();
   EXPECT_TRUE(status.IsOK()) << status.ErrorMessage();
@@ -334,15 +281,10 @@ inline Status GradientChecker<X_T, Y_T, JAC_T>::InitOpTesterWithGradGraph(
 
 template <typename X_T, typename Y_T, typename JAC_T>
 inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeNumericJacobianTranspose(
-    const OpDef& op_def,
-    const std::vector<TensorInfo>& x_infos,
-    const std::vector<TensorInfo>& y_infos,
-    const JAC_T delta,
-    std::vector<std::vector<X_T>>* x_datas,
-    std::vector<std::vector<Y_T>>* y_datas,
-    std::vector<std::vector<JAC_T>>* jacobian_ts,
-    const std::vector<AttributeProto>& attributes,
-    bool add_shape) {
+    const OpDef& op_def, const std::vector<TensorInfo>& x_infos, const std::vector<TensorInfo>& y_infos,
+    const JAC_T delta, std::vector<std::vector<X_T>>* x_datas, std::vector<std::vector<Y_T>>* y_datas,
+    std::vector<std::vector<JAC_T>>* jacobian_ts, const std::vector<size_t>& row_strides,
+    const std::vector<size_t>& col_strides, const std::vector<AttributeProto>& attributes, bool add_shape) {
   size_t y_num = y_infos.size();
   size_t x_num = x_infos.size();
   X_T x_delta = static_cast<X_T>(delta);
@@ -352,17 +294,17 @@ inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeNumericJacobianTranspose(
   op_session.AddShapeToTensorData(add_shape);
   ORT_RETURN_IF_ERROR(InitOpTesterWithGraph(op_session, x_infos, y_infos, x_datas, y_datas, attributes));
 
-  for (int x_idx = 0; x_idx < static_cast<int>(x_num); x_idx++) {
+  for (size_t x_idx = 0; x_idx < x_num; ++x_idx) {
     if (!x_infos[x_idx].has_gradient) {
       continue;
     }
 
-    const int64_t x_size = x_infos[x_idx].shape.Size();
+    const size_t x_size = static_cast<size_t>(x_infos[x_idx].shape.Size());
 
     // Compute the numeric Jacobian one column at a time by perturbing each
     // element of 'x_data' (positively and negatively) by 'delta', and
     // updating the jacobian with the centered difference
-    for (int r = 0; r < x_size; ++r) {
+    for (size_t r = 0; r < x_size; ++r) {
       // Store current value of 'x' at 'r'.
       X_T v = (*x_datas)[x_idx][r];
 
@@ -374,89 +316,55 @@ inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeNumericJacobianTranspose(
       (*x_datas)[x_idx][r] = v - x_delta;
       std::vector<OrtValue> y_minus = EvaluateFunctionAtInput(op_session, x_infos, y_infos, x_datas, y_datas);
 
-      for (int y_idx = 0; y_idx < static_cast<int>(y_num); y_idx++) {
+      for (size_t y_idx = 0; y_idx < y_num; ++y_idx) {
         if (!y_infos[y_idx].has_gradient) {
           continue;
         }
         // Compute element-wise centered difference and store in each Jacobian.
         auto y_plus_flat = y_plus[y_idx].Get<Tensor>().Data<Y_T>();
         auto y_minus_flat = y_minus[y_idx].Get<Tensor>().Data<Y_T>();
-        const int64_t y_size = y_infos[y_idx].shape.Size();
+        const size_t y_size = static_cast<size_t>(y_infos[y_idx].shape.Size());
         const Y_T scale = static_cast<Y_T>(2 * delta);
-        for (int c = 0; c < y_size; ++c) {
-          auto calc_index = CalculateJacobianTransposeIndex(
-              x_infos,
-              x_idx,
-              r,
-              y_infos,
-              y_idx,
-              c);
-          (*jacobian_ts)[calc_index.first][calc_index.second] = (y_plus_flat[c] - y_minus_flat[c]) / scale;
+        for (size_t c = 0; c < y_size; ++c) {
+          (*jacobian_ts)[row_strides[x_idx] + r][col_strides[y_idx] + c] = (y_plus_flat[c] - y_minus_flat[c]) / scale;
         }
       }
+
       // Restore pre-perturbation value.
       (*x_datas)[x_idx][r] = v;
     }
   }
+
   return Status::OK();
 }
 
-//// The Jacobian is always a real-valued matrix.
-//// Given y = f(x) for tensors y and x, it contains the derivatives dy_i/dx_j for
-//// every pair y_i in y and x_j in x.  Note that the Jacobian is defined directly
-//// over the elements of tensors y and x, and doesn't depend on their shapes.
-////
-//// If x = (x_1, x_2, ..., x_m) and y = (y_1, y_2, .., y_n) the matrix evaluated
-//// is actually the Jacobian transpose, defined as this mxn matrix:
-//// dy_1/d_x1 dy_2/dx_1 ... dy_n/dx_1
-//// dy_1/dx_2 dy_2/dx_2 ... dy_n/dx_2
-////     .
-////     .
-////     .
-//// dy_1/dx_m dy_2/dx_m ... dy_n/dx_m
-template <typename X_T, typename Y_T, typename JAC_T>
-inline Status GradientChecker<X_T, Y_T, JAC_T>::InitJacobians(
-    const std::vector<TensorInfo>& x_infos,
-    const std::vector<TensorInfo>& y_infos,
-    std::vector<std::vector<JAC_T>>* jacobians) {
-  // the number of rows is equal to total number of scalar input values in all of input vectors
-  int64_t rows = 0;
-  for (size_t i = 0; i < x_infos.size(); i++) {
-    rows += x_infos[i].shape.Size();  // 'S'ize gives the total number of elements in all dims while 's'ize just gives num_dims
-  }
-  jacobians->resize(gsl::narrow_cast<int>(rows));
-
-  // the number of cols is equal to total number of scalar output values in all of output vectors
-  int64_t cols = 0;
-  for (size_t i = 0; i < y_infos.size(); i++) {
-    cols += y_infos[i].shape.Size();
-  }
-
-  for (size_t i = 0; i < jacobians->size(); i++) {
-    (*jacobians)[i] = std::vector<JAC_T>(gsl::narrow_cast<int>(cols), 0);
-  }
-
-  return Status().OK();
-}
-
 template <typename X_T, typename Y_T, typename JAC_T>
 inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeGradientErrorInternal(
-    const OpDef& op_def,
-    const std::vector<TensorInfo>& x_infos,
-    const std::vector<TensorInfo>& y_infos,
-    std::vector<std::vector<X_T>>* x_datas,
-    std::vector<std::vector<Y_T>>* y_datas,
-    JAC_T* max_error,
-    const std::vector<AttributeProto>& attributes,
-    bool check_not_have_gradient,
-    bool check_not_have_shape_inferencing,
+    const OpDef& op_def, const std::vector<TensorInfo>& x_infos, const std::vector<TensorInfo>& y_infos,
+    std::vector<std::vector<X_T>>* x_datas, std::vector<std::vector<Y_T>>* y_datas, JAC_T* max_error,
+    const std::vector<AttributeProto>& attributes, bool check_not_have_gradient, bool check_not_have_shape_inferencing,
     std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers /* nullptr */) {
+  std::vector<size_t> row_strides(x_infos.size());
+  std::vector<size_t> col_strides(y_infos.size());
+  size_t row_count = 0;
+  for (size_t i = 0; i < x_infos.size(); ++i) {
+    row_strides[i] = row_count;
+    row_count += static_cast<size_t>(x_infos[i].shape.Size());
+  }
+
+  size_t col_count = 0;
+  for (size_t i = 0; i < y_infos.size(); ++i) {
+    col_strides[i] = col_count;
+    col_count += static_cast<size_t>(y_infos[i].shape.Size());
+  }
+
   // Initialize numeric Jacobian to zeros.
   std::vector<std::vector<JAC_T>> jacobian_ns;
-  ORT_RETURN_IF_ERROR(InitJacobians(x_infos, y_infos, &jacobian_ns));
+  InitJacobians(row_count, col_count, &jacobian_ns);
+
   // Compute numeric Jacobian.
-  ORT_RETURN_IF_ERROR(ComputeNumericJacobianTranspose(
-      op_def, x_infos, y_infos, JAC_T{1e-3f}, x_datas, y_datas, &jacobian_ns, attributes));
+  ORT_RETURN_IF_ERROR(ComputeNumericJacobianTranspose(op_def, x_infos, y_infos, JAC_T{1e-3f}, x_datas, y_datas,
+                                                      &jacobian_ns, row_strides, col_strides, attributes));
 
   // Compute the maximum error between theoretical and numeric Jacobians.
   *max_error = 0.0;
@@ -471,37 +379,42 @@ inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeGradientErrorInternal(
     for (size_t x_gradient_variation = 0; x_gradient_variation < total_gradient_variations; x_gradient_variation++) {
       // Initialize theoretical Jacobians to zeros.
       std::vector<std::vector<JAC_T>> jacobian_ts;
-      ORT_RETURN_IF_ERROR(InitJacobians(x_infos, y_infos, &jacobian_ts));
+      InitJacobians(row_count, col_count, &jacobian_ts);
 
       std::vector<TensorInfo> x_infos_gradient_variation = x_infos;
-
-      if (check_not_have_gradient && x_gradient_variation < x_infos.size())
+      if (check_not_have_gradient && x_gradient_variation < x_infos.size()) {
         x_infos_gradient_variation[x_gradient_variation].has_gradient = false;
+      }
 
+      // A gradient node cannot be created unless at least one input has has_gradient set; skip this variation.
       if (std::all_of(x_infos_gradient_variation.cbegin(), x_infos_gradient_variation.cend(),
-                      [](const TensorInfo& info) { return !info.has_gradient; }))
-        // a gradient node cannot get created without any has_gradient node.
+                      [](const TensorInfo& info) { return !info.has_gradient; })) {
         continue;
+      }
+
       // Compute theoretical Jacobian.
-      ORT_RETURN_IF_ERROR(ComputeTheoreticalJacobianTranspose(
-          op_def, x_infos_gradient_variation, y_infos, x_datas, y_datas, &jacobian_ts, attributes, add_shape, execution_providers));
+      ORT_RETURN_IF_ERROR(ComputeTheoreticalJacobianTranspose(op_def, x_infos_gradient_variation, y_infos, x_datas,
+                                                              y_datas, &jacobian_ts, row_strides, col_strides,
+                                                              attributes, add_shape, execution_providers));
+
       // We have numeric jacobians regardless of has_gradient (computed once).
       // We only have theoretical jacobians for those has_gradient.
       // Theoretical jacobians are 0 for those not has_gradient.
-      int64_t j = 0;
+      size_t j = 0;
       for (auto& x_info : x_infos_gradient_variation) {
+        const size_t x_size = static_cast<size_t>(x_info.shape.Size());
         if (!x_info.has_gradient) {
           // TODO: These 4 test failed at following ORT_ENFORCE. need investigate before enable it.
-          //GradientCheckerTest.MatMulGrad
-          //GradientCheckerTest.GemmGrad
-          //GradientCheckerTest.GatherNDGrad_repeat_float_data
-          //GradientCheckerTest.GatherNDGrad_unique_float_data
-          //auto jac_t = jacobian_ts[j];
-          //ORT_ENFORCE(std::all_of(
+          // GradientCheckerTest.MatMulGrad
+          // GradientCheckerTest.GemmGrad
+          // GradientCheckerTest.GatherNDGrad_repeat_float_data
+          // GradientCheckerTest.GatherNDGrad_unique_float_data
+          // auto jac_t = jacobian_ts[j];
+          // ORT_ENFORCE(std::all_of(
           //    &jac_t[0], &jac_t[0] + x_info.shape.Size(), [](auto dx) { return dx == 0; }));
-          j += x_info.shape.Size();
+          j += x_size;
         } else {
-          for (int r = 0; r < x_info.shape.Size(); j++, r++) {
+          for (size_t r = 0; r < x_size; j++, r++) {
             auto jac_t = jacobian_ts[j];
             auto jac_n = jacobian_ns[j];
             for (size_t k = 0; k < jac_t.size(); k++) {
@@ -520,20 +433,16 @@ inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeGradientErrorInternal(
       }
     }
   }
+
   return Status::OK();
 }
 
 template <typename X_T, typename Y_T, typename JAC_T>
 inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeGradientError(
-    const OpDef& op_def,
-    const std::vector<TensorInfo>& x_infos,
-    const std::vector<TensorInfo>& y_infos,
-    JAC_T* max_error,
-    const std::vector<AttributeProto>& attributes,
-    bool check_not_have_gradient, /* = true*/
+    const OpDef& op_def, const std::vector<TensorInfo>& x_infos, const std::vector<TensorInfo>& y_infos,
+    JAC_T* max_error, const std::vector<AttributeProto>& attributes, bool check_not_have_gradient, /* = true*/
     bool check_not_have_shape_inferencing /* = false*/,
     std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers /* = nullptr */) {
-
   // TODO: Consider varying mean and variance
   float scale = 5.f;
   float mean = 0.f;
@@ -544,7 +453,7 @@ inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeGradientError(
   // Initialize 'x_datas' to random values.
   std::vector<std::vector<X_T>> x_datas(x_infos.size());
   for (size_t i = 0; i < x_infos.size(); i++) {
-    x_datas[i].resize(x_infos[i].shape.Size());
+    x_datas[i].resize(static_cast<size_t>(x_infos[i].shape.Size()));
 
     if (x_infos[i].transformer) {
       auto transformer = *x_infos[i].transformer;
@@ -555,45 +464,34 @@ inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeGradientError(
     }
   }
 
-  // Generate dummy placeholders with zero for y_datas
-  std::vector<std::vector<Y_T>> y_datas(y_infos.size());
-  for (size_t i = 0; i < y_infos.size(); i++) {
-    y_datas[i].resize(y_infos[i].shape.Size(), 0);
-  }
-
-  // Compute gradient error.
-  return ComputeGradientErrorInternal(op_def, x_infos, y_infos, &x_datas, &y_datas, max_error,
-                                      attributes, check_not_have_gradient, check_not_have_shape_inferencing, execution_providers);
+  return ComputeGradientError(op_def, x_infos, y_infos, max_error, x_datas, attributes, check_not_have_gradient,
+                              check_not_have_shape_inferencing, execution_providers);
 }
 
 template <typename X_T, typename Y_T, typename JAC_T>
 inline Status GradientChecker<X_T, Y_T, JAC_T>::ComputeGradientError(
-    const OpDef& op_def,
-    const std::vector<TensorInfo>& x_infos,
-    const std::vector<TensorInfo>& y_infos,
-    JAC_T* max_error,
-    std::vector<std::vector<X_T>> x_datas,
-    const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes,
-    bool check_not_have_gradient, /* = true*/
+    const OpDef& op_def, const std::vector<TensorInfo>& x_infos, const std::vector<TensorInfo>& y_infos,
+    JAC_T* max_error, std::vector<std::vector<X_T>> x_datas,
+    const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes, bool check_not_have_gradient, /* = true*/
     bool check_not_have_shape_inferencing /* = false*/,
     std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers /* = nullptr */) {
-
   // Generate dummy placeholders with zero for y_datas
   std::vector<std::vector<Y_T>> y_datas(y_infos.size());
   for (size_t i = 0; i < y_infos.size(); i++) {
-    y_datas[i].resize(y_infos[i].shape.Size(), 0);
+    y_datas[i].resize(static_cast<size_t>(y_infos[i].shape.Size()), 0);
   }
 
   // Compute gradient error.
-  return ComputeGradientErrorInternal(op_def, x_infos, y_infos, &x_datas, &y_datas, max_error,
-                                      attributes, check_not_have_gradient, check_not_have_shape_inferencing, execution_providers);
+  return ComputeGradientErrorInternal(op_def, x_infos, y_infos, &x_datas, &y_datas, max_error, attributes,
+                                      check_not_have_gradient, check_not_have_shape_inferencing, execution_providers);
 }
 
-#define INSTANTIATE_GRAD_ERR_TYPE(X_T, Y_T, JAC_T) \
-  template class GradientChecker<X_T, Y_T, JAC_T>;
+#define INSTANTIATE_GRAD_ERR_TYPE(X_T, Y_T, JAC_T) template class GradientChecker<X_T, Y_T, JAC_T>;
 
 INSTANTIATE_GRAD_ERR_TYPE(float, float, float);
 INSTANTIATE_GRAD_ERR_TYPE(double, double, double);
 
+#undef INSTANTIATE_GRAD_ERR_TYPE
+
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/orttraining/orttraining/test/gradient/gradient_checker.h b/orttraining/orttraining/test/gradient/gradient_checker.h
index 6a857c298e..872d08ab27 100644
--- a/orttraining/orttraining/test/gradient/gradient_checker.h
+++ b/orttraining/orttraining/test/gradient/gradient_checker.h
@@ -23,8 +23,7 @@ namespace onnxruntime {
 namespace test {
 
 struct TensorInfo {
-  TensorInfo(std::initializer_list<int64_t> shape_init,
-             bool has_gradient = true,
+  TensorInfo(std::initializer_list<int64_t> shape_init, bool has_gradient = true,
              std::function<float(float)>* transformer = nullptr,
              MLDataType data_type = DataTypeImpl::GetTensorType<float>(),
              const std::vector<std::string>& dim_params = std::vector<std::string>{})
@@ -34,9 +33,7 @@ struct TensorInfo {
         data_type(data_type),
         dim_params(dim_params) {}
 
-  TensorInfo(const TensorShape& shape,
-             bool has_gradient = true,
-             std::function<float(float)>* transformer = nullptr,
+  TensorInfo(const TensorShape& shape, bool has_gradient = true, std::function<float(float)>* transformer = nullptr,
              MLDataType data_type = DataTypeImpl::GetTensorType<float>())
       : shape(shape), has_gradient(has_gradient), transformer(transformer), data_type(data_type) {}
 
@@ -66,89 +63,71 @@ class GradientChecker {
   ///
   /// if y = Square(x), where x (and so y) are DT_DOUBLE,
   /// <X_T, Y_T, JAC_T> should be <double, double, double>
-  Status ComputeGradientError(
-      const training::OpDef& op_def,
-      const std::vector<TensorInfo>& x_infos,
-      const std::vector<TensorInfo>& y_infos,
-      JAC_T* max_error,
-      const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes = {},
-      // TODO: Ideally it shall check for not has_gradient cases. But some tests are failing
-      // because the gradient op does not handle the case. We have to use this flag
-      // to disable check for not having gradient cases in order to pass those test.
-      // Remove this flag when the gradient op is fixed.
-      bool check_not_have_gradient = true,
-      // Also check gradient builder for op for cases where input shapes are not available
-      bool check_not_have_shape_inferencing = false,
-      std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers = nullptr);
+  Status ComputeGradientError(const training::OpDef& op_def, const std::vector<TensorInfo>& x_infos,
+                              const std::vector<TensorInfo>& y_infos, JAC_T* max_error,
+                              const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes = {},
+                              // TODO: Ideally it shall check for not has_gradient cases. But some tests are failing
+                              // because the gradient op does not handle the case. We have to use this flag
+                              // to disable check for not having gradient cases in order to pass those tests.
+                              // Remove this flag when the gradient op is fixed.
+                              bool check_not_have_gradient = true,
+                              // Also check gradient builder for op for cases where input shapes are not available
+                              bool check_not_have_shape_inferencing = false,
+                              std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers = nullptr);
 
-  Status ComputeGradientError(
-      const training::OpDef& op_def,
-      const std::vector<TensorInfo>& x_infos,
-      const std::vector<TensorInfo>& y_infos,
-      JAC_T* max_error,
-      std::vector<std::vector<X_T>> x_datas,
-      const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes = {},
-      // TODO: Ideally it shall check for not has_gradient cases. But some tests are failing
-      // because the gradient op does not handle the case. We have to use this flag
-      // to disable check for not having gradient cases in order to pass those test.
-      // Remove this flag when the gradient op is fixed.
-      bool check_not_have_gradient = true,
-      // Also check gradient builder for op for cases where input shapes are not available
-      bool check_not_have_shape_inferencing = false,
-      std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers = nullptr);
+  Status ComputeGradientError(const training::OpDef& op_def, const std::vector<TensorInfo>& x_infos,
+                              const std::vector<TensorInfo>& y_infos, JAC_T* max_error,
+                              std::vector<std::vector<X_T>> x_datas,
+                              const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes = {},
+                              // TODO: Ideally it shall check for not has_gradient cases. But some tests are failing
+                              // because the gradient op does not handle the case. We have to use this flag
+                              // to disable check for not having gradient cases in order to pass those tests.
+                              // Remove this flag when the gradient op is fixed.
+                              bool check_not_have_gradient = true,
+                              // Also check gradient builder for op for cases where input shapes are not available
+                              bool check_not_have_shape_inferencing = false,
+                              std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers = nullptr);
 
  private:
-  Status InitJacobians(const std::vector<TensorInfo>& x_infos,
-                       const std::vector<TensorInfo>& y_infos,
-                       std::vector<std::vector<JAC_T>>* jacobians);
+  void InitJacobians(size_t row_count, size_t col_count, std::vector<std::vector<JAC_T>>* jacobians);
 
-  std::vector<OrtValue> EvaluateFunctionAtInput(OpTester& op_tester,
-                                                const std::vector<TensorInfo>& x_infos,
+  void AddDatas(OpTester& op_session, const std::vector<TensorInfo>& x_infos, const std::vector<TensorInfo>& y_infos,
+                std::vector<std::vector<X_T>>* x_datas, std::vector<std::vector<Y_T>>* y_datas);
+
+  std::vector<OrtValue> EvaluateFunctionAtInput(OpTester& op_tester, const std::vector<TensorInfo>& x_infos,
                                                 const std::vector<TensorInfo>& y_infos,
                                                 std::vector<std::vector<X_T>>* x_datas,
                                                 std::vector<std::vector<Y_T>>* y_datas);
 
-  Status InitOpTesterWithGraph(OpTester& op_tester,
-                               const std::vector<TensorInfo>& x_infos,
-                               const std::vector<TensorInfo>& y_infos,
-                               std::vector<std::vector<X_T>>* x_datas,
+  Status InitOpTesterWithGraph(OpTester& op_tester, const std::vector<TensorInfo>& x_infos,
+                               const std::vector<TensorInfo>& y_infos, std::vector<std::vector<X_T>>* x_datas,
                                std::vector<std::vector<Y_T>>* y_datas,
                                const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes,
                                const std::unordered_map<std::string, int>& extra_domain_to_version = {});
 
-  Status InitOpTesterWithGradGraph(OpTester& op_tester,
-                                   const std::vector<TensorInfo>& x_infos,
-                                   const std::vector<TensorInfo>& y_infos,
-                                   std::vector<std::vector<X_T>>* x_datas,
+  Status InitOpTesterWithGradGraph(OpTester& op_tester, const std::vector<TensorInfo>& x_infos,
+                                   const std::vector<TensorInfo>& y_infos, std::vector<std::vector<X_T>>* x_datas,
                                    std::vector<std::vector<Y_T>>* y_datas,
                                    const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes);
 
-  Status ComputeTheoreticalJacobianTranspose(const training::OpDef& op_def,
-                                             const std::vector<TensorInfo>& x_infos,
-                                             const std::vector<TensorInfo>& y_infos,
-                                             std::vector<std::vector<X_T>>* x_datas,
-                                             std::vector<std::vector<Y_T>>* y_datas,
-                                             std::vector<std::vector<JAC_T>>* jacobian_ts,
-                                             const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes,
-                                             bool add_shape = true,
-                                             std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers = nullptr);
+  Status ComputeTheoreticalJacobianTranspose(
+      const training::OpDef& op_def, const std::vector<TensorInfo>& x_infos, const std::vector<TensorInfo>& y_infos,
+      std::vector<std::vector<X_T>>* x_datas, std::vector<std::vector<Y_T>>* y_datas,
+      std::vector<std::vector<JAC_T>>* jacobian_ts, const std::vector<size_t>& row_strides,
+      const std::vector<size_t>& col_strides, const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes,
+      bool add_shape = true, std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers = nullptr);
 
-  Status ComputeNumericJacobianTranspose(const training::OpDef& op_def,
-                                         const std::vector<TensorInfo>& x_infos,
-                                         const std::vector<TensorInfo>& y_infos,
-                                         const JAC_T delta,
-                                         std::vector<std::vector<X_T>>* x_datas,
-                                         std::vector<std::vector<Y_T>>* y_datas,
+  Status ComputeNumericJacobianTranspose(const training::OpDef& op_def, const std::vector<TensorInfo>& x_infos,
+                                         const std::vector<TensorInfo>& y_infos, const JAC_T delta,
+                                         std::vector<std::vector<X_T>>* x_datas, std::vector<std::vector<Y_T>>* y_datas,
                                          std::vector<std::vector<JAC_T>>* jacobian_ts,
+                                         const std::vector<size_t>& row_strides, const std::vector<size_t>& col_strides,
                                          const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes,
                                          bool add_shape = true);
 
-  Status ComputeGradientErrorInternal(const training::OpDef& op_name,
-                                      const std::vector<TensorInfo>& x_infos,
-                                      const std::vector<TensorInfo>& y_infos,
-                                      std::vector<std::vector<X_T>>* x_datas,
-                                      std::vector<std::vector<Y_T>>* y_datas,
-                                      JAC_T* max_error,
+  Status ComputeGradientErrorInternal(const training::OpDef& op_name, const std::vector<TensorInfo>& x_infos,
+                                      const std::vector<TensorInfo>& y_infos, std::vector<std::vector<X_T>>* x_datas,
+                                      std::vector<std::vector<Y_T>>* y_datas, JAC_T* max_error,
                                       const std::vector<ONNX_NAMESPACE::AttributeProto>& attributes,
                                       bool check_not_have_gradient = true,
                                       bool check_not_have_shape_inferencing = false,
diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc
index 1af5b18072..3c803748c7 100644
--- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc
+++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc
@@ -21,6 +21,15 @@
 
 #include "onnx/defs/attr_proto_util.h"
 
+/**
+ * The GradientChecker will compute numeric Jacobian and theoretical Jacobian for comparison during the test.
+ * The numeric Jacobian is computed on every single input element by running the forward graph using OpTester twice.
+ * The theoretical Jacobian is computed on every single output element by running both forward and backward graph
+ * using OpTester. I.e., if there are M elements in inputs and N elements in outputs, it will run forward graph
+ * (M * 2 + N) times and backward graph N times using OpTester, which is super time consuming. So please keep the
+ * size of inputs and outputs small in the tests.
+ */
+
 namespace onnxruntime {
 namespace test {
 
@@ -31,58 +40,28 @@ static bool IsErrorWithinTolerance(float error, float tolerance) {
   return !std::isnan(error) && !std::isnan(tolerance) && error <= tolerance;
 }
 
-#define EXPECT_IS_TINIER_THAN(max_error, tolerance)         \
-  EXPECT_TRUE(IsErrorWithinTolerance(max_error, tolerance)) \
-      << "max_error: " << max_error                         \
-      << "; tolerance: " << tolerance                       \
+#define EXPECT_IS_TINIER_THAN(max_error, tolerance)                 \
+  EXPECT_TRUE(IsErrorWithinTolerance(max_error, tolerance))         \
+      << "max_error: " << max_error << "; tolerance: " << tolerance \
       << "; ORT test random seed: " << GetTestRandomSeed() << "; "
 
-#define EXPECT_IS_TINY(max_error) \
-  EXPECT_IS_TINIER_THAN(max_error, 1.5e-2f)
+#define EXPECT_IS_TINY(max_error) EXPECT_IS_TINIER_THAN(max_error, 1.5e-2f)
 
-static void RunReductionTests(const OpDef& op_def,
-                              bool axes_as_input = false,
+static void RunReductionTests(const OpDef& op_def, bool axes_as_input = false,
                               bool check_not_have_shape_inferencing = false) {
-  std::vector<std::vector<int64_t>>
-      x_shapes = {
-          {4, 3, 2},
-          {4, 3, 2},
-          {4, 3, 2},
-          {4, 3, 2},
-          {4, 3, 2},
-          {4, 3, 2},
-          {4, 3, 2},
-          {4, 3, 2},
-      };
+  std::vector<std::vector<int64_t>> x_shapes = {
+      {4, 3, 2}, {4, 3, 2}, {4, 3, 2}, {4, 3, 2}, {4, 3, 2}, {4, 3, 2}, {4, 3, 2}, {4, 3, 2},
+  };
   std::vector<std::vector<int64_t>> y_shapes = {
-      {1, 1, 1},
-      {},
-      {1, 3, 1},
-      {2},
-      {4, 1, 2},
-      {4, 3},
-      {4, 1, 2},
-      {4},
+      {1, 1, 1}, {}, {1, 3, 1}, {2}, {4, 1, 2}, {4, 3}, {4, 1, 2}, {4},
   };
   std::vector<std::vector<int64_t>> axes_vec = {
-      {},  //default case
-      {0, 1, 2},
-      {0, 2},
-      {0, 1},
-      {1},
-      {2},
-      {-2},
-      {-2, -1},
+      {},  // default case
+      {0, 1, 2}, {0, 2}, {0, 1}, {1}, {2}, {-2}, {-2, -1},
   };
   std::vector<int64_t> keepdims_ip = {
-      -1,  //default case
-      0,
-      1,
-      0,
-      1,
-      0,
-      1,
-      0,
+      -1,  // default case
+      0,  1, 0, 1, 0, 1, 0,
   };
 
   GradientChecker<float, float, float> gradient_checker;
@@ -102,26 +81,24 @@ static void RunReductionTests(const OpDef& op_def,
     if (axes_as_input) {
       std::vector<float> axes_float;
       axes_float.reserve(axes.size());
-      std::transform(std::begin(axes), std::end(axes), std::back_inserter(axes_float), [](int64_t i) { return static_cast<float>(i); });
+      std::transform(std::begin(axes), std::end(axes), std::back_inserter(axes_float),
+                     [](int64_t i) { return static_cast<float>(i); });
       TensorInfo axes_info({static_cast<int64_t>(axes.size())}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>());
       input.push_back(axes_info);
       x_datas.push_back(axes_float);
     } else {
-      if (axes.size() > 0)
-        attributes.push_back(MakeAttribute("axes", axes));
+      if (axes.size() > 0) attributes.push_back(MakeAttribute("axes", axes));
     }
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, input, {y_shape}, &max_error, x_datas,
-                                                           attributes, true, check_not_have_shape_inferencing));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, input, {y_shape}, &max_error, x_datas, attributes,
+                                                           true, check_not_have_shape_inferencing));
     EXPECT_IS_TINY(max_error);
   }
 }
 
 template <typename T>
-void GenerateRandomDataWithOneHot(
-    std::vector<std::vector<float>>& x_datas,
-    std::vector<TensorShape> input_shapes,
-    const std::unordered_set<int>& one_hot_input_indices) {
+void GenerateRandomDataWithOneHot(std::vector<std::vector<float>>& x_datas, std::vector<TensorShape> input_shapes,
+                                  const std::unordered_set<int>& one_hot_input_indices) {
   for (int i = 0; i < 2; i++) {
     // TODO: Consider varying mean and variance
     float scale = 5.f;
@@ -139,15 +116,15 @@ void GenerateRandomDataWithOneHot(
       int64_t D = input_shapes[i][input_shapes[i].NumDimensions() - 1];
 
       std::fill(x_datas[i].begin(), x_datas[i].end(), (T)0);
-      for (int64_t k = 0; k < N; k++)
-        x_datas[i][k * D + (seed % D)] = (T)1;
+      for (int64_t k = 0; k < N; k++) x_datas[i][k * D + (seed % D)] = (T)1;
     } else {
       std::generate(x_datas[i].begin(), x_datas[i].end(), [&] { return distribution(generator); });
     }
   }
 }
 
-void UnaryOpGradientTest(const std::string& op_type, const std::string& domain = kOnnxDomain, const int opset_version = 9,
+void UnaryOpGradientTest(const std::string& op_type, const std::string& domain = kOnnxDomain,
+                         const int opset_version = 9,
                          std::vector<std::unique_ptr<IExecutionProvider>>* execution_providers = nullptr) {
   TensorShape shape({2, 3, 4});
   float max_error;
@@ -161,9 +138,7 @@ void UnaryOpGradientTest(const std::string& op_type, const std::string& domain =
   EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
 }
 
-TEST(GradientCheckerTest, ErfGrad) {
-  UnaryOpGradientTest("Erf");
-}
+TEST(GradientCheckerTest, ErfGrad) { UnaryOpGradientTest("Erf"); }
 
 TEST(GradientCheckerTest, SqrtGrad) {
   TensorShape shape({2, 3, 4});
@@ -181,127 +156,128 @@ TEST(GradientCheckerTest, SqrtGrad) {
   EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
 }
 
-void RunBroadcastableBinaryOpGradTests(const OpDef& op_def,
-                                       std::function<float(float)>* transformer,
+void RunBroadcastableBinaryOpGradTests(const OpDef& op_def, std::function<float(float)>* transformer,
                                        bool check_not_have_shape_inferencing) {
   float max_error;
   GradientChecker<float, float, float> gradient_checker;
   const std::vector<ONNX_NAMESPACE::AttributeProto> attributes = {};
 
-  //shape(A) = (2, 3, 4, 5), shape(B) = (2, 3, 4, 5), ==> shape(result) = (2, 3, 4, 5)
+  // shape(A) = (2, 3, 2, 3), shape(B) = (2, 3, 2, 3), ==> shape(result) = (2, 3, 2, 3)
   {
-    TensorInfo A_info{{2, 3, 4, 5}, true, transformer};
-    TensorInfo B_info{{2, 3, 4, 5}, true, transformer};
-    TensorInfo Y_info{{2, 3, 4, 5}};
+    TensorInfo A_info{{2, 3, 2, 3}, true, transformer};
+    TensorInfo B_info{{2, 3, 2, 3}, true, transformer};
+    TensorInfo Y_info{{2, 3, 2, 3}};
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {A_info, B_info}, {Y_info}, &max_error,
-                                                           attributes, true, check_not_have_shape_inferencing));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {A_info, B_info}, {Y_info}, &max_error, attributes,
+                                                           true, check_not_have_shape_inferencing));
     EXPECT_IS_TINY(max_error);
   }
 
-  //shape(A) = (2, 3, 4, 5), shape(B) = (,), i.e. B is a scalar ==> shape(result) = (2, 3, 4, 5)
+  // shape(A) = (2, 3, 2, 3), shape(B) = (,), i.e. B is a scalar ==> shape(result) = (2, 3, 2, 3)
   {
-    TensorInfo A_info{{2, 3, 4, 5}, true, transformer};
+    TensorInfo A_info{{2, 3, 2, 3}, true, transformer};
     TensorInfo B_info{{}, true, transformer};
-    TensorInfo Y_info{{2, 3, 4, 5}};
+    TensorInfo Y_info{{2, 3, 2, 3}};
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {A_info, B_info}, {Y_info}, &max_error,
-                                                           attributes, true, check_not_have_shape_inferencing));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {A_info, B_info}, {Y_info}, &max_error, attributes,
+                                                           true, check_not_have_shape_inferencing));
     EXPECT_IS_TINY(max_error);
   }
 
-  //shape(A) = (,), shape(B) = (2, 3, 4, 5), i.e. A is a scalar ==> shape(result) = (2, 3, 4, 5)
+  // shape(A) = (,), shape(B) = (2, 3, 2, 3), i.e. A is a scalar ==> shape(result) = (2, 3, 2, 3)
   {
     TensorInfo A_info{{}, true, transformer};
-    TensorInfo B_info{{2, 3, 4, 5}, true, transformer};
-    TensorInfo Y_info{{2, 3, 4, 5}};
+    TensorInfo B_info{{2, 3, 2, 3}, true, transformer};
+    TensorInfo Y_info{{2, 3, 2, 3}};
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {A_info, B_info}, {Y_info}, &max_error,
-                                                           attributes, true, check_not_have_shape_inferencing));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {A_info, B_info}, {Y_info}, &max_error, attributes,
+                                                           true, check_not_have_shape_inferencing));
     EXPECT_IS_TINY(max_error);
   }
 
-  //shape(A) = (2, 3, 4, 5), shape(B) = (5,), ==> shape(result) = (2, 3, 4, 5)
+  // shape(A) = (2, 3, 2, 3), shape(B) = (3,), ==> shape(result) = (2, 3, 2, 3)
   {
-    TensorInfo A_info{{2, 3, 4, 5}, true, transformer};
-    TensorInfo B_info{{5}, true, transformer};
-    TensorInfo Y_info{{2, 3, 4, 5}};
+    TensorInfo A_info{{2, 3, 2, 3}, true, transformer};
+    TensorInfo B_info{{3}, true, transformer};
+    TensorInfo Y_info{{2, 3, 2, 3}};
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {A_info, B_info}, {Y_info}, &max_error,
-                                                           attributes, true, check_not_have_shape_inferencing));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {A_info, B_info}, {Y_info}, &max_error, attributes,
+                                                           true, check_not_have_shape_inferencing));
     EXPECT_IS_TINY(max_error);
   }
 
-  //shape(A) = (4, 5), shape(B) = (2, 3, 4, 5), ==> shape(result) = (2, 3, 4, 5)
+  // shape(A) = (2, 3), shape(B) = (2, 3, 2, 3), ==> shape(result) = (2, 3, 2, 3)
   {
-    TensorInfo A_info{{4, 5}, true, transformer};
-    TensorInfo B_info{{2, 3, 4, 5}, true, transformer};
-    TensorInfo Y_info{{2, 3, 4, 5}};
+    TensorInfo A_info{{2, 3}, true, transformer};
+    TensorInfo B_info{{2, 3, 2, 3}, true, transformer};
+    TensorInfo Y_info{{2, 3, 2, 3}};
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {A_info, B_info}, {Y_info}, &max_error,
-                                                           attributes, true, check_not_have_shape_inferencing));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {A_info, B_info}, {Y_info}, &max_error, attributes,
+                                                           true, check_not_have_shape_inferencing));
     EXPECT_IS_TINY(max_error);
   }
 
-  //shape(A) = (1, 4, 5), shape(B) = (2, 3, 1, 1), ==> shape(result) = (2, 3, 4, 5)
+  // shape(A) = (1, 2, 3), shape(B) = (2, 3, 1, 1), ==> shape(result) = (2, 3, 2, 3)
   {
-    TensorInfo A_info{{1, 4, 5}, true, transformer};
+    TensorInfo A_info{{1, 2, 3}, true, transformer};
     TensorInfo B_info{{2, 3, 1, 1}, true, transformer};
-    TensorInfo Y_info{{2, 3, 4, 5}};
+    TensorInfo Y_info{{2, 3, 2, 3}};
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {A_info, B_info}, {Y_info}, &max_error,
-                                                           attributes, true, check_not_have_shape_inferencing));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {A_info, B_info}, {Y_info}, &max_error, attributes,
+                                                           true, check_not_have_shape_inferencing));
     EXPECT_IS_TINY(max_error);
   }
 
-  //shape(A) = (3, 4, 5), shape(B) = (2, 1, 1, 1), ==> shape(result) = (2, 3, 4, 5)
+  // shape(A) = (3, 2, 3), shape(B) = (2, 1, 1, 1), ==> shape(result) = (2, 3, 2, 3)
   {
-    TensorInfo A_info{{3, 4, 5}, true, transformer};
+    TensorInfo A_info{{3, 2, 3}, true, transformer};
     TensorInfo B_info{{2, 1, 1, 1}, true, transformer};
-    TensorInfo Y_info{{2, 3, 4, 5}};
+    TensorInfo Y_info{{2, 3, 2, 3}};
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {A_info, B_info}, {Y_info}, &max_error,
-                                                           attributes, true, check_not_have_shape_inferencing));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {A_info, B_info}, {Y_info}, &max_error, attributes,
+                                                           true, check_not_have_shape_inferencing));
     EXPECT_IS_TINY(max_error);
   }
 
-  //shape(A) = (2, 1, 1, 5), shape(B) = (1, 3, 4, 1), ==> shape(result) = (2, 3, 4, 5)
+  // shape(A) = (2, 1, 1, 3), shape(B) = (1, 3, 2, 1), ==> shape(result) = (2, 3, 2, 3)
   {
-    TensorInfo A_info{{2, 1, 1, 5}, true, transformer};
-    TensorInfo B_info{{1, 3, 4, 1}, true, transformer};
-    TensorInfo Y_info{{2, 3, 4, 5}};
+    TensorInfo A_info{{2, 1, 1, 3}, true, transformer};
+    TensorInfo B_info{{1, 3, 2, 1}, true, transformer};
+    TensorInfo Y_info{{2, 3, 2, 3}};
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {A_info, B_info}, {Y_info}, &max_error,
-                                                           attributes, true, check_not_have_shape_inferencing));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {A_info, B_info}, {Y_info}, &max_error, attributes,
+                                                           true, check_not_have_shape_inferencing));
     EXPECT_IS_TINY(max_error);
   }
 
   // symbolic broadcast
-  // shape(A) = (4, 2, 1, "seq(3)"), shape(B) = (4, 2, 1, 1), ==> shape(result) = (4, 2, 1, 3)
+  // shape(A) = (3, 2, 1, "seq(3)"), shape(B) = (3, 2, 1, 1), ==> shape(result) = (3, 2, 1, 3)
   {
-    TensorInfo A_info{{4, 2, 1, 3}, true, transformer, DataTypeImpl::GetTensorType<float>(), {"4", "2", "1", "seq"}};
-    TensorInfo B_info{{4, 2, 1, 1}, true, transformer, DataTypeImpl::GetTensorType<float>(), {"4", "2", "1", "1"}};
-    TensorInfo Y_info{{4, 2, 1, 3}};
+    TensorInfo A_info{{3, 2, 1, 3}, true, transformer, DataTypeImpl::GetTensorType<float>(), {"3", "2", "1", "seq"}};
+    TensorInfo B_info{{3, 2, 1, 1}, true, transformer, DataTypeImpl::GetTensorType<float>(), {"3", "2", "1", "1"}};
+    TensorInfo Y_info{{3, 2, 1, 3}};
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {A_info, B_info}, {Y_info}, &max_error,
-                                                           attributes, true, check_not_have_shape_inferencing));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {A_info, B_info}, {Y_info}, &max_error, attributes,
+                                                           true, check_not_have_shape_inferencing));
     EXPECT_IS_TINY(max_error);
   }
   // symbolic broadcast + numeric broadcast
-  // shape(A) = ("batch(4)", 2, "seq(3)", "seq(3)"), shape(B) = ("batch(4)", 1, "seq(3)", "seq(3)"), ==> shape(result) = (4, 2, 3, 3)
+  // shape(A) = ("batch(3)", 2, "seq(3)", "seq(3)"), shape(B) = ("batch(3)", 1, 1, "seq(3)"), ==> shape(result) =
+  // (3, 2, 3, 3)
   {
-    TensorInfo A_info{{4, 2, 3, 3}, true, transformer, DataTypeImpl::GetTensorType<float>(), {"batch", "2", "seq", "seq"}};
-    TensorInfo B_info{{4, 1, 1, 3}, true, transformer, DataTypeImpl::GetTensorType<float>(), {"batch", "1", "1", "seq"}};
-    TensorInfo Y_info{{4, 2, 3, 3}};
+    TensorInfo A_info{
+        {3, 2, 3, 3}, true, transformer, DataTypeImpl::GetTensorType<float>(), {"batch", "2", "seq", "seq"}};
+    TensorInfo B_info{
+        {3, 1, 1, 3}, true, transformer, DataTypeImpl::GetTensorType<float>(), {"batch", "1", "1", "seq"}};
+    TensorInfo Y_info{{3, 2, 3, 3}};
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {A_info, B_info}, {Y_info}, &max_error,
-                                                           attributes, true, check_not_have_shape_inferencing));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {A_info, B_info}, {Y_info}, &max_error, attributes,
+                                                           true, check_not_have_shape_inferencing));
     EXPECT_IS_TINY(max_error);
   }
 }
 
-void TestBroadcastableBinaryOpGrad(const std::string& op_type,
-                                   std::function<float(float)>* transformer = nullptr,
+void TestBroadcastableBinaryOpGrad(const std::string& op_type, std::function<float(float)>* transformer = nullptr,
                                    bool check_not_have_shape_inferencing = true) {
   OpDef op_def_opset11{op_type, kOnnxDomain, 11};
   RunBroadcastableBinaryOpGradTests(op_def_opset11, transformer, check_not_have_shape_inferencing);
@@ -309,18 +285,12 @@ void TestBroadcastableBinaryOpGrad(const std::string& op_type,
   RunBroadcastableBinaryOpGradTests(op_def_opset13, transformer, check_not_have_shape_inferencing);
 }
 
-TEST(GradientCheckerTest, AddGrad) {
-  TestBroadcastableBinaryOpGrad("Add");
-}
+TEST(GradientCheckerTest, AddGrad) { TestBroadcastableBinaryOpGrad("Add"); }
 
-TEST(GradientCheckerTest, SubGrad) {
-  TestBroadcastableBinaryOpGrad("Sub");
-}
+TEST(GradientCheckerTest, SubGrad) { TestBroadcastableBinaryOpGrad("Sub"); }
 
-//flaky
-TEST(GradientCheckerTest, DISABLED_MulGrad) {
-  TestBroadcastableBinaryOpGrad("Mul");
-}
+// flaky
+TEST(GradientCheckerTest, DISABLED_MulGrad) { TestBroadcastableBinaryOpGrad("Mul"); }
 
 TEST(GradientCheckerTest, DivGrad) {
   std::function<float(float)> transformer = [](float x) { return x > 0 ? x + 0.2f : x - 0.2f; };
@@ -362,57 +332,57 @@ void RunMatMulGradTests(const OpDef& op_def) {
 
   // 2D x 2D
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 4}, {4, 3}}, {{2, 3}}, &max_error,
-                                                           attributes, true, true));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {{2, 2}, {2, 3}}, {{2, 3}}, &max_error, attributes, true, true));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
   // 3D x 3D
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 3, 4}, {2, 4, 3}}, {{2, 3, 3}}, &max_error,
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 3, 2}, {2, 2, 3}}, {{2, 3, 3}}, &max_error,
                                                            attributes, true, true));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
   // 3D x 2D
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 3, 4}, {4, 3}}, {{2, 3, 3}}, &max_error,
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 3, 2}, {2, 3}}, {{2, 3, 3}}, &max_error,
                                                            attributes, true, true));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
   // 2D x 3D
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{3, 4}, {2, 4, 3}}, {{2, 3, 3}}, &max_error,
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{3, 2}, {2, 2, 3}}, {{2, 3, 3}}, &max_error,
                                                            attributes, true, true));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
   // 4D x 4D
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 3, 4, 5}, {2, 3, 5, 4}}, {{2, 3, 4, 4}}, &max_error,
-                                                           attributes, true, true));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 3, 3, 2}, {2, 3, 2, 3}}, {{2, 3, 3, 3}},
+                                                           &max_error, attributes, true, true));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
   // 4D x 2D
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 3, 4, 5}, {5, 4}}, {{2, 3, 4, 4}}, &max_error,
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 3, 3, 2}, {2, 3}}, {{2, 3, 3, 3}}, &max_error,
                                                            attributes, true, true));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
   // 4D x 3D
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 3, 4, 5}, {3, 5, 4}}, {{2, 3, 4, 4}}, &max_error,
-                                                           attributes, true, true));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 3, 3, 2}, {3, 2, 3}}, {{2, 3, 3, 3}},
+                                                           &max_error, attributes, true, true));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
   // 4D x 4D with broadcast
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 1, 4, 5}, {1, 3, 5, 4}}, {{2, 3, 4, 4}}, &max_error,
-                                                           attributes, true, true));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 1, 3, 2}, {1, 3, 2, 3}}, {{2, 3, 3, 3}},
+                                                           &max_error, attributes, true, true));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 }
@@ -424,20 +394,14 @@ TEST(GradientCheckerTest, MatMulGrad) {
   RunMatMulGradTests(op_def_opset13);
 }
 
-TEST(GradientCheckerTest, SinGrad) {
-  UnaryOpGradientTest("Sin");
-}
+TEST(GradientCheckerTest, SinGrad) { UnaryOpGradientTest("Sin"); }
 
-TEST(GradientCheckerTest, NegGrad) {
-  UnaryOpGradientTest("Neg");
-}
+TEST(GradientCheckerTest, NegGrad) { UnaryOpGradientTest("Neg"); }
 
-TEST(GradientCheckerTest, AbsGrad) {
-  UnaryOpGradientTest("Abs");
-}
+TEST(GradientCheckerTest, AbsGrad) { UnaryOpGradientTest("Abs"); }
 
 TEST(GradientCheckerTest, LogGrad) {
-  TensorShape shape({2, 5, 6});
+  TensorShape shape({2, 3, 4});
 
   std::function<float(float)> transformer = [](float x) { return std::fabs(x) + 1e-1f; };
   TensorInfo x_info{shape, true, &transformer};
@@ -478,32 +442,25 @@ TEST(GradientCheckerTest, ExpGrad) {
 }
 
 TEST(GradientCheckerTest, FlattenGrad) {
-  TensorShape shape({2, 3, 4});
+  TensorShape shape({2, 3, 2});
   float max_error;
   float error_tolerance = 1e-3f;
   GradientChecker<float, float, float> gradient_checker;
   OpDef op_def{"Flatten", kOnnxDomain, 11};
 
   const std::vector<std::pair<int, TensorShape>> axis_to_shape = {
-      {-3, {1, 24}},
-      {-2, {2, 12}},
-      {-1, {6, 4}},
-      {0, {1, 24}},
-      {1, {2, 12}},
-      {2, {6, 4}},
-      {3, {24, 1}}};
+      {-3, {1, 12}}, {-2, {2, 6}}, {-1, {6, 2}}, {0, {1, 12}}, {1, {2, 6}}, {2, {6, 2}}, {3, {12, 1}}};
 
   for (auto& pair : axis_to_shape) {
     int axis = pair.first;
     const TensorShape& output_shape = pair.second;
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {shape}, {output_shape}, &max_error, {MakeAttribute("axis", int64_t(axis))}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {shape}, {output_shape}, &max_error,
+                                                           {MakeAttribute("axis", int64_t(axis))}));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 }
 
-TEST(GradientCheckerTest, TanhGrad) {
-  UnaryOpGradientTest("Tanh");
-}
+TEST(GradientCheckerTest, TanhGrad) { UnaryOpGradientTest("Tanh"); }
 
 // TODO fix flaky test
 // failing random seed with error_tolerance of 1.5e-2f: 322298223
@@ -515,43 +472,50 @@ void RunGemmGradTests(const OpDef& op_def) {
 
   // Single Batch no third input
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{1, 4}, {4, 3}}, {{1, 3}}, &max_error, attributes, true, true));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {{1, 3}, {3, 2}}, {{1, 2}}, &max_error, attributes, true, true));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
   // Single Batch with Scalar Bias
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{1, 4}, {4, 3}, {}}, {{1, 3}}, &max_error, attributes, true, true));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{1, 3}, {3, 2}, {}}, {{1, 2}}, &max_error,
+                                                           attributes, true, true));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
   // Single Batch with Vector Bias
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{1, 4}, {4, 3}, {3}}, {{1, 3}}, &max_error, attributes, true, true));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{1, 3}, {3, 2}, {2}}, {{1, 2}}, &max_error,
+                                                           attributes, true, true));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
   // Non-Single Batch with Scalar Bias
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 4}, {4, 3}, {}}, {{2, 3}}, &max_error, attributes, true, true));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 4}, {4, 3}, {}}, {{2, 3}}, &max_error,
+                                                           attributes, true, true));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
   // Non-Single Batch with Vector Bias
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 4}, {4, 3}, {3}}, {{2, 3}}, &max_error, attributes, true, true));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 4}, {4, 3}, {3}}, {{2, 3}}, &max_error,
+                                                           attributes, true, true));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
   // Non-Single Batch with Broadcast Bias
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 4}, {4, 3}, {1, 3}}, {{2, 3}}, &max_error, attributes, true, true));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 4}, {4, 3}, {1, 3}}, {{2, 3}}, &max_error,
+                                                           attributes, true, true));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
   // Non-Single Batch with Non-BroadcastBias
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 4}, {4, 3}, {2, 3}}, {{2, 3}}, &max_error, attributes, true, true));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 4}, {4, 3}, {2, 3}}, {{2, 3}}, &max_error,
+                                                           attributes, true, true));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -571,18 +535,16 @@ void RunGemmGradTests(const OpDef& op_def) {
 
   // TransA and TransB
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{4, 2}, {3, 4}, {3}}, {{2, 3}}, &max_error,
-                                                           {MakeAttribute("transA", int64_t(1)),
-                                                            MakeAttribute("transB", int64_t(1))},
-                                                           true, true));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {{4, 2}, {3, 4}, {3}}, {{2, 3}}, &max_error,
+        {MakeAttribute("transA", int64_t(1)), MakeAttribute("transB", int64_t(1))}, true, true));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
   // alpha and beta + no_broadcast
   {
     ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 4}, {4, 3}, {2, 3}}, {{2, 3}}, &max_error,
-                                                           {MakeAttribute("alpha", 0.7f),
-                                                            MakeAttribute("beta", 5.0f)},
+                                                           {MakeAttribute("alpha", 0.7f), MakeAttribute("beta", 5.0f)},
                                                            true, true));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
@@ -590,8 +552,7 @@ void RunGemmGradTests(const OpDef& op_def) {
   // alpha and beta + broadcast
   {
     ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 4}, {4, 3}, {3}}, {{2, 3}}, &max_error,
-                                                           {MakeAttribute("alpha", 0.7f),
-                                                            MakeAttribute("beta", 5.0f)},
+                                                           {MakeAttribute("alpha", 0.7f), MakeAttribute("beta", 5.0f)},
                                                            true, true));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
@@ -681,8 +642,8 @@ TEST(GradientCheckerTest, CastGrad) {
     GradientChecker<float, float, float> gradient_checker;
     OpDef op_def{"Cast"};
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {shape}, {shape}, &max_error,
-                                                           {MakeAttribute("to", int64_t(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT))}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {shape}, {shape}, &max_error, {MakeAttribute("to", int64_t(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT))}));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 }
@@ -697,7 +658,7 @@ TEST(GradientCheckerTest, SplitGrad) {
                                                          {MakeAttribute("axis", int64_t(0))}));
   EXPECT_IS_TINY(max_error);
 
-  //opset13 test
+  // opset13 test
   OpDef op_def_13{"Split", kOnnxDomain, 13};
   ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def_13, {shape}, {{3, 5}, {3, 5}, {3, 5}}, &max_error,
                                                          {MakeAttribute("axis", int64_t(0))}));
@@ -733,7 +694,7 @@ TEST(GradientCheckerTest, MaxPoolGrad) {
   GradientChecker<float, float, float> gradient_checker;
   OpDef op_def{"MaxPool"};
   constexpr float error_tolerance = 1e-3f;
-  //maxpool_1d_default
+  // maxpool_1d_default
   {
     ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 2, 9}}, {{2, 2, 8}}, &max_error,
                                                            GetRandomValuesForMaxPool<float>({{2, 2, 9}}),
@@ -741,7 +702,7 @@ TEST(GradientCheckerTest, MaxPoolGrad) {
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
-  //maxpool_2d_default
+  // maxpool_2d_default
   {
     ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 3, 5, 5}}, {{2, 3, 4, 4}}, &max_error,
                                                            GetRandomValuesForMaxPool<float>({{2, 3, 5, 5}}),
@@ -759,7 +720,7 @@ TEST(GradientCheckerTest, MaxPoolGrad) {
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
-  //maxpool_2d_strides
+  // maxpool_2d_strides
   {
     ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{1, 1, 32, 32}}, {{1, 1, 10, 10}}, &max_error,
                                                            GetRandomValuesForMaxPool<float>({{1, 1, 32, 32}}),
@@ -768,11 +729,11 @@ TEST(GradientCheckerTest, MaxPoolGrad) {
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
-  //maxpool_3d_default
+  // maxpool_3d_default
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 1, 3, 3, 3}}, {{2, 1, 2, 2, 2}}, &max_error,
-                                                           GetRandomValuesForMaxPool<float>({{2, 1, 3, 3, 3}}),
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{2, 2, 2})}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {{2, 1, 3, 3, 3}}, {{2, 1, 2, 2, 2}}, &max_error, GetRandomValuesForMaxPool<float>({{2, 1, 3, 3, 3}}),
+        {MakeAttribute("kernel_shape", std::vector<int64_t>{2, 2, 2})}));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 }
@@ -783,7 +744,7 @@ TEST(GradientCheckerTest, GlobalAveragePoolGrad) {
   OpDef op_def{"GlobalAveragePool"};
   constexpr float error_tolerance = 1e-3f;
 
-  //globalaveragepool
+  // globalaveragepool
   {
     ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 3, 5, 5}}, {{2, 3, 1, 1}}, &max_error, {},
                                                            /*check_not_have_gradient*/ true,
@@ -791,7 +752,7 @@ TEST(GradientCheckerTest, GlobalAveragePoolGrad) {
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
-  //globalaveragepool_precomputed
+  // globalaveragepool_precomputed
   {
     ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 1, 3, 3}}, {{2, 1, 1, 1}}, &max_error, {},
                                                            /*check_not_have_gradient*/ true,
@@ -814,14 +775,12 @@ void ConvGradientCheckerTest(std::vector<std::unique_ptr<IExecutionProvider>>* e
     TensorShape w_shape({2, 2, 3});
     TensorShape b_shape({2});
     TensorShape y_shape({2, 2, 5});
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{3}),
-                                                            MakeAttribute("pads", std::vector<int64_t>{1, 1})},
-                                                           // TODO: ConvGrad does not handle the case where W does not have gradient.
-                                                           // Check for not has_gradient need to be disabled to pass this test.
-                                                           false,
-                                                           false,
-                                                           execution_providers));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
+        {MakeAttribute("kernel_shape", std::vector<int64_t>{3}), MakeAttribute("pads", std::vector<int64_t>{1, 1})},
+        // TODO: ConvGrad does not handle the case where W does not have gradient.
+        // The check for not has_gradient needs to be disabled to pass this test.
+        false, false, execution_providers));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -831,15 +790,13 @@ void ConvGradientCheckerTest(std::vector<std::unique_ptr<IExecutionProvider>>* e
     TensorShape w_shape({1, 1, 3});
     TensorShape b_shape({1});
     TensorShape y_shape({2, 1, 4});
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{3}),
-                                                            MakeAttribute("pads", std::vector<int64_t>{1, 1}),
-                                                            MakeAttribute("strides", std::vector<int64_t>{2})},
-                                                           // TODO: ConvGrad does not handle the case where W does not have gradient.
-                                                           // Check for not has_gradient need to be disabled to pass this test.
-                                                           false,
-                                                           false,
-                                                           execution_providers));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
+        {MakeAttribute("kernel_shape", std::vector<int64_t>{3}), MakeAttribute("pads", std::vector<int64_t>{1, 1}),
+         MakeAttribute("strides", std::vector<int64_t>{2})},
+        // TODO: ConvGrad does not handle the case where W does not have gradient.
+        // The check for not has_gradient needs to be disabled to pass this test.
+        false, false, execution_providers));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -849,14 +806,12 @@ void ConvGradientCheckerTest(std::vector<std::unique_ptr<IExecutionProvider>>* e
     TensorShape w_shape({1, 1, 1});
     TensorShape b_shape({1});
     TensorShape y_shape({2, 1, 7});
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{1}),
-                                                            MakeAttribute("pads", std::vector<int64_t>{1, 1})},
-                                                           // TODO: ConvGrad does not handle the case where W does not have gradient.
-                                                           // Check for not has_gradient need to be disabled to pass this test.
-                                                           false,
-                                                           false,
-                                                           execution_providers));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
+        {MakeAttribute("kernel_shape", std::vector<int64_t>{1}), MakeAttribute("pads", std::vector<int64_t>{1, 1})},
+        // TODO: ConvGrad does not handle the case where W does not have gradient.
+        // The check for not has_gradient needs to be disabled to pass this test.
+        false, false, execution_providers));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -866,14 +821,12 @@ void ConvGradientCheckerTest(std::vector<std::unique_ptr<IExecutionProvider>>* e
     TensorShape w_shape({1, 1, 1});
     TensorShape b_shape({1});
     TensorShape y_shape({2, 1, 5});
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{1}),
-                                                            MakeAttribute("pads", std::vector<int64_t>{0, 0})},
-                                                           // TODO: ConvGrad does not handle the case where W does not have gradient.
-                                                           // Check for not has_gradient need to be disabled to pass this test.
-                                                           false,
-                                                           false,
-                                                           execution_providers));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
+        {MakeAttribute("kernel_shape", std::vector<int64_t>{1}), MakeAttribute("pads", std::vector<int64_t>{0, 0})},
+        // TODO: ConvGrad does not handle the case where W does not have gradient.
+        // The check for not has_gradient needs to be disabled to pass this test.
+        false, false, execution_providers));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -883,14 +836,13 @@ void ConvGradientCheckerTest(std::vector<std::unique_ptr<IExecutionProvider>>* e
     TensorShape w_shape({1, 1, 3, 3});
     TensorShape b_shape({1});
     TensorShape y_shape({1, 1, 3, 3});
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3}),
-                                                            MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1})},
-                                                           // TODO: ConvGrad does not handle the case where W does not have gradient.
-                                                           // Check for not has_gradient need to be disabled to pass this test.
-                                                           false,
-                                                           false,
-                                                           execution_providers));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
+                                              {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3}),
+                                               MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1})},
+                                              // TODO: ConvGrad does not handle the case where W does not have gradient.
+                                              // The check for not has_gradient needs to be disabled to pass this test.
+                                              false, false, execution_providers));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -900,14 +852,13 @@ void ConvGradientCheckerTest(std::vector<std::unique_ptr<IExecutionProvider>>* e
     TensorShape w_shape({1, 1, 3, 3});
     TensorShape b_shape({1});
     TensorShape y_shape({2, 1, 5, 5});
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3}),
-                                                            MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1})},
-                                                           // TODO: ConvGrad does not handle the case where W does not have gradient.
-                                                           // Check for not has_gradient need to be disabled to pass this test.
-                                                           false,
-                                                           false,
-                                                           execution_providers));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
+                                              {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3}),
+                                               MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1})},
+                                              // TODO: ConvGrad does not handle the case where W does not have gradient.
+                                              // The check for not has_gradient needs to be disabled to pass this test.
+                                              false, false, execution_providers));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -917,14 +868,13 @@ void ConvGradientCheckerTest(std::vector<std::unique_ptr<IExecutionProvider>>* e
     TensorShape w_shape({1, 1, 1, 1});
     TensorShape b_shape({1});
     TensorShape y_shape({1, 1, 3, 3});
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{1, 1}),
-                                                            MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1})},
-                                                           // TODO: ConvGrad does not handle the case where W does not have gradient.
-                                                           // Check for not has_gradient need to be disabled to pass this test.
-                                                           false,
-                                                           false,
-                                                           execution_providers));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
+                                              {MakeAttribute("kernel_shape", std::vector<int64_t>{1, 1}),
+                                               MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1})},
+                                              // TODO: ConvGrad does not handle the case where W does not have gradient.
+                                              // The check for not has_gradient needs to be disabled to pass this test.
+                                              false, false, execution_providers));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -934,14 +884,13 @@ void ConvGradientCheckerTest(std::vector<std::unique_ptr<IExecutionProvider>>* e
     TensorShape w_shape({1, 1, 1, 1});
     TensorShape b_shape({1});
     TensorShape y_shape({1, 1, 1, 1});
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{1, 1}),
-                                                            MakeAttribute("pads", std::vector<int64_t>{0, 0, 0, 0})},
-                                                           // TODO: ConvGrad does not handle the case where W does not have gradient.
-                                                           // Check for not has_gradient need to be disabled to pass this test.
-                                                           false,
-                                                           false,
-                                                           execution_providers));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
+                                              {MakeAttribute("kernel_shape", std::vector<int64_t>{1, 1}),
+                                               MakeAttribute("pads", std::vector<int64_t>{0, 0, 0, 0})},
+                                              // TODO: ConvGrad does not handle the case where W does not have gradient.
+                                              // The check for not has_gradient needs to be disabled to pass this test.
+                                              false, false, execution_providers));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -951,15 +900,13 @@ void ConvGradientCheckerTest(std::vector<std::unique_ptr<IExecutionProvider>>* e
     TensorShape w_shape({1, 1, 3, 3});
     TensorShape b_shape({1});
     TensorShape y_shape({2, 1, 4, 3});
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3}),
-                                                            MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1}),
-                                                            MakeAttribute("strides", std::vector<int64_t>{2, 2})},
-                                                           // TODO: ConvGrad does not handle the case where W does not have gradient.
-                                                           // Check for not has_gradient need to be disabled to pass this test.
-                                                           false,
-                                                           false,
-                                                           execution_providers));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
+        {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3}),
+         MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1}), MakeAttribute("strides", std::vector<int64_t>{2, 2})},
+        // TODO: ConvGrad does not handle the case where W does not have gradient.
+        // The check for not has_gradient needs to be disabled to pass this test.
+        false, false, execution_providers));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -969,15 +916,14 @@ void ConvGradientCheckerTest(std::vector<std::unique_ptr<IExecutionProvider>>* e
     TensorShape w_shape({1, 1, 3, 3});
     TensorShape b_shape({1});
     TensorShape y_shape({2, 1, 1, 1});
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3}),
-                                                            MakeAttribute("pads", std::vector<int64_t>{0, 0, 0, 0}),
-                                                            MakeAttribute("dilations", std::vector<int64_t>{2, 2})},
-                                                           // TODO: ConvGrad does not handle the case where W does not have gradient.
-                                                           // Check for not has_gradient need to be disabled to pass this test.
-                                                           false,
-                                                           false,
-                                                           execution_providers));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
+                                              {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3}),
+                                               MakeAttribute("pads", std::vector<int64_t>{0, 0, 0, 0}),
+                                               MakeAttribute("dilations", std::vector<int64_t>{2, 2})},
+                                              // TODO: ConvGrad does not handle the case where W does not have gradient.
+                                              // The check for not has_gradient needs to be disabled to pass this test.
+                                              false, false, execution_providers));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -987,15 +933,14 @@ void ConvGradientCheckerTest(std::vector<std::unique_ptr<IExecutionProvider>>* e
     TensorShape w_shape({1, 1, 3, 3});
     TensorShape b_shape({1});
     TensorShape y_shape({2, 1, 5, 3});
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3}),
-                                                            MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1}),
-                                                            MakeAttribute("dilations", std::vector<int64_t>{2, 2})},
-                                                           // TODO: ConvGrad does not handle the case where W does not have gradient.
-                                                           // Check for not has_gradient need to be disabled to pass this test.
-                                                           false,
-                                                           false,
-                                                           execution_providers));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
+                                              {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3}),
+                                               MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1}),
+                                               MakeAttribute("dilations", std::vector<int64_t>{2, 2})},
+                                              // TODO: ConvGrad does not handle the case where W does not have gradient.
+                                              // The check for not has_gradient needs to be disabled to pass this test.
+                                              false, false, execution_providers));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -1005,14 +950,13 @@ void ConvGradientCheckerTest(std::vector<std::unique_ptr<IExecutionProvider>>* e
     TensorShape w_shape({1, 1, 3, 3, 3});
     TensorShape b_shape({1});
     TensorShape y_shape({2, 1, 5, 5, 5});
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3, 3}),
-                                                            MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1, 1, 1})},
-                                                           // TODO: ConvGrad does not handle the case where W does not have gradient.
-                                                           // Check for not has_gradient need to be disabled to pass this test.
-                                                           false,
-                                                           false,
-                                                           execution_providers));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
+                                              {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3, 3}),
+                                               MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1, 1, 1})},
+                                              // TODO: ConvGrad does not handle the case where W does not have gradient.
+                                              // The check for not has_gradient needs to be disabled to pass this test.
+                                              false, false, execution_providers));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -1022,15 +966,14 @@ void ConvGradientCheckerTest(std::vector<std::unique_ptr<IExecutionProvider>>* e
     TensorShape w_shape({1, 1, 3, 3, 3});
     TensorShape b_shape({1});
     TensorShape y_shape({2, 1, 4, 3, 3});
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3, 3}),
-                                                            MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1, 1, 1}),
-                                                            MakeAttribute("strides", std::vector<int64_t>{2, 2, 2})},
-                                                           // TODO: ConvGrad does not handle the case where W does not have gradient.
-                                                           // Check for not has_gradient need to be disabled to pass this test.
-                                                           false,
-                                                           false,
-                                                           execution_providers));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
+                                              {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3, 3}),
+                                               MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1, 1, 1}),
+                                               MakeAttribute("strides", std::vector<int64_t>{2, 2, 2})},
+                                              // TODO: ConvGrad does not handle the case where W does not have gradient.
+                                              // The check for not has_gradient needs to be disabled to pass this test.
+                                              false, false, execution_providers));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 }
@@ -1051,77 +994,70 @@ TEST(GradientCheckerTest, ConvGrad) {
   ConvGradientCheckerTest(&execution_providers);
 }
 
-static void TestConcatOpGrad(const std::string& op_type,
-                             const std::string& domain = kOnnxDomain,
-                             int opset_version = 9,
+static void TestConcatOpGrad(const std::string& op_type, const std::string& domain = kOnnxDomain, int opset_version = 9,
                              bool check_not_have_shape_inferencing = false) {
   float max_error;
   GradientChecker<float, float, float> gradient_checker;
   const bool extra_input = op_type == "ConcatTraining";
   OpDef op_def{op_type, domain, opset_version};
 
-  //concat_1d
+  // concat_1d
   {
     TensorShape x_shape({2});
     TensorShape y_shape({6});
     std::vector<TensorInfo> output = {y_shape};
     if (extra_input) output.push_back(TensorInfo({3}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>()));
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape, x_shape, x_shape},
-                                                           output, &max_error,
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape, x_shape, x_shape}, output, &max_error,
                                                            {MakeAttribute("axis", int64_t(0))}, true,
                                                            check_not_have_shape_inferencing));
     EXPECT_IS_TINY(max_error);
   }
 
-  //concat_2d
+  // concat_2d
   {
     TensorShape x_shape({2, 2});
     TensorShape y_shape({2, 6});
     std::vector<TensorInfo> output = {y_shape};
     if (extra_input) output.push_back(TensorInfo({3}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>()));
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape, x_shape, x_shape},
-                                                           output, &max_error,
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape, x_shape, x_shape}, output, &max_error,
                                                            {MakeAttribute("axis", int64_t(1))}, true,
                                                            check_not_have_shape_inferencing));
     EXPECT_IS_TINY(max_error);
   }
 
-  //concat_3d
+  // concat_3d
   {
     TensorShape x_shape({1, 2, 3});
     TensorShape y_shape({1, 2, 9});
     std::vector<TensorInfo> output = {y_shape};
     if (extra_input) output.push_back(TensorInfo({3}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>()));
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape, x_shape, x_shape},
-                                                           output, &max_error,
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape, x_shape, x_shape}, output, &max_error,
                                                            {MakeAttribute("axis", int64_t(2))}, true,
                                                            check_not_have_shape_inferencing));
     EXPECT_IS_TINY(max_error);
   }
 
-  //concat_different_shape
+  // concat_different_shape
   {
     TensorShape x1_shape({2, 2});
     TensorShape x2_shape({2, 4});
     TensorShape y_shape({2, 6});
     std::vector<TensorInfo> output = {y_shape};
     if (extra_input) output.push_back(TensorInfo({2}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>()));
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x1_shape, x2_shape},
-                                                           output, &max_error,
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x1_shape, x2_shape}, output, &max_error,
                                                            {MakeAttribute("axis", int64_t(1))}, true,
                                                            check_not_have_shape_inferencing));
     EXPECT_IS_TINY(max_error);
   }
 
-  //concat_different_shape_and_negative_axis
+  // concat_different_shape_and_negative_axis
   {
     TensorShape x1_shape({2, 2});
     TensorShape x2_shape({2, 4});
     TensorShape y_shape({2, 6});
     std::vector<TensorInfo> output = {y_shape};
     if (extra_input) output.push_back(TensorInfo({2}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>()));
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x1_shape, x2_shape},
-                                                           output, &max_error,
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x1_shape, x2_shape}, output, &max_error,
                                                            {MakeAttribute("axis", int64_t(-1))}, true,
                                                            check_not_have_shape_inferencing));
     EXPECT_IS_TINY(max_error);
@@ -1143,97 +1079,91 @@ void AveragepoolGradientCheckerTest(std::vector<std::unique_ptr<IExecutionProvid
   GradientChecker<float, float, float> gradient_checker;
   OpDef op_def{"AveragePool"};
 
-  //averagepool - 1D
+  // averagepool - 1D
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 3, 8}}, {{2, 3, 4}}, &max_error,
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{2}),
-                                                            MakeAttribute("strides", std::vector<int64_t>{2})},
-                                                           true, false,
-                                                           execution_provider));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {{2, 3, 8}}, {{2, 3, 4}}, &max_error,
+        {MakeAttribute("kernel_shape", std::vector<int64_t>{2}), MakeAttribute("strides", std::vector<int64_t>{2})},
+        true, false, execution_provider));
 
     EXPECT_IS_TINY(max_error);
   }
 
-  //averagepool - 2D
+  // averagepool - 2D
   {
     ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 3, 8, 8}}, {{2, 3, 7, 7}}, &max_error,
                                                            {MakeAttribute("kernel_shape", std::vector<int64_t>{2, 2}),
                                                             MakeAttribute("strides", std::vector<int64_t>{1, 1})},
-                                                           true, false,
-                                                           execution_provider));
+                                                           true, false, execution_provider));
 
     EXPECT_IS_TINY(max_error);
   }
 
-  //averagepool - 3D
+  // averagepool - 3D
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 3, 8, 8, 8}}, {{2, 3, 4, 4, 4}}, &max_error,
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{2, 2, 2}),
-                                                            MakeAttribute("strides", std::vector<int64_t>{2, 2, 2})},
-                                                           true, false,
-                                                           execution_provider));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {{2, 3, 8, 8, 8}}, {{2, 3, 4, 4, 4}}, &max_error,
+                                              {MakeAttribute("kernel_shape", std::vector<int64_t>{2, 2, 2}),
+                                               MakeAttribute("strides", std::vector<int64_t>{2, 2, 2})},
+                                              true, false, execution_provider));
 
     EXPECT_IS_TINY(max_error);
   }
 
-  //averagepool - 1D - With padding
+  // averagepool - 1D - With padding
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{1, 3, 8}}, {{1, 3, 3}}, &max_error,
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{3}),
-                                                            MakeAttribute("strides", std::vector<int64_t>{3}),
-                                                            MakeAttribute("pads", std::vector<int64_t>{1, 0})},
-                                                           true, false,
-                                                           execution_provider));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {{1, 3, 8}}, {{1, 3, 3}}, &max_error,
+        {MakeAttribute("kernel_shape", std::vector<int64_t>{3}), MakeAttribute("strides", std::vector<int64_t>{3}),
+         MakeAttribute("pads", std::vector<int64_t>{1, 0})},
+        true, false, execution_provider));
 
     EXPECT_IS_TINY(max_error);
   }
 
   // averagepool - 2D - With padding - include pads
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{1, 3, 7, 8}}, {{1, 3, 3, 4}}, &max_error,
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 2}),
-                                                            MakeAttribute("strides", std::vector<int64_t>{3, 2}),
-                                                            MakeAttribute("pads", std::vector<int64_t>{1, 0, 1, 0}),
-                                                            MakeAttribute("count_include_pad", int64_t(1))},
-                                                           true, false,
-                                                           execution_provider));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {{1, 3, 7, 8}}, {{1, 3, 3, 4}}, &max_error,
+        {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 2}),
+         MakeAttribute("strides", std::vector<int64_t>{3, 2}), MakeAttribute("pads", std::vector<int64_t>{1, 0, 1, 0}),
+         MakeAttribute("count_include_pad", int64_t(1))},
+        true, false, execution_provider));
 
     EXPECT_IS_TINY(max_error);
   }
 
   // averagepool - 2D - With padding - exclude pads
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{1, 3, 7, 7}}, {{1, 3, 3, 3}}, &max_error,
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3}),
-                                                            MakeAttribute("strides", std::vector<int64_t>{3, 3}),
-                                                            MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1})},
-                                                           true, false,
-                                                           execution_provider));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {{1, 3, 7, 7}}, {{1, 3, 3, 3}}, &max_error,
+        {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3}),
+         MakeAttribute("strides", std::vector<int64_t>{3, 3}), MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1})},
+        true, false, execution_provider));
 
     EXPECT_IS_TINY(max_error);
   }
 
-  //averagepool - 3D - With padding
+  // averagepool - 3D - With padding
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{1, 3, 8, 8, 8}}, {{1, 3, 3, 3, 3}}, &max_error,
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3, 3}),
-                                                            MakeAttribute("strides", std::vector<int64_t>{3, 3, 3}),
-                                                            MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 0, 0, 0})},
-                                                           true, false,
-                                                           execution_provider));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {{1, 3, 8, 8, 8}}, {{1, 3, 3, 3, 3}}, &max_error,
+                                              {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3, 3}),
+                                               MakeAttribute("strides", std::vector<int64_t>{3, 3, 3}),
+                                               MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 0, 0, 0})},
+                                              true, false, execution_provider));
 
     EXPECT_IS_TINY(max_error);
   }
 
-  //averagepool - 3D - With padding- exclude pads
+  // averagepool - 3D - With padding - include pads
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{1, 4, 7, 7, 7}}, {{1, 4, 3, 3, 3}}, &max_error,
-                                                           {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3, 3}),
-                                                            MakeAttribute("strides", std::vector<int64_t>{3, 3, 3}),
-                                                            MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1, 1, 1}),
-                                                            MakeAttribute("count_include_pad", int64_t(1))},
-                                                           true, false,
-                                                           execution_provider));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {{1, 4, 7, 7, 7}}, {{1, 4, 3, 3, 3}}, &max_error,
+        {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3, 3}),
+         MakeAttribute("strides", std::vector<int64_t>{3, 3, 3}),
+         MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1, 1, 1}), MakeAttribute("count_include_pad", int64_t(1))},
+        true, false, execution_provider));
 
     EXPECT_IS_TINY(max_error);
   }
@@ -1246,7 +1176,7 @@ TEST(GradientCheckerTest, AveragePoolGrad) {
   std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
   execution_providers.push_back(DefaultDnnlExecutionProvider());
   AveragepoolGradientCheckerTest(&execution_providers);
-#endif  //USE_DNNL
+#endif  // USE_DNNL
 }
 
 TEST(GradientCheckerTest, TransposeGrad) {
@@ -1260,8 +1190,8 @@ TEST(GradientCheckerTest, TransposeGrad) {
     TensorShape x_shape({2, 3, 4});
     TensorShape y_shape({4, 3, 2});
     const std::vector<ONNX_NAMESPACE::AttributeProto> attributes = {};
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape}, {y_shape}, &max_error,
-                                                           attributes, true, true /*also test w/o shape inferencing */));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape}, {y_shape}, &max_error, attributes, true,
+                                                           true /*also test w/o shape inferencing */));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -1270,7 +1200,8 @@ TEST(GradientCheckerTest, TransposeGrad) {
     TensorShape x_shape({2, 3, 4});
     TensorShape y_shape({2, 3, 4});
     std::vector<int64_t> perm{0, 1, 2};
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape}, {y_shape}, &max_error, {MakeAttribute("perm", perm)}));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_shape}, {y_shape}, &max_error, {MakeAttribute("perm", perm)}));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -1279,7 +1210,8 @@ TEST(GradientCheckerTest, TransposeGrad) {
     TensorShape x_shape({2, 3, 4});
     TensorShape y_shape({2, 4, 3});
     std::vector<int64_t> perm{0, 2, 1};
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape}, {y_shape}, &max_error, {MakeAttribute("perm", perm)}));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_shape}, {y_shape}, &max_error, {MakeAttribute("perm", perm)}));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -1288,7 +1220,8 @@ TEST(GradientCheckerTest, TransposeGrad) {
     TensorShape x_shape({2, 3, 4});
     TensorShape y_shape({3, 2, 4});
     std::vector<int64_t> perm{1, 0, 2};
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape}, {y_shape}, &max_error, {MakeAttribute("perm", perm)}));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_shape}, {y_shape}, &max_error, {MakeAttribute("perm", perm)}));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -1297,7 +1230,8 @@ TEST(GradientCheckerTest, TransposeGrad) {
     TensorShape x_shape({2, 3, 4});
     TensorShape y_shape({3, 4, 2});
     std::vector<int64_t> perm{1, 2, 0};
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape}, {y_shape}, &max_error, {MakeAttribute("perm", perm)}));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_shape}, {y_shape}, &max_error, {MakeAttribute("perm", perm)}));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -1306,7 +1240,8 @@ TEST(GradientCheckerTest, TransposeGrad) {
     TensorShape x_shape({2, 3, 4});
     TensorShape y_shape({4, 2, 3});
     std::vector<int64_t> perm{2, 0, 1};
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape}, {y_shape}, &max_error, {MakeAttribute("perm", perm)}));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_shape}, {y_shape}, &max_error, {MakeAttribute("perm", perm)}));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -1315,16 +1250,15 @@ TEST(GradientCheckerTest, TransposeGrad) {
     TensorShape x_shape({2, 3, 4});
     TensorShape y_shape({4, 3, 2});
     std::vector<int64_t> perm{2, 1, 0};
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_shape}, {y_shape}, &max_error, {MakeAttribute("perm", perm)}));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_shape}, {y_shape}, &max_error, {MakeAttribute("perm", perm)}));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 }
 
-static void RunSqueezeUnsqueezeTests(const OpDef& op_def,
-                                     std::vector<std::vector<int64_t>> x_shapes,
+static void RunSqueezeUnsqueezeTests(const OpDef& op_def, std::vector<std::vector<int64_t>> x_shapes,
                                      std::vector<std::vector<int64_t>> y_shapes,
-                                     std::vector<std::vector<int64_t>> axes_ip,
-                                     bool axes_input = false) {
+                                     std::vector<std::vector<int64_t>> axes_ip, bool axes_input = false) {
   float max_error;
   GradientChecker<float, float, float> gradient_checker;
   float error_tolerance = 1e-3f;
@@ -1341,14 +1275,16 @@ static void RunSqueezeUnsqueezeTests(const OpDef& op_def,
 
     // Test case w/o axes attribute/input, only valid for Squeeze Op.
     if (op_def.type == "Squeeze") {
-      ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, input, {y_shape}, &max_error, x_datas, attributes));
+      ASSERT_STATUS_OK(
+          gradient_checker.ComputeGradientError(op_def, input, {y_shape}, &max_error, x_datas, attributes));
       EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
     }
 
     // test case w/ axes attribute/input.
     if (axes_input) {
       std::vector<float> axes_float;
-      std::transform(begin(axes), end(axes), std::back_inserter(axes_float), [](int64_t i) { return static_cast<float>(i); });
+      std::transform(begin(axes), end(axes), std::back_inserter(axes_float),
+                     [](int64_t i) { return static_cast<float>(i); });
       TensorInfo axes_info({static_cast<int64_t>(axes.size())}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>());
       input.push_back(axes_info);
       x_datas.push_back(axes_float);
@@ -1367,31 +1303,22 @@ TEST(GradientCheckerTest, SqueezeGrad) {
     TensorShape y_shape({2, 3});
   */
   std::vector<std::vector<int64_t>> x_shapes = {
-      {1, 2, 3, 1},
-      {1, 1, 2, 3, 4},
-      {1, 2, 1, 3, 1},
-      {1, 2, 1, 3, 1},
+      {1, 2, 3, 1}, {1, 1, 2, 3, 2}, {1, 2, 1, 3, 1}, {1, 2, 1, 3, 1},
       // {1, 2, 1, 3, 1},
   };
   std::vector<std::vector<int64_t>> y_shapes = {
-      {2, 3},
-      {2, 3, 4},
-      {2, 3},
-      {1, 2, 3, 1},
+      {2, 3}, {2, 3, 2}, {2, 3}, {1, 2, 3, 1},
       // {2, 3},
   };
   std::vector<std::vector<int64_t>> axes_ip = {
-      {0, 3},
-      {0, 1},
-      {0, 2, 4},
-      {2},
+      {0, 3}, {0, 1}, {0, 2, 4}, {2},
       // {}
   };
 
   OpDef op_def{"Squeeze"};
   RunSqueezeUnsqueezeTests(op_def, x_shapes, y_shapes, axes_ip);
 
-  //axes as input from opset 13
+  // axes as input from opset 13
   OpDef op_def_2{"Squeeze", kOnnxDomain, 13};
   RunSqueezeUnsqueezeTests(op_def_2, x_shapes, y_shapes, axes_ip, true);
 }
@@ -1416,7 +1343,7 @@ TEST(GradientCheckerTest, UnsqueezeGrad) {
   OpDef op_def{"Unsqueeze"};
   RunSqueezeUnsqueezeTests(op_def, x_shapes, y_shapes, axes_ip);
 
-  //axes as input from opset 13
+  // axes as input from opset 13
   OpDef op_def_2{"Unsqueeze", kOnnxDomain, 13};
   RunSqueezeUnsqueezeTests(op_def_2, x_shapes, y_shapes, axes_ip, true);
 }
@@ -1452,8 +1379,10 @@ TEST(GradientCheckerTest, DISABLED_BatchNormalizationGrad) {
     TensorInfo saved_mean_info(channel_shape, false);
     TensorInfo saved_var_info(channel_shape, false);
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, scale_info, bias_info, mean_info, var_info}, {y_info, running_mean_info, running_var_info, saved_mean_info, saved_var_info}, &max_error,
-                                                           {MakeAttribute("epsilon", epsilon), MakeAttribute("momentum", momentum)}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {x_info, scale_info, bias_info, mean_info, var_info},
+        {y_info, running_mean_info, running_var_info, saved_mean_info, saved_var_info}, &max_error,
+        {MakeAttribute("epsilon", epsilon), MakeAttribute("momentum", momentum)}));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -1475,8 +1404,10 @@ TEST(GradientCheckerTest, DISABLED_BatchNormalizationGrad) {
     TensorInfo saved_mean_info(channel_shape, false);
     TensorInfo saved_var_info(channel_shape, false);
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, scale_info, bias_info, mean_info, var_info}, {y_info, running_mean_info, running_var_info, saved_mean_info, saved_var_info}, &max_error,
-                                                           {MakeAttribute("epsilon", epsilon), MakeAttribute("momentum", momentum)}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {x_info, scale_info, bias_info, mean_info, var_info},
+        {y_info, running_mean_info, running_var_info, saved_mean_info, saved_var_info}, &max_error,
+        {MakeAttribute("epsilon", epsilon), MakeAttribute("momentum", momentum)}));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -1498,8 +1429,10 @@ TEST(GradientCheckerTest, DISABLED_BatchNormalizationGrad) {
     TensorInfo saved_mean_info(channel_shape, false);
     TensorInfo saved_var_info(channel_shape, false);
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, scale_info, bias_info, mean_info, var_info}, {y_info, running_mean_info, running_var_info, saved_mean_info, saved_var_info}, &max_error,
-                                                           {MakeAttribute("epsilon", epsilon), MakeAttribute("momentum", momentum)}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {x_info, scale_info, bias_info, mean_info, var_info},
+        {y_info, running_mean_info, running_var_info, saved_mean_info, saved_var_info}, &max_error,
+        {MakeAttribute("epsilon", epsilon), MakeAttribute("momentum", momentum)}));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -1521,8 +1454,10 @@ TEST(GradientCheckerTest, DISABLED_BatchNormalizationGrad) {
     TensorInfo saved_mean_info(channel_shape, false);
     TensorInfo saved_var_info(channel_shape, false);
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, scale_info, bias_info, mean_info, var_info}, {y_info, running_mean_info, running_var_info, saved_mean_info, saved_var_info}, &max_error,
-                                                           {MakeAttribute("momentum", momentum)}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {x_info, scale_info, bias_info, mean_info, var_info},
+        {y_info, running_mean_info, running_var_info, saved_mean_info, saved_var_info}, &max_error,
+        {MakeAttribute("momentum", momentum)}));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -1544,8 +1479,10 @@ TEST(GradientCheckerTest, DISABLED_BatchNormalizationGrad) {
     TensorInfo saved_mean_info(channel_shape, false);
     TensorInfo saved_var_info(channel_shape, false);
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, scale_info, bias_info, mean_info, var_info}, {y_info, running_mean_info, running_var_info, saved_mean_info, saved_var_info}, &max_error,
-                                                           {MakeAttribute("epsilon", epsilon), MakeAttribute("momentum", momentum)}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {x_info, scale_info, bias_info, mean_info, var_info},
+        {y_info, running_mean_info, running_var_info, saved_mean_info, saved_var_info}, &max_error,
+        {MakeAttribute("epsilon", epsilon), MakeAttribute("momentum", momentum)}));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
@@ -1567,20 +1504,18 @@ TEST(GradientCheckerTest, DISABLED_BatchNormalizationGrad) {
     TensorInfo saved_mean_info(channel_shape, false);
     TensorInfo saved_var_info(channel_shape, false);
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, scale_info, bias_info, mean_info, var_info}, {y_info, running_mean_info, running_var_info, saved_mean_info, saved_var_info}, &max_error,
-                                                           {MakeAttribute("epsilon", epsilon), MakeAttribute("momentum", momentum)}));
-    EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, scale_info, bias_info, mean_info, var_info},
+  {y_info, running_mean_info, running_var_info, saved_mean_info, saved_var_info}, &max_error, {MakeAttribute("epsilon",
+  epsilon), MakeAttribute("momentum", momentum)})); EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
   */
 }
 #endif
 
-TEST(GradientCheckerTest, SigmoidGrad) {
-  UnaryOpGradientTest("Sigmoid");
-}
+TEST(GradientCheckerTest, SigmoidGrad) { UnaryOpGradientTest("Sigmoid"); }
 
 void GradientCheckerSoftmaxGradHelper(bool is_log_softmax, int version = 11) {
-  TensorShape shape({3, 4, 5});
+  TensorShape shape({2, 3, 4});
   float max_error;
   GradientChecker<float, float, float> gradient_checker;
 
@@ -1595,19 +1530,22 @@ void GradientCheckerSoftmaxGradHelper(bool is_log_softmax, int version = 11) {
 
   // axis=0
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {shape}, {shape}, &max_error, {MakeAttribute("axis", int64_t(0))}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {shape}, {shape}, &max_error,
+                                                           {MakeAttribute("axis", int64_t(0))}));
     EXPECT_IS_TINY(max_error);
   }
 
   // axis=1
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {shape}, {shape}, &max_error, {MakeAttribute("axis", int64_t(1))}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {shape}, {shape}, &max_error,
+                                                           {MakeAttribute("axis", int64_t(1))}));
     EXPECT_IS_TINY(max_error);
   }
 
   // axis=2
   {
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {shape}, {shape}, &max_error, {MakeAttribute("axis", int64_t(2))}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {shape}, {shape}, &max_error,
+                                                           {MakeAttribute("axis", int64_t(2))}));
     EXPECT_IS_TINY(max_error);
   }
 }
@@ -1659,9 +1597,8 @@ void TestSparseSoftmaxCrossEntropyGrad(const TensorShape& index_shape, const std
     TensorInfo x_info(logit_shape);
     TensorInfo index_info(index_shape, false, &transformer_index, DataTypeImpl::GetTensorType<int64_t>());
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, index_info},
-                                                           {{}, {logit_shape, false}}, &max_error,
-                                                           {MakeAttribute("reduction", reduction)}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, index_info}, {{}, {logit_shape, false}},
+                                                           &max_error, {MakeAttribute("reduction", reduction)}));
     EXPECT_IS_TINY(max_error);
   }
 
@@ -1688,16 +1625,16 @@ TEST(GradientCheckerTest, SparseSoftmaxCrossEntropyGrad) {
   TestSparseSoftmaxCrossEntropyGrad({2, 3, 2}, "sum");
 }
 
-void TestSoftmaxCrossEntropyLossGrad(const TensorShape& index_shape,  //label_shape
-                                     const std::string& reduction,
-                                     int64_t ignore_index = 0,
+void TestSoftmaxCrossEntropyLossGrad(const TensorShape& index_shape,  // label_shape
+                                     const std::string& reduction, int64_t ignore_index = 0,
                                      int64_t D = 2 /* num_class*/) {
   float max_error;
   bool include_ignore_index = false;
   bool insert_ignore_index = false;
   GradientChecker<float, float, float> gradient_checker;
   OpDef op_def{"SoftmaxCrossEntropyLoss", kOnnxDomain, 12};
-  std::function<float(float)> transformer_index = [D, &include_ignore_index, &insert_ignore_index, ignore_index](float x) {
+  std::function<float(float)> transformer_index = [D, &include_ignore_index, &insert_ignore_index,
+                                                   ignore_index](float x) {
     if (include_ignore_index) {
       if (insert_ignore_index) {
         insert_ignore_index = false;
@@ -1768,9 +1705,9 @@ void TestSoftmaxCrossEntropyLossGrad(const TensorShape& index_shape,  //label_sh
     TensorInfo x_info(logit_shape);
     TensorInfo index_info(index_shape, false, &transformer_index, DataTypeImpl::GetTensorType<int64_t>());
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, index_info},
-                                                           {loss_info, {logit_shape, false}}, &max_error,
-                                                           {MakeAttribute("reduction", reduction), MakeAttribute("ignore_index", ignore_index)}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {x_info, index_info}, {loss_info, {logit_shape, false}}, &max_error,
+        {MakeAttribute("reduction", reduction), MakeAttribute("ignore_index", ignore_index)}));
     EXPECT_IS_TINY(max_error);
   }
 
@@ -1789,9 +1726,9 @@ void TestSoftmaxCrossEntropyLossGrad(const TensorShape& index_shape,  //label_sh
     TensorInfo index_info(index_shape, false, &transformer_index, DataTypeImpl::GetTensorType<int64_t>());
     TensorInfo weight_info({logit_shape[1]}, false, &transformer_weight);
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, index_info, weight_info},
-                                                           {loss_info, {logit_shape, false}}, &max_error,
-                                                           {MakeAttribute("reduction", reduction), MakeAttribute("ignore_index", ignore_index)}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {x_info, index_info, weight_info}, {loss_info, {logit_shape, false}}, &max_error,
+        {MakeAttribute("reduction", reduction), MakeAttribute("ignore_index", ignore_index)}));
     EXPECT_IS_TINY(max_error);
   }
 }
@@ -1813,13 +1750,9 @@ TEST(GradientCheckerTest, DISABLED_SoftmaxCrossEntropyLossGrad) {
   TestSoftmaxCrossEntropyLossGrad({2, 3, 2}, "none", -1);
 }
 
-TEST(GradientCheckerTest, GeluGrad) {
-  UnaryOpGradientTest("Gelu", kMSDomain, 1);
-}
+TEST(GradientCheckerTest, GeluGrad) { UnaryOpGradientTest("Gelu", kMSDomain, 1); }
 
-TEST(GradientCheckerTest, FastGeluGrad) {
-  UnaryOpGradientTest("FastGelu", kMSDomain, 1);
-}
+TEST(GradientCheckerTest, FastGeluGrad) { UnaryOpGradientTest("FastGelu", kMSDomain, 1); }
 
 // used for BiasGelu and FastGelu
 void TestBiasGeluGrad(const std::string& op_type, const std::string& domain, int opset_version) {
@@ -1831,28 +1764,23 @@ void TestBiasGeluGrad(const std::string& op_type, const std::string& domain, int
   const std::vector<ONNX_NAMESPACE::AttributeProto> attributes = {};
 
   float max_error;
-  ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
-      op_def, {input_shape, bias_shape}, {input_shape}, &max_error,
-      attributes, true, true));
+  ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {input_shape, bias_shape}, {input_shape}, &max_error,
+                                                         attributes, true, true));
 
   EXPECT_IS_TINY(max_error);
 }
 
-TEST(GradientCheckerTest, FastGeluGrad_Bias) {
-  TestBiasGeluGrad("FastGelu", kMSDomain, 1);
-}
+TEST(GradientCheckerTest, FastGeluGrad_Bias) { TestBiasGeluGrad("FastGelu", kMSDomain, 1); }
 
-TEST(GradientCheckerTest, BiasGeluGrad) {
-  TestBiasGeluGrad("BiasGelu", kMSDomain, 1);
-}
+TEST(GradientCheckerTest, BiasGeluGrad) { TestBiasGeluGrad("BiasGelu", kMSDomain, 1); }
 
 TEST(GradientCheckerTest, GatherGrad) {
   float max_error;
   GradientChecker<float, float, float> gradient_checker;
   OpDef op_def{"Gather"};
 
-  TensorInfo x_info({5, 4, 3, 2});
-  std::function<float(float)> transformer = [](float x) { return std::fmod(7 * std::fabs(x), 5.0f); };
+  TensorInfo x_info({4, 2, 3, 2});
+  std::function<float(float)> transformer = [](float x) { return std::fmod(7 * std::fabs(x), 4.0f); };
 
   // gather_0 without duplicated indices
   {
@@ -1870,7 +1798,7 @@ TEST(GradientCheckerTest, GatherGrad) {
 
   // gather_0 with duplicated indices
   {
-    int num_indices = 10;
+    int num_indices = 8;
     TensorInfo indices_info({num_indices}, false, &transformer, DataTypeImpl::GetTensorType<int64_t>());
 
     TensorShape y_shape{x_info.shape};
@@ -1884,8 +1812,8 @@ TEST(GradientCheckerTest, GatherGrad) {
 
   // gather_1
   {
-    int num_indices = 8;
-    std::function<float(float)> transformer2 = [](float x) { return std::fmod(7 * std::fabs(x), 4.0f); };
+    int num_indices = 3;
+    std::function<float(float)> transformer2 = [](float x) { return std::fmod(7 * std::fabs(x), 2.0f); };
     TensorInfo indices_info({num_indices}, false, &transformer2, DataTypeImpl::GetTensorType<int64_t>());
 
     TensorShape y_shape{x_info.shape};
@@ -1901,7 +1829,7 @@ TEST(GradientCheckerTest, GatherGrad) {
   {
     TensorInfo indices_info({2, 3}, false, &transformer, DataTypeImpl::GetTensorType<int64_t>());
 
-    TensorShape y_shape{2, 3, 4, 3, 2};
+    TensorShape y_shape{2, 3, 2, 3, 2};
 
     ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, indices_info}, {y_shape}, &max_error,
                                                            {MakeAttribute("axis", int64_t(0))}));
@@ -1919,31 +1847,31 @@ TEST(GradientCheckerTest, GatherGrad) {
     int64_t axis = 0;
     y_shape[axis] = 3;
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info_2, indices_info}, {y_shape}, &max_error, x_datas,
-                                                           {MakeAttribute("axis", axis)}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info_2, indices_info}, {y_shape}, &max_error,
+                                                           x_datas, {MakeAttribute("axis", axis)}));
     EXPECT_IS_TINY(max_error);
   }
 }
 
 void TestDropoutOp(float ratio, TensorShape& x_shape, bool default_ratio = true) {
   OpTester test("Dropout", 12, kOnnxDomain, false);
-  if (default_ratio)
-    ratio = 0.5f;
+  if (default_ratio) ratio = 0.5f;
   float input_constant = 3.0f;
   std::vector<float> x_data(x_shape.Size(), input_constant);
   std::vector<float> y_data(x_shape.Size(), 3.0f);
 
   test.AddInput<float>("x", x_shape.AsShapeVector(), x_data);
-  if (!default_ratio)
-    test.AddInput<float>("ratio", {}, {ratio});
+  if (!default_ratio) test.AddInput<float>("ratio", {}, {ratio});
   test.AddOutput<float>("y", x_shape.AsShapeVector(), y_data);
-  test.AddOutput<bool>("mask", x_shape.AsShapeVector(), {true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true});
+  test.AddOutput<bool>(
+      "mask", x_shape.AsShapeVector(),
+      {true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true});
   test.Run();
 
-  //Check output
+  // Check output
   auto fwd_output = test.GetFetches();
   for (size_t idx = 0; idx < x_data.size() / 8; ++idx) {
-    //convert the binary to bool
+    // convert the binary to bool
     if (ratio > 0) {
       std::bitset<8> mask(fwd_output[1].Get<Tensor>().Data<bool>()[idx]);
       for (size_t i = 0; i < 8; ++i) {
@@ -1965,25 +1893,23 @@ void TestDropoutOp(float ratio, TensorShape& x_shape, bool default_ratio = true)
 
 void TestDropoutGradOp(float ratio, TensorShape& x_shape, bool default_ratio = true) {
   OpTester test("DropoutGrad", 1, kMSDomain, true);
-  if (default_ratio)
-    ratio = 0.5;
+  if (default_ratio) ratio = 0.5;
   float input_constant = 3;
 
   std::vector<float> dy_data(x_shape.Size(), input_constant);
   std::vector<float> ratio_data(1, ratio);
 
   float output_constant = input_constant / (1 - ratio);
-  std::vector<float> dx_data({output_constant, output_constant, output_constant, 0,
-                              output_constant, 0, output_constant, 0,
-                              output_constant, 0, output_constant, 0,
-                              output_constant, 0, output_constant, 0});
+  std::vector<float> dx_data({output_constant, output_constant, output_constant, 0, output_constant, 0, output_constant,
+                              0, output_constant, 0, output_constant, 0, output_constant, 0, output_constant, 0});
 
   test.AddInput<float>("dy", x_shape.AsShapeVector(), dy_data);
 
-  test.AddInput<bool>("mask", x_shape.AsShapeVector(), {true, true, true, false,   //
-                                                          true, false, true, false,  //
-                                                          true, false, true, false,  //
-                                                          true, false, true, false});
+  test.AddInput<bool>("mask", x_shape.AsShapeVector(),
+                      {true, true, true, false,   //
+                       true, false, true, false,  //
+                       true, false, true, false,  //
+                       true, false, true, false});
   if (!default_ratio) {
     test.AddInput<float>("ratio", {1}, ratio_data);
   } else {
@@ -2000,22 +1926,22 @@ void TestDropoutGradOp(float ratio, TensorShape& x_shape, bool default_ratio = t
 #ifdef USE_CUDA
 TEST(GradientCheckerTest, DISABLED_Dropout) {
   {
-    //Ratio 0
+    // Ratio 0
     TensorShape x_shape({2, 2, 2, 2});
     TestDropoutOp(0.0f, x_shape, false);
   }
-  //Ratio 0.2, 3D
+  // Ratio 0.2, 3D
   {
     TensorShape x_shape({4, 2, 2});
     TestDropoutOp(0.2f, x_shape, false);
   }
-  //Ratio 0.4, 2D
+  // Ratio 0.4, 2D
   {
     TensorShape x_shape({4, 4});
     TestDropoutOp(0.4f, x_shape, false);
   }
 
-  //Default ratio, 1D
+  // Default ratio, 1D
   {
     TensorShape x_shape({16});
     TestDropoutOp(0.2f, x_shape, true);
@@ -2024,30 +1950,30 @@ TEST(GradientCheckerTest, DISABLED_Dropout) {
 
 TEST(GradientCheckerTest, DISABLED_DropoutGrad) {
   {
-    //Ratio 0
+    // Ratio 0
     TensorShape x_shape({8, 2});
     TestDropoutGradOp(0.0f, x_shape);
   }
 
-  //Ratio 0.2, 1D
+  // Ratio 0.2, 1D
   {
     TensorShape x_shape({16});
     TestDropoutGradOp(0.2f, x_shape, false);
   }
 
-  //Ratio 0.3, 2D
+  // Ratio 0.3, 2D
   {
     TensorShape x_shape({8, 2});
     TestDropoutGradOp(0.3f, x_shape, false);
   }
 
-  //Ratio 0.4, 3D
+  // Ratio 0.4, 3D
   {
     TensorShape x_shape({2, 4, 2});
     TestDropoutGradOp(0.4f, x_shape, false);
   }
 
-  //default Ratio, 4D
+  // Default ratio, 3D
   {
     TensorShape x_shape({2, 4, 2});
     TestDropoutGradOp(0.6f, x_shape);
@@ -2066,7 +1992,8 @@ TEST(GradientCheckerTest, GatherNDGrad_repeat_float_data) {
   TensorInfo y_info({2}, true);
   int64_t batch_dims = 0;
 
-  ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, indice_info}, {y_info}, &max_error, x_datas, {MakeAttribute("batch_dims", batch_dims)}));
+  ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, indice_info}, {y_info}, &max_error, x_datas,
+                                                         {MakeAttribute("batch_dims", batch_dims)}));
   EXPECT_IS_TINY(max_error);
 }
 
@@ -2083,7 +2010,8 @@ TEST(GradientCheckerTest, GatherNDGrad_unique_float_data) {
     TensorInfo y_info({2}, true);
     int64_t batch_dims = 0;
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, indice_info}, {y_info}, &max_error, x_datas, {MakeAttribute("batch_dims", batch_dims)}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, indice_info}, {y_info}, &max_error, x_datas,
+                                                           {MakeAttribute("batch_dims", batch_dims)}));
     EXPECT_IS_TINY(max_error);
   }
 
@@ -2095,7 +2023,8 @@ TEST(GradientCheckerTest, GatherNDGrad_unique_float_data) {
     TensorInfo y_info({2, 3}, true);
     int64_t batch_dims = 1;
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, indice_info}, {y_info}, &max_error, x_datas, {MakeAttribute("batch_dims", batch_dims)}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, indice_info}, {y_info}, &max_error, x_datas,
+                                                           {MakeAttribute("batch_dims", batch_dims)}));
     EXPECT_IS_TINY(max_error);
   }
 
@@ -2107,7 +2036,8 @@ TEST(GradientCheckerTest, GatherNDGrad_unique_float_data) {
     TensorInfo y_info({2, 2}, true);
     int64_t batch_dims = 2;
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, indice_info}, {y_info}, &max_error, x_datas, {MakeAttribute("batch_dims", batch_dims)}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, indice_info}, {y_info}, &max_error, x_datas,
+                                                           {MakeAttribute("batch_dims", batch_dims)}));
     EXPECT_IS_TINY(max_error);
   }
 }
@@ -2126,7 +2056,8 @@ TEST(GradientCheckerTest, LayerNormGrad) {
     float error_tolerance = 1e-2f;
 
     OpDef op_def{"LayerNormalization"};
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, scale_info, B_info}, {shape, mean_info, var_info}, &max_error));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, scale_info, B_info},
+                                                           {shape, mean_info, var_info}, &max_error));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 }
@@ -2143,11 +2074,12 @@ TEST(GradientCheckerTest, SimplifiedLayerNormGrad) {
     float error_tolerance = 1e-2f;
 
     OpDef op_def{"SimplifiedLayerNormalization"};
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, scale_info}, {shape, var_info}, &max_error));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_info, scale_info}, {shape, var_info}, &max_error));
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 }
-#endif  //USE_CUDA
+#endif  // USE_CUDA
 
 TEST(GradientUtilsTest, InPlaceAccumulatorFloat32) {
   OpTester test("InPlaceAccumulator", 1, onnxruntime::kMSDomain);
@@ -2178,7 +2110,7 @@ TEST(GradientUtilsTest, InPlaceAccumulatorFloat16) {
   // Didn't implement mixed precision InPlaceAccumulator in CPU
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCpuExecutionProvider});
 }
-#endif  //defined(USE_CUDA) || defined(USE_ROCM)
+#endif  // defined(USE_CUDA) || defined(USE_ROCM)
 
 TEST(GradientUtilsTest, ZeroGradientFloat32) {
   OpTester test("ZeroGradient", 1, onnxruntime::kMSDomain);
@@ -2226,7 +2158,8 @@ TEST(GradientCheckerTest, WhereGrad) {
   TensorInfo condition_info(shape, false, &transformer, DataTypeImpl::GetTensorType<bool>());
 
   TensorShape output_shape{shape};
-  ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {condition_info, x_info, y_info}, {output_shape}, &max_error));
+  ASSERT_STATUS_OK(
+      gradient_checker.ComputeGradientError(op_def, {condition_info, x_info, y_info}, {output_shape}, &max_error));
   EXPECT_IS_TINY(max_error);
 }
 
@@ -2244,7 +2177,8 @@ TEST(GradientCheckerTest, SliceGrad) {
 
     TensorInfo y_info({1, 3}, true);
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, start_info, end_info}, {y_info}, &max_error, x_datas));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_info, start_info, end_info}, {y_info}, &max_error, x_datas));
     EXPECT_IS_TINY(max_error);
   }
 
@@ -2259,8 +2193,8 @@ TEST(GradientCheckerTest, SliceGrad) {
 
     TensorInfo y_info({1, 2}, true);
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, start_info, end_info, axes_info, steps_info},
-                                                           {y_info}, &max_error, x_datas));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {x_info, start_info, end_info, axes_info, steps_info}, {y_info}, &max_error, x_datas));
 
     EXPECT_IS_TINY(max_error);
   }
@@ -2272,12 +2206,13 @@ TEST(GradientCheckerTest, SliceGrad) {
     TensorInfo end_info({2}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>());
     TensorInfo axes_info({2}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>());
     TensorInfo steps_info({2}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>());
-    std::vector<std::vector<float>> x_datas = {{1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8}, {1, 0}, {2, 3}, {0, 1}, {1, 2}};
+    std::vector<std::vector<float>> x_datas = {
+        {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8}, {1, 0}, {2, 3}, {0, 1}, {1, 2}};
 
     TensorInfo y_info({1, 2, 2}, true);
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, start_info, end_info, axes_info, steps_info}, {y_info},
-                                                           &max_error, x_datas));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(
+        op_def, {x_info, start_info, end_info, axes_info, steps_info}, {y_info}, &max_error, x_datas));
 
     EXPECT_IS_TINY(max_error);
   }
@@ -2288,75 +2223,81 @@ void RunExpandGradTests(const OpDef& op_def) {
   GradientChecker<float, float, float> gradient_checker;
   const std::vector<ONNX_NAMESPACE::AttributeProto> attributes = {};
 
-  //input_shape = (2, 3, 1), target_shape = (2, 3, 4) ==> shape(result) = (2, 3, 4)
+  // input_shape = (2, 2, 1), target_shape = (2, 2, 3) ==> shape(result) = (2, 2, 3)
   {
-    TensorInfo x_info({2, 3, 1}, true);
+    TensorInfo x_info({2, 2, 1}, true);
     TensorInfo shape_info({3}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>());
-    std::vector<std::vector<float>> x_datas = {{1, 2, 3, 4, 5, 6}, {2, 3, 4}};
+    std::vector<std::vector<float>> x_datas = {{1, 2, 3, 4}, {2, 2, 3}};
 
-    TensorInfo y_info({2, 3, 4}, true);
+    TensorInfo y_info({2, 2, 3}, true);
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, shape_info}, {y_info}, &max_error, x_datas, attributes, true, true));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, shape_info}, {y_info}, &max_error, x_datas,
+                                                           attributes, true, true));
     EXPECT_IS_TINY(max_error);
   }
 
-  //input_shape = (2, 3, 1), target_shape = (1, 1, 4) ==> shape(result) = (2, 3, 4)
+  // input_shape = (2, 2, 1), target_shape = (1, 1, 3) ==> shape(result) = (2, 2, 3)
   {
-    TensorInfo x_info({2, 3, 1}, true);
+    TensorInfo x_info({2, 2, 1}, true);
     TensorInfo shape_info({3}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>());
-    std::vector<std::vector<float>> x_datas = {{1, 2, 3, 4, 5, 6}, {1, 1, 4}};
+    std::vector<std::vector<float>> x_datas = {{1, 2, 3, 4}, {1, 1, 3}};
 
-    TensorInfo y_info({2, 3, 4}, true);
+    TensorInfo y_info({2, 2, 3}, true);
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, shape_info}, {y_info}, &max_error, x_datas, attributes, true, true));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, shape_info}, {y_info}, &max_error, x_datas,
+                                                           attributes, true, true));
     EXPECT_IS_TINY(max_error);
   }
 
-  //input_shape = (2, 3, 1), target_shape = (4) ==> shape(result) = (2, 3, 4)
+  // input_shape = (2, 2, 1), target_shape = (3) ==> shape(result) = (2, 2, 3)
   {
-    TensorInfo x_info({2, 3, 1}, true);
+    TensorInfo x_info({2, 2, 1}, true);
     TensorInfo shape_info({1}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>());
-    std::vector<std::vector<float>> x_datas = {{1, 2, 3, 4, 5, 6}, {4}};
+    std::vector<std::vector<float>> x_datas = {{1, 2, 3, 4}, {3}};
 
-    TensorInfo y_info({2, 3, 4}, true);
+    TensorInfo y_info({2, 2, 3}, true);
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, shape_info}, {y_info}, &max_error, x_datas, attributes, true, true));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, shape_info}, {y_info}, &max_error, x_datas,
+                                                           attributes, true, true));
     EXPECT_IS_TINY(max_error);
   }
 
-  //input_shape = (2, 3, 1), target_shape = (1, 1) ==> shape(result) = (2, 3, 1)
+  // input_shape = (2, 2, 1), target_shape = (1, 1) ==> shape(result) = (2, 2, 1)
   {
-    TensorInfo x_info({2, 3, 1}, true);
+    TensorInfo x_info({2, 2, 1}, true);
     TensorInfo shape_info({2}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>());
-    std::vector<std::vector<float>> x_datas = {{1, 2, 3, 4, 5, 6}, {1, 1}};
+    std::vector<std::vector<float>> x_datas = {{1, 2, 3, 4}, {1, 1}};
 
-    TensorInfo y_info({2, 3, 1}, true);
+    TensorInfo y_info({2, 2, 1}, true);
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, shape_info}, {y_info}, &max_error, x_datas, attributes, true, true));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, shape_info}, {y_info}, &max_error, x_datas,
+                                                           attributes, true, true));
     EXPECT_IS_TINY(max_error);
   }
 
-  //input_shape = (2, 3), target_shape = (4, 5, 2, 3) ==> shape(result) = (4, 5, 2, 3)
+  // input_shape = (2, 3), target_shape = (3, 2, 2, 3) ==> shape(result) = (3, 2, 2, 3)
   {
     TensorInfo x_info({2, 3}, true);
     TensorInfo shape_info({4}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>());
-    std::vector<std::vector<float>> x_datas = {{1, 2, 3, 4, 5, 6}, {4, 5, 2, 3}};
+    std::vector<std::vector<float>> x_datas = {{1, 2, 3, 4, 5, 6}, {3, 2, 2, 3}};
 
-    TensorInfo y_info({4, 5, 2, 3}, true);
+    TensorInfo y_info({3, 2, 2, 3}, true);
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, shape_info}, {y_info}, &max_error, x_datas, attributes, true, true));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, shape_info}, {y_info}, &max_error, x_datas,
+                                                           attributes, true, true));
     EXPECT_IS_TINY(max_error);
   }
 
-  //input_shape = (1, 2, 3), target_shape = (4, 5, 1, 1) ==> shape(result) = (4, 5, 2, 3)
+  // input_shape = (1, 2, 3), target_shape = (3, 2, 1, 1) ==> shape(result) = (3, 2, 2, 3)
   {
     TensorInfo x_info({1, 2, 3}, true);
     TensorInfo shape_info({4}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>());
-    std::vector<std::vector<float>> x_datas = {{1, 2, 3, 4, 5, 6}, {4, 5, 1, 1}};
+    std::vector<std::vector<float>> x_datas = {{1, 2, 3, 4, 5, 6}, {3, 2, 1, 1}};
 
-    TensorInfo y_info({4, 5, 2, 3}, true);
+    TensorInfo y_info({3, 2, 2, 3}, true);
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, shape_info}, {y_info}, &max_error, x_datas, attributes, true, true));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, shape_info}, {y_info}, &max_error, x_datas,
+                                                           attributes, true, true));
     EXPECT_IS_TINY(max_error);
   }
 }
@@ -2382,8 +2323,8 @@ TEST(GradientCheckerTest, GatherElementsGrad) {
     TensorInfo y_info({2, 3}, true);
     int64_t axis = 0;
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {data_info, indice_info}, {y_info}, &max_error, x_datas,
-                                                           {MakeAttribute("axis", axis)}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {data_info, indice_info}, {y_info}, &max_error,
+                                                           x_datas, {MakeAttribute("axis", axis)}));
     EXPECT_IS_TINY(max_error);
   }
 
@@ -2396,8 +2337,8 @@ TEST(GradientCheckerTest, GatherElementsGrad) {
     TensorInfo y_info({2, 3}, true);
     int64_t axis = 0;
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {data_info, indice_info}, {y_info}, &max_error, x_datas,
-                                                           {MakeAttribute("axis", axis)}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {data_info, indice_info}, {y_info}, &max_error,
+                                                           x_datas, {MakeAttribute("axis", axis)}));
     EXPECT_IS_TINY(max_error);
   }
 
@@ -2410,8 +2351,8 @@ TEST(GradientCheckerTest, GatherElementsGrad) {
     TensorInfo y_info({2, 3}, true);
     int64_t axis = 1;
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {data_info, indice_info}, {y_info}, &max_error, x_datas,
-                                                           {MakeAttribute("axis", axis)}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {data_info, indice_info}, {y_info}, &max_error,
+                                                           x_datas, {MakeAttribute("axis", axis)}));
     EXPECT_IS_TINY(max_error);
   }
 
@@ -2424,8 +2365,8 @@ TEST(GradientCheckerTest, GatherElementsGrad) {
     TensorInfo y_info({2, 1, 2}, true);
     int64_t axis = 1;
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {data_info, indice_info}, {y_info}, &max_error, x_datas,
-                                                           {MakeAttribute("axis", axis)}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {data_info, indice_info}, {y_info}, &max_error,
+                                                           x_datas, {MakeAttribute("axis", axis)}));
     EXPECT_IS_TINY(max_error);
   }
 
@@ -2438,8 +2379,8 @@ TEST(GradientCheckerTest, GatherElementsGrad) {
     TensorInfo y_info({2, 4}, true);
     int64_t axis = 1;
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {data_info, indice_info}, {y_info}, &max_error, x_datas,
-                                                           {MakeAttribute("axis", axis)}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {data_info, indice_info}, {y_info}, &max_error,
+                                                           x_datas, {MakeAttribute("axis", axis)}));
     EXPECT_IS_TINY(max_error);
   }
 }
@@ -2455,7 +2396,8 @@ TEST(GradientCheckerTest, TopKGrad) {
     std::vector<std::vector<float>> x_datas = {{1, 2, 3, 4, 5, 6, 7, 8}, {1}};
     TensorInfo y1_info({2, 2, 1}, true);
     TensorInfo y2_info({2, 2, 1}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>());
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, k_info}, {y1_info, y2_info}, &max_error, x_datas, {}, true, true));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, k_info}, {y1_info, y2_info}, &max_error,
+                                                           x_datas, {}, true, true));
     EXPECT_IS_TINY(max_error);
   }
 
@@ -2465,7 +2407,8 @@ TEST(GradientCheckerTest, TopKGrad) {
     std::vector<std::vector<float>> x_datas = {{1, 2, 3, 4, 5, 6, 7, 8}, {1}};
     TensorInfo y1_info({2, 1, 2}, true);
     TensorInfo y2_info({2, 1, 2}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>());
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, k_info}, {y1_info, y2_info}, &max_error, x_datas, {MakeAttribute("axis", int64_t(-2))}, true, true));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, k_info}, {y1_info, y2_info}, &max_error,
+                                                           x_datas, {MakeAttribute("axis", int64_t(-2))}, true, true));
     EXPECT_IS_TINY(max_error);
   }
 
@@ -2475,7 +2418,8 @@ TEST(GradientCheckerTest, TopKGrad) {
     std::vector<std::vector<float>> x_datas = {{1, 2, 3, 4, 5, 6, 7, 8, 9}, {2}};
     TensorInfo y1_info({3, 2}, true);
     TensorInfo y2_info({3, 2}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>());
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, k_info}, {y1_info, y2_info}, &max_error, x_datas, {}, true, true));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, k_info}, {y1_info, y2_info}, &max_error,
+                                                           x_datas, {}, true, true));
     EXPECT_IS_TINY(max_error);
   }
 }
@@ -2491,7 +2435,8 @@ TEST(GradientCheckerTest, ClipGrad) {
     TensorInfo max_info({}, false);
     std::vector<std::vector<float>> x_datas = {{1, 2, 3, 4, 5, 6, 7, 8}, {2.8f}, {7.2f}};
     TensorInfo y_info({2, 2, 2}, true);
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, min_info, max_info}, {y_info}, &max_error, x_datas));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_info, min_info, max_info}, {y_info}, &max_error, x_datas));
     EXPECT_IS_TINY(max_error);
   }
 
@@ -2564,13 +2509,9 @@ void GradientCheckerMinMaxGradHelper(const std::string op) {
   }
 }
 
-TEST(GradientCheckerTest, MinGrad) {
-  GradientCheckerMinMaxGradHelper("Min");
-}
+TEST(GradientCheckerTest, MinGrad) { GradientCheckerMinMaxGradHelper("Min"); }  // delegates with op "Min"
 
-TEST(GradientCheckerTest, MaxGrad) {
-  GradientCheckerMinMaxGradHelper("Max");
-}
+TEST(GradientCheckerTest, MaxGrad) { GradientCheckerMinMaxGradHelper("Max"); }  // delegates with op "Max"
 
 TEST(GradientCheckerTest, TileGrad) {
   float max_error;
@@ -2579,13 +2520,14 @@ TEST(GradientCheckerTest, TileGrad) {
 
   // 2D input
   {
-    TensorInfo x_info({2, 4}, true);
+    TensorInfo x_info({2, 3}, true);
     TensorInfo repeat_info({2}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>());
-    std::vector<std::vector<float>> x_datas = {{1, 2, 3, 4, 5, 6, 7, 8}, {2, 2}};
+    std::vector<std::vector<float>> x_datas = {{1, 2, 3, 4, 5, 6}, {2, 2}};
 
-    TensorInfo y_info({4, 8}, true);
+    TensorInfo y_info({4, 6}, true);
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, repeat_info}, {y_info}, &max_error, x_datas));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_info, repeat_info}, {y_info}, &max_error, x_datas));
     EXPECT_IS_TINY(max_error);
   }
 
@@ -2593,11 +2535,12 @@ TEST(GradientCheckerTest, TileGrad) {
   {
     TensorInfo x_info({2}, true);
     TensorInfo repeat_info({1}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>());
-    std::vector<std::vector<float>> x_datas = {{1, 2}, {4}};
+    std::vector<std::vector<float>> x_datas = {{1, 2}, {2}};
 
-    TensorInfo y_info({8}, true);
+    TensorInfo y_info({4}, true);
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, repeat_info}, {y_info}, &max_error, x_datas));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_info, repeat_info}, {y_info}, &max_error, x_datas));
     EXPECT_IS_TINY(max_error);
   }
 
@@ -2605,11 +2548,12 @@ TEST(GradientCheckerTest, TileGrad) {
   {
     TensorInfo x_info({2, 2, 3}, true);
     TensorInfo repeat_info({3}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>());
-    std::vector<std::vector<float>> x_datas = {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, {2, 3, 4}};
+    std::vector<std::vector<float>> x_datas = {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, {2, 3, 2}};
 
-    TensorInfo y_info({4, 6, 12}, true);
+    TensorInfo y_info({4, 6, 6}, true);
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, repeat_info}, {y_info}, &max_error, x_datas));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_info, repeat_info}, {y_info}, &max_error, x_datas));
     EXPECT_IS_TINY(max_error);
   }
 
@@ -2621,7 +2565,8 @@ TEST(GradientCheckerTest, TileGrad) {
 
     TensorInfo y_info({2, 2, 3}, true);
 
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, repeat_info}, {y_info}, &max_error, x_datas));
+    ASSERT_STATUS_OK(
+        gradient_checker.ComputeGradientError(op_def, {x_info, repeat_info}, {y_info}, &max_error, x_datas));
     EXPECT_IS_TINY(max_error);
   }
 }
@@ -2790,8 +2735,7 @@ TEST(GradientCheckerTest, ScatterElementsGrad) {
     TensorInfo data_info({3, 3}, true);
     TensorInfo indices_info({2, 3}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>());
     TensorInfo updates_info({2, 3}, true);
-    std::vector<std::vector<float>> input_datas = {{ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
-                                                     0.0f, 0.0f, 0.0f, 0.0f},
+    std::vector<std::vector<float>> input_datas = {{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
                                                    {1, 0, 2, 0, 2, 1},
                                                    {1.0f, 1.1f, 1.2f, 2.0f, 2.1f, 2.2f}};
 
@@ -2806,9 +2750,7 @@ TEST(GradientCheckerTest, ScatterElementsGrad) {
     TensorInfo data_info({1, 5}, true);
     TensorInfo indices_info({1, 2}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>());
     TensorInfo updates_info({1, 2}, true);
-    std::vector<std::vector<float>> input_datas = {{1.0f, 2.0f, 3.0f, 4.0f, 5.0f},
-                                                   {1, 3},
-                                                   {1.1f, 2.1f}};
+    std::vector<std::vector<float>> input_datas = {{1.0f, 2.0f, 3.0f, 4.0f, 5.0f}, {1, 3}, {1.1f, 2.1f}};
 
     TensorInfo output_info({1, 5}, true);
 
@@ -2822,9 +2764,7 @@ TEST(GradientCheckerTest, ScatterElementsGrad) {
     TensorInfo data_info({1, 5}, true);
     TensorInfo indices_info({1, 2}, false, nullptr, DataTypeImpl::GetTensorType<int64_t>());
     TensorInfo updates_info({1, 2}, true);
-    std::vector<std::vector<float>> input_datas = {{1.0f, 2.0f, 3.0f, 4.0f, 5.0f},
-                                                   {1, 3},
-                                                   {1.1f, 2.1f}};
+    std::vector<std::vector<float>> input_datas = {{1.0f, 2.0f, 3.0f, 4.0f, 5.0f}, {1, 3}, {1.1f, 2.1f}};
 
     TensorInfo output_info({1, 5}, true);
 
@@ -2861,12 +2801,14 @@ TEST(GradientCheckerTest, TriluGrad) {
   }
   {
     // Test without optional input and with attribute upper=1
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info}, {y_info}, &max_error, {x_data}, {MakeAttribute("upper", int64_t(1))}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info}, {y_info}, &max_error, {x_data},
+                                                           {MakeAttribute("upper", int64_t(1))}));
     EXPECT_IS_TINY(max_error);
   }
   {
     // Test without optional input and with attribute upper=0
-    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info}, {y_info}, &max_error, {x_data}, {MakeAttribute("upper", int64_t(0))}));
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info}, {y_info}, &max_error, {x_data},
+                                                           {MakeAttribute("upper", int64_t(0))}));
     EXPECT_IS_TINY(max_error);
   }
   for (int64_t k = -M; k <= M; k++) {
@@ -2874,17 +2816,20 @@ TEST(GradientCheckerTest, TriluGrad) {
 
     // Test with optional input and without attribute
     {
-      ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, k_info}, {y_info}, &max_error, {x_data, k_data}));
+      ASSERT_STATUS_OK(
+          gradient_checker.ComputeGradientError(op_def, {x_info, k_info}, {y_info}, &max_error, {x_data, k_data}));
       EXPECT_IS_TINY(max_error);
     }
     {
       // Test with optional input and with attribute upper=1
-      ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, k_info}, {y_info}, &max_error, {x_data, k_data}, {MakeAttribute("upper", int64_t(1))}));
+      ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, k_info}, {y_info}, &max_error,
+                                                             {x_data, k_data}, {MakeAttribute("upper", int64_t(1))}));
       EXPECT_IS_TINY(max_error);
     }
     {
       // Test with optional input and with attribute upper=0
-      ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, k_info}, {y_info}, &max_error, {x_data, k_data}, {MakeAttribute("upper", int64_t(0))}));
+      ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, k_info}, {y_info}, &max_error,
+                                                             {x_data, k_data}, {MakeAttribute("upper", int64_t(0))}));
       EXPECT_IS_TINY(max_error);
     }
   }