Use lower case when comparing activation names, since the spec is not clear about their casing. (#77)

Fix the bug in the RNN/GRU/LSTM automatic fallback: the list of supported activations should also cover bidirectional mode. Update the tests to remove the flag that enabled the GPU test, since we can now fall back to the CPU automatically.
2026-05-23 22:13:38 +00:00 · 2018-12-03 14:21:23 -08:00 · 2018-12-03 14:21:23 -08:00 · 900e69ceae
commit 900e69ceae
parent f1c66a4aae
3 changed files with 74 additions and 125 deletions
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@ -784,7 +784,10 @@ bool CUDAExecutionProvider::RNNNeedFallbackToCPU(const onnxruntime::Node& node,
    if ("activations" == attr_name &&
        ::onnx::AttributeProto_AttributeType::AttributeProto_AttributeType_STRINGS == attr_value.type()) {
      for (int i = 0; i < attr_value.strings_size(); ++i) {
-        if (activations_supported[i] != attr_value.strings(i)) {
+        std::string activation_lowercase(attr_value.strings(i));
+        std::transform(activation_lowercase.begin(), activation_lowercase.end(), activation_lowercase.begin(),
+                       [](const unsigned char i) { return static_cast<char>(::tolower(i)); });
+        if (activations_supported[i] != activation_lowercase) {
          return true;
        }
      }
@ -829,13 +832,14 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
  for (auto& node : graph.Nodes()) {
    bool fallback_to_cpu_provider = false;
    if ("LSTM" == node.OpType()) {
-      std::vector<std::string> activations_supported{"Sigmoid", "Tanh", "Tanh"};
+      // the supported activations covers the bidirectional mode
+      std::vector<std::string> activations_supported{"sigmoid", "tanh", "tanh", "sigmoid", "tanh", "tanh"};
      fallback_to_cpu_provider = RNNNeedFallbackToCPU(node, activations_supported, node.OpType());
    } else if ("RNN" == node.OpType()) {
-      std::vector<std::string> activations_supported{"Tanh", "Tanh"};
+      std::vector<std::string> activations_supported{"tanh", "tanh"};
      fallback_to_cpu_provider = RNNNeedFallbackToCPU(node, activations_supported, node.OpType());
    } else if ("GRU" == node.OpType()) {
-      std::vector<std::string> activations_supported{"Sigmoid", "Tanh"};
+      std::vector<std::string> activations_supported{"sigmoid", "tanh", "sigmoid", "tanh"};
      fallback_to_cpu_provider = RNNNeedFallbackToCPU(node, activations_supported, node.OpType());
    }

--- a/onnxruntime/test/providers/cpu/rnn/deep_cpu_gru_op_test.cc
+++ b/onnxruntime/test/providers/cpu/rnn/deep_cpu_gru_op_test.cc
@ -12,8 +12,7 @@ using namespace std;
 namespace onnxruntime {
 namespace test {

-static void RunGruTest(bool run_on_gpu,
-                       const std::vector<float>& X_data,
+static void RunGruTest(const std::vector<float>& X_data,
                       const std::vector<float>& W_data,
                       const std::vector<float>& R_data,
                       const std::vector<float>& Y_data,
@ -95,16 +94,10 @@ static void RunGruTest(bool run_on_gpu,
  } else {
    test.AddMissingOptionalOutput<float>();
  }
-
-  std::unordered_set<std::string> excluded_provider_types;
-  if (!run_on_gpu) {
-    excluded_provider_types.insert(kCudaExecutionProvider);
-  }
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", excluded_provider_types);
+  test.Run();
 }

-void DefaultActivationsSimpleWeightsNoBias(bool run_on_gpu,
-                                           std::string direction,
+void DefaultActivationsSimpleWeightsNoBias(std::string direction,
                                           const std::vector<float>& Y_data,
                                           const std::vector<float>& Y_h_data) {
  int64_t seq_length = 2;
@ -129,13 +122,13 @@ void DefaultActivationsSimpleWeightsNoBias(bool run_on_gpu,

  std::vector<float> R_data(num_directions * 3 * hidden_size * hidden_size, 0.1f);

-  RunGruTest(run_on_gpu, X_data, W_data, R_data, Y_data, Y_h_data, input_size, batch_size, hidden_size, seq_length,
+  RunGruTest(X_data, W_data, R_data, Y_data, Y_h_data, input_size, batch_size, hidden_size, seq_length,
             nullptr, nullptr, nullptr, direction);

  // if Y_h_data is empty that tests Y_h not being returned. we need to have at least one output or
  // the node will get removed, so only test with output_sequence == false (no Y as output) if Y_h is not optional
  if (!Y_h_data.empty())
-    RunGruTest(run_on_gpu, X_data, W_data, R_data, Y_data, Y_h_data, input_size, batch_size, hidden_size, seq_length,
+    RunGruTest(X_data, W_data, R_data, Y_data, Y_h_data, input_size, batch_size, hidden_size, seq_length,
               nullptr, nullptr, nullptr, direction, 9999.0, /* output_sequence*/ false);
 }

@ -151,11 +144,10 @@ TEST(GRUTest, ForwardDefaultActivationsSimpleWeightsNoBiasTwoRows) {
      0.6027093f, 0.5083023f, 0.44950223f,
      0.5754369f, 0.45485455f, 0.3747841f};

-  bool run_on_gpu = true;
-  DefaultActivationsSimpleWeightsNoBias(run_on_gpu, "forward", Y_data, Y_h_data);
+  DefaultActivationsSimpleWeightsNoBias("forward", Y_data, Y_h_data);

  // test Y_h not being returned
-  DefaultActivationsSimpleWeightsNoBias(run_on_gpu, "forward", Y_data, {});
+  DefaultActivationsSimpleWeightsNoBias("forward", Y_data, {});
 }

 TEST(GRUTest, ReverseDefaultActivationsSimpleWeightsNoBiasTwoRows) {
@ -170,8 +162,7 @@ TEST(GRUTest, ReverseDefaultActivationsSimpleWeightsNoBiasTwoRows) {
      0.6082785f, 0.50623393f, 0.4426924f,
      0.5803454f, 0.4527356f, 0.36886263f};

-  bool run_on_gpu = false;  // cudnn implementation only support linear_before_reset = true
-  DefaultActivationsSimpleWeightsNoBias(run_on_gpu, "reverse", Y_data, Y_h_data);
+  DefaultActivationsSimpleWeightsNoBias("reverse", Y_data, Y_h_data);
 }

 TEST(GRUTest, BidirectionalDefaultActivationsSimpleWeightsNoBiasTwoRows) {
@ -201,12 +192,10 @@ TEST(GRUTest, BidirectionalDefaultActivationsSimpleWeightsNoBiasTwoRows) {
      0.6082785f, 0.50623393f, 0.4426924f,
      0.5803454f, 0.4527356f, 0.36886263f};

-  bool run_on_gpu = true;
-  DefaultActivationsSimpleWeightsNoBias(run_on_gpu, "bidirectional", Y_data, Y_h_data);
+  DefaultActivationsSimpleWeightsNoBias("bidirectional", Y_data, Y_h_data);
 }

-void DefaultActivationsSimpleWeightsWithBias(bool run_on_gpu,
-                                             std::string direction,
+void DefaultActivationsSimpleWeightsWithBias(std::string direction,
                                             const std::vector<float>& Y_data,
                                             bool linear_before_reset = false,
                                             bool one_row = false) {
@ -250,7 +239,7 @@ void DefaultActivationsSimpleWeightsWithBias(bool run_on_gpu,

  std::vector<float> R_data(num_directions * 3 * hidden_size * hidden_size, 0.1f);

-  RunGruTest(run_on_gpu, X_data, W_data, R_data, Y_data, {}, input_size, batch_size, hidden_size, seq_length,
+  RunGruTest(X_data, W_data, R_data, Y_data, {}, input_size, batch_size, hidden_size, seq_length,
             &B_data, nullptr, nullptr, direction, 999.f, /* output_sequence*/ true, linear_before_reset);
 }  // namespace test

@ -262,8 +251,7 @@ TEST(GRUTest, ForwardDefaultActivationsSimpleWeightsWithBiasBatchParallel) {
      0.22688604f, -0.19698407f, 0.14017843f,
      0.33386092f, -0.15799662f, 0.2381169f};

-  bool run_on_gpu = false;
-  DefaultActivationsSimpleWeightsWithBias(run_on_gpu, "forward", Y_data);
+  DefaultActivationsSimpleWeightsWithBias("forward", Y_data);
 }

 TEST(GRUTest, ForwardDefaultActivationsSimpleWeightsWithBiasBatchParallelLinearBeforeReset) {
@ -274,9 +262,8 @@ TEST(GRUTest, ForwardDefaultActivationsSimpleWeightsWithBiasBatchParallelLinearB
      0.19538902f, -0.19016478f, -0.05644283f,
      0.30856851f, -0.15190377f, 0.05999807f};

-  bool run_on_gpu = true;
  const bool linear_before_reset = true;
-  DefaultActivationsSimpleWeightsWithBias(run_on_gpu, "forward", Y_data, linear_before_reset);
+  DefaultActivationsSimpleWeightsWithBias("forward", Y_data, linear_before_reset);
 }

 TEST(GRUTest, ReverseDefaultActivationsSimpleWeightsWithBiasBatchParallelLinearBeforeReset) {
@ -287,9 +274,8 @@ TEST(GRUTest, ReverseDefaultActivationsSimpleWeightsWithBiasBatchParallelLinearB
      0.12252139f, -0.12032216f, -0.05064924f,
      0.21249877f, -0.08884402f, 0.04751285f};

-  bool run_on_gpu = false;
  const bool linear_before_reset = true;
-  DefaultActivationsSimpleWeightsWithBias(run_on_gpu, "reverse", Y_data, linear_before_reset);
+  DefaultActivationsSimpleWeightsWithBias("reverse", Y_data, linear_before_reset);
 }

 // test forward !batch_parallel_ path with linear_before_reset
@ -298,10 +284,9 @@ TEST(GRUTest, ForwardDefaultActivationsSimpleWeightsWithBiasLinearBeforeReset) {
      0.15024948f, -0.11097029f, -0.02121867f,
      0.19538902f, -0.19016478f, -0.05644283f};

-  bool run_on_gpu = true;
  const bool linear_before_reset = true;
  const bool one_row = true;
-  DefaultActivationsSimpleWeightsWithBias(run_on_gpu, "forward", Y_data, linear_before_reset, one_row);
+  DefaultActivationsSimpleWeightsWithBias("forward", Y_data, linear_before_reset, one_row);
 }

 // test reverse !batch_parallel_ path with linear_before_reset
@ -310,10 +295,9 @@ TEST(GRUTest, ReverseDefaultActivationsSimpleWeightsWithBiasLinearBeforeReset) {
      0.20910699f, -0.18880953f, -0.04005555f,
      0.12252139f, -0.12032216f, -0.05064924f};

-  bool run_on_gpu = false;
  const bool linear_before_reset = true;
  const bool one_row = true;
-  DefaultActivationsSimpleWeightsWithBias(run_on_gpu, "reverse", Y_data, linear_before_reset, one_row);
+  DefaultActivationsSimpleWeightsWithBias("reverse", Y_data, linear_before_reset, one_row);
 }

 /*******************
@ -331,8 +315,7 @@ class DeepCpuGruOpTestContext {

  ~DeepCpuGruOpTestContext() = default;

-  void RunTest(bool run_on_gpu,
-               const std::vector<float>& X,
+  void RunTest(const std::vector<float>& X,
               const int batch,
               const int seq_length,
               const std::vector<int>& sequence_length,
@ -467,8 +450,7 @@ DeepCpuGruOpTestContext::DeepCpuGruOpTestContext(const std::string direction,
  }
 }

-void DeepCpuGruOpTestContext::RunTest(bool run_on_gpu,
-                                      const std::vector<float>& X,
+void DeepCpuGruOpTestContext::RunTest(const std::vector<float>& X,
                                      const int batch_size,
                                      const int seq_length,
                                      const std::vector<int>& sequence_lens,
@ -476,7 +458,7 @@ void DeepCpuGruOpTestContext::RunTest(bool run_on_gpu,
                                      const std::vector<float>& expected_Y,
                                      const std::vector<float>& expected_Y_h) {
  // run with and without output_sequence
-  ::onnxruntime::test::RunGruTest(run_on_gpu, X, gru_input_weights_, gru_recurrent_weights_,
+  ::onnxruntime::test::RunGruTest(X, gru_input_weights_, gru_recurrent_weights_,
                                  expected_Y, expected_Y_h,
                                  input_size_, batch_size, hidden_dim_, seq_length,
                                  use_bias_ ? &gru_bias_ : nullptr,
@ -490,7 +472,7 @@ void DeepCpuGruOpTestContext::RunTest(bool run_on_gpu,
                                  alphas_,
                                  betas_);

-  ::onnxruntime::test::RunGruTest(run_on_gpu, X, gru_input_weights_, gru_recurrent_weights_,
+  ::onnxruntime::test::RunGruTest(X, gru_input_weights_, gru_recurrent_weights_,
                                  expected_Y, expected_Y_h,
                                  input_size_, batch_size, hidden_dim_, seq_length,
                                  use_bias_ ? &gru_bias_ : nullptr,
@ -520,8 +502,7 @@ TEST(GRUTest, ONNXRuntime_TestGRUOpForwardBasic) {
  std::vector<float> expected_Y = {-0.03255286f, 0.0774838f, -0.05556786f, 0.0785508f};
  std::vector<float> expected_Y_h = {-0.05556786f, 0.0785508f};

-  bool run_on_gpu = true;
-  ctx.RunTest(run_on_gpu, X, batch, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
+  ctx.RunTest(X, batch, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
 }

 TEST(GRUTest, ONNXRuntime_TestGRUOpBackwardBasic) {
@ -540,8 +521,7 @@ TEST(GRUTest, ONNXRuntime_TestGRUOpBackwardBasic) {
                                   -0.03255286f, 0.0774838f};
  std::vector<float> expected_Y_h = {-0.05556786f, 0.0785508f};

-  bool run_on_gpu = true;
-  ctx.RunTest(run_on_gpu, X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
+  ctx.RunTest(X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
 }

 TEST(GRUTest, ONNXRuntime_TestGRUOpBidirectionalBasic) {
@ -564,8 +544,7 @@ TEST(GRUTest, ONNXRuntime_TestGRUOpBidirectionalBasic) {
  std::vector<float> expected_Y_h = {-0.05556786f, 0.0785508f,
                                     -0.05469977f, 0.1004222f};

-  bool run_on_gpu = true;
-  ctx.RunTest(run_on_gpu, X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
+  ctx.RunTest(X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
 }

 TEST(GRUTest, ONNXRuntime_TestGRUOpForwardActivation) {
@ -584,8 +563,7 @@ TEST(GRUTest, ONNXRuntime_TestGRUOpForwardActivation) {
                                   0.3810334f, 0.4944591f};
  std::vector<float> expected_Y_h = {0.3810334f, 0.4944591f};

-  bool run_on_gpu = false;  // cudnn only support activation {sigmoid, tanh}
-  ctx.RunTest(run_on_gpu, X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
+  ctx.RunTest(X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
 }

 TEST(GRUTest, ONNXRuntime_TestGRUOpForwardInitialHiddenState) {
@ -604,8 +582,7 @@ TEST(GRUTest, ONNXRuntime_TestGRUOpForwardInitialHiddenState) {
                                   0.07378622f, -0.02782359f};
  std::vector<float> expected_Y_h = {0.07378622f, -0.02782359f};

-  bool run_on_gpu = false;  // cudnn implementation only support linear_before_reset = true
-  ctx.RunTest(run_on_gpu, X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
+  ctx.RunTest(X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
 }

 TEST(GRUTest, ONNXRuntime_TestGRUOpForwardBatch) {
@ -632,8 +609,7 @@ TEST(GRUTest, ONNXRuntime_TestGRUOpForwardBatch) {
  std::vector<float> expected_Y_h = {0.07378622f, -0.02782359f,
                                     -0.05556786f, 0.0785508f};

-  bool run_on_gpu = false;  // cudnn implementation only support linear_before_reset = true
-  ctx.RunTest(run_on_gpu, X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
+  ctx.RunTest(X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
 }

 TEST(GRUTest, ONNXRuntime_TestGRUOpGrowBatchSequenceLength) {
@ -652,8 +628,7 @@ TEST(GRUTest, ONNXRuntime_TestGRUOpGrowBatchSequenceLength) {
                                   -0.05556786f, 0.0785508f};
  std::vector<float> expected_Y_h = {-0.05556786f, 0.0785508f};

-  bool run_on_gpu = false;
-  ctx.RunTest(run_on_gpu, X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
+  ctx.RunTest(X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);

  const int batch2 = 2;
  const int seq_length2 = 2;
@ -674,7 +649,7 @@ TEST(GRUTest, ONNXRuntime_TestGRUOpGrowBatchSequenceLength) {
  std::vector<float> expected_Y_h2 = {0.07378622f, -0.02782359f,
                                      -0.03255286f, 0.0774838f};

-  ctx.RunTest(run_on_gpu, X2, batch2, seq_length2, sequence_length2, &initial_h2, expected_Y2, expected_Y_h2);
+  ctx.RunTest(X2, batch2, seq_length2, sequence_length2, &initial_h2, expected_Y2, expected_Y_h2);
 }

 TEST(GRUTest, ONNXRuntime_TestGRUOpSingleBatchMultipleHiddenThreads) {
@ -704,8 +679,7 @@ TEST(GRUTest, ONNXRuntime_TestGRUOpSingleBatchMultipleHiddenThreads) {
          0.437727744598091f, 0.451604294166264f, 0.40203814648622f, 0.416614999456787f};
  std::vector<float> expected_Y_h(expected_Y);

-  bool run_on_gpu = true;
-  ctx.RunTest(run_on_gpu, X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
+  ctx.RunTest(X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
 }

 TEST(GRUTest, ONNXRuntime_TestGRUPositiveActivationClipping) {
@ -734,8 +708,7 @@ TEST(GRUTest, ONNXRuntime_TestGRUPositiveActivationClipping) {

  std::vector<float> expected_Y_h(expected_Y);

-  bool run_on_gpu = true;
-  ctx.RunTest(run_on_gpu, X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
+  ctx.RunTest(X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
 }

 TEST(GRUTest, ONNXRuntime_TestGRUPositiveActivationAlphaBeta) {
@ -776,9 +749,8 @@ TEST(GRUTest, ONNXRuntime_TestGRUPositiveActivationAlphaBeta) {

  std::vector<float> expected_Y_h(expected_Y);

-  bool run_on_gpu = false;  // cudnn implementation don't support the alpha & beta and customized activations
  DeepCpuGruOpTestContext ctx(direction, activations, true, alpha, beta, /*large_hidden*/ true, input_size);
-  ctx.RunTest(run_on_gpu, X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
+  ctx.RunTest(X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
 }

 }  // namespace test
--- a/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc
+++ b/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc
@ -23,8 +23,7 @@ T DuplicateContainer(const T& container) {
  return doubled;
 }

-static void RunLstmTest(bool run_on_gpu,
-                        const std::vector<float>& X_data,
+static void RunLstmTest(const std::vector<float>& X_data,
                        const std::vector<float>& W_data,
                        const std::vector<float>& R_data,
                        const std::vector<float>& Y_data,
@ -137,15 +136,10 @@ static void RunLstmTest(bool run_on_gpu,
    test.AddMissingOptionalOutput<float>();
  }

-  std::unordered_set<std::string> excluded_providers;
-  if (!run_on_gpu) {
-    excluded_providers.insert(kCudaExecutionProvider);
-  }
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", excluded_providers);
+  test.Run();
 }

-void SimpleWeightsNoBiasTwoRows(bool run_on_gpu,
-                                std::string direction,
+void SimpleWeightsNoBiasTwoRows(std::string direction,
                                const std::vector<float>& Y_data,
                                const std::vector<float>& Y_h_data,
                                const std::vector<float>& Y_c_data,
@ -171,14 +165,14 @@ void SimpleWeightsNoBiasTwoRows(bool run_on_gpu,
    W_data = DuplicateContainer(W_data);
  }

-  RunLstmTest(run_on_gpu, X_data, W_data, R_data, Y_data, Y_h_data, Y_c_data,
+  RunLstmTest(X_data, W_data, R_data, Y_data, Y_h_data, Y_c_data,
              input_size, batch_size, hidden_size, seq_length,
              nullptr, nullptr, nullptr, nullptr, seq_lengths, direction);

  // need at least one output, so we need Y_h or Y_c to be requested (non-empty output to compare against) in order
  // to test Y not being returned (output_sequence == false)
  if (!Y_h_data.empty() || !Y_c_data.empty())
-    RunLstmTest(run_on_gpu, X_data, W_data, R_data, Y_data, Y_h_data, Y_c_data,
+    RunLstmTest(X_data, W_data, R_data, Y_data, Y_h_data, Y_c_data,
                input_size, batch_size, hidden_size, seq_length,
                nullptr, nullptr, nullptr, nullptr, seq_lengths, direction, 999.f, /* output_sequence*/ false);
 }
@ -199,11 +193,10 @@ TEST(LSTMTest, ForwardSimpleWeightsNoBiasTwoRows) {
      1.27731147f, 1.44181041f, 1.53179041f,
      1.3249796f, 1.51063104f, 1.61451544f};

-  bool run_on_gpu = true;
-  SimpleWeightsNoBiasTwoRows(run_on_gpu, "forward", Y_data, Y_h_data, Y_c_data);
+  SimpleWeightsNoBiasTwoRows("forward", Y_data, Y_h_data, Y_c_data);

  // test Y_h and Y_c being optional
-  SimpleWeightsNoBiasTwoRows(run_on_gpu, "forward", Y_data, {}, {});
+  SimpleWeightsNoBiasTwoRows("forward", Y_data, {}, {});
 }

 TEST(LSTMTest, ReverseSimpleWeightsNoBiasTwoRows) {
@ -222,8 +215,7 @@ TEST(LSTMTest, ReverseSimpleWeightsNoBiasTwoRows) {
      1.27850552f, 1.46799496f, 1.57641257f,
      1.34960834f, 1.54772296f, 1.65633056f};

-  bool run_on_gpu = true;
-  SimpleWeightsNoBiasTwoRows(run_on_gpu, "reverse", Y_data, Y_h_data, Y_c_data);
+  SimpleWeightsNoBiasTwoRows("reverse", Y_data, Y_h_data, Y_c_data);
 }

 TEST(LSTMTest, BidirectionalSimpleWeightsNoBiasTwoRows) {
@ -257,8 +249,7 @@ TEST(LSTMTest, BidirectionalSimpleWeightsNoBiasTwoRows) {
      1.34960834f, 1.54772296f, 1.65633056f};

  // cudnn don't support customized activation
-  bool run_on_gpu = true;
-  SimpleWeightsNoBiasTwoRows(run_on_gpu, "bidirectional", Y_data, Y_h_data, Y_c_data);
+  SimpleWeightsNoBiasTwoRows("bidirectional", Y_data, Y_h_data, Y_c_data);
 }

 TEST(LSTMTest, MixedSequenceLengths) {
@ -282,8 +273,7 @@ TEST(LSTMTest, MixedSequenceLengths) {
      1.3249796f, 1.51063104f, 1.61451544f};

  // Not able to mask on Y_c for CUDA using cudnn lib
-  bool run_on_gpu = false;
-  SimpleWeightsNoBiasTwoRows(run_on_gpu, "forward", Y_data, Y_h_data, Y_c_data, &seq_lengths);
+  SimpleWeightsNoBiasTwoRows("forward", Y_data, Y_h_data, Y_c_data, &seq_lengths);

  // swap which one is short
  seq_lengths = {2, 1};
@ -303,7 +293,7 @@ TEST(LSTMTest, MixedSequenceLengths) {
      1.27731147f, 1.44181041f, 1.53179041f,
      0.54983425f, 0.59868795f, 0.64565659f};

-  SimpleWeightsNoBiasTwoRows(run_on_gpu, "forward", Y_data, Y_h_data, Y_c_data, &seq_lengths);
+  SimpleWeightsNoBiasTwoRows("forward", Y_data, Y_h_data, Y_c_data, &seq_lengths);
 }

 TEST(LSTMTest, MixedSequenceLengthsReverse) {
@ -326,8 +316,7 @@ TEST(LSTMTest, MixedSequenceLengthsReverse) {
      0.52497941f, 0.54983425f, 0.5744428f,
      1.34960834f, 1.54772296f, 1.65633056f};

-  bool run_on_gpu = false;
-  SimpleWeightsNoBiasTwoRows(run_on_gpu, "reverse", Y_data, Y_h_data, Y_c_data, &seq_lengths);
+  SimpleWeightsNoBiasTwoRows("reverse", Y_data, Y_h_data, Y_c_data, &seq_lengths);

  // swap which one is short
  seq_lengths = {2, 1};
@ -347,7 +336,7 @@ TEST(LSTMTest, MixedSequenceLengthsReverse) {
      1.27850552f, 1.46799496f, 1.57641257f,
      0.54983425f, 0.59868795f, 0.64565659f};

-  SimpleWeightsNoBiasTwoRows(run_on_gpu, "reverse", Y_data, Y_h_data, Y_c_data, &seq_lengths);
+  SimpleWeightsNoBiasTwoRows("reverse", Y_data, Y_h_data, Y_c_data, &seq_lengths);
 }

 // test path in LSTM model where batch_parallel_ is false and there are multiple steps (seq_length > 1)
@ -374,13 +363,12 @@ TEST(LSTMTest, BatchParallelFalseSeqLengthGreaterThanOne) {
  std::vector<float> Y_c_data{
      1.02721067f, 1.15254318f};

-  bool run_on_gpu = true;
-  RunLstmTest(run_on_gpu, X_data, W_data, R_data, Y_data, {}, Y_c_data,
+  RunLstmTest(X_data, W_data, R_data, Y_data, {}, Y_c_data,
              input_size, batch_size, hidden_size, seq_length);
 }

 // make sure GateComputations works correctly if batch_parallel_ is true due to large batch size
-static void LargeBatchWithClip(bool run_on_gpu, const std::vector<float>& Y_h_data, float clip = 9999.0) {
+static void LargeBatchWithClip(const std::vector<float>& Y_h_data, float clip = 9999.0) {
  int64_t seq_length = 2;
  int batch_size = 32;
  int64_t input_size = 1;
@ -401,7 +389,7 @@ static void LargeBatchWithClip(bool run_on_gpu, const std::vector<float>& Y_h_da

  std::vector<float> R_data(num_directions * 4 * hidden_size * hidden_size, 0.1f);

-  RunLstmTest(run_on_gpu, X_data, W_data, R_data, {}, Y_h_data, {},
+  RunLstmTest(X_data, W_data, R_data, {}, Y_h_data, {},
              input_size, batch_size, hidden_size, seq_length,
              nullptr, nullptr, nullptr, nullptr, nullptr, direction, clip);
 }
@ -441,8 +429,7 @@ TEST(LSTMTest, LargeBatchNoClipping) {
      0.96073964f, 0.96388402f, 0.96402112f,
      0.96105254f, 0.96391004f, 0.96402279f};

-  bool run_on_gpu = true;
-  LargeBatchWithClip(run_on_gpu, Y_h_data);
+  LargeBatchWithClip(Y_h_data);
 }

 // make sure GateComputations with clipping works correctly if batch_parallel_ is true due to large batch size
@ -481,8 +468,7 @@ TEST(LSTMTest, LargeBatchWithClip) {
      0.94072091f, 0.94266769f, 0.94266769f,
      0.94103248f, 0.94266769f, 0.94266769f};

-  bool run_on_gpu = false;
-  LargeBatchWithClip(run_on_gpu, Y_h_data, 4.f);
+  LargeBatchWithClip(Y_h_data, 4.f);
 }

 // ONNXRuntime tests
@ -608,8 +594,7 @@ class LstmOpContext2x1x2x2 {
    // RunTest(seq_len, batch_size, num_direction, Y_data, output_first);
  }

-  void RunTest(bool run_on_gpu,
-               const std::vector<float>& X,
+  void RunTest(const std::vector<float>& X,
               const int batch_size,
               const int seq_length,
               const std::vector<float>* initial_h,
@ -623,7 +608,7 @@ class LstmOpContext2x1x2x2 {
               float clip = 9999.f,
               bool input_forget = false) {
    // run with and without output_sequence to test UniDirectionalLstm handling when Y isn't returned
-    ::onnxruntime::test::RunLstmTest(run_on_gpu, X, input_weights_, recurrent_weights_,
+    ::onnxruntime::test::RunLstmTest(X, input_weights_, recurrent_weights_,
                                     expected_Y, expected_Y_h, expected_Y_c,
                                     input_size_, batch_size, hidden_size_, seq_length,
                                     use_bias ? &bias_ : nullptr,
@ -638,7 +623,7 @@ class LstmOpContext2x1x2x2 {
                                     activation_alphas_,
                                     activation_betas_);

-    ::onnxruntime::test::RunLstmTest(run_on_gpu, X, input_weights_, recurrent_weights_,
+    ::onnxruntime::test::RunLstmTest(X, input_weights_, recurrent_weights_,
                                     expected_Y, expected_Y_h, expected_Y_c,
                                     input_size_, batch_size, hidden_size_, seq_length,
                                     use_bias ? &bias_ : nullptr,
@ -681,8 +666,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMForwardPeepHole) {

  //Run Test
  LstmOpContext2x1x2x2 context(direction);
-  bool run_on_gpu = false;
-  context.RunTest(run_on_gpu, input, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data);
+  context.RunTest(input, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data);
 }

 TEST(LSTMTest, ONNXRuntime_TestLSTMBidirectionalBasic) {
@ -700,8 +684,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMBidirectionalBasic) {
                                 -0.0753684f, 0.120794f};

  LstmOpContext2x1x2x2 context("bidirectional");
-  bool run_on_gpu = false;
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data);
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data);
 }

 TEST(LSTMTest, ONNXRuntime_TestLSTMForwardNoBiasUsePeepholes) {
@ -718,8 +701,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMForwardNoBiasUsePeepholes) {
  std::vector<float> Y_c_data = {0.11169686f, 0.00625722f};

  LstmOpContext2x1x2x2 context("forward");
-  bool run_on_gpu = false;
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data, nullptr,
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data, nullptr,
                  use_bias, use_peepholes);
 }

@ -740,8 +722,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMForwardInputForget) {

  LstmOpContext2x1x2x2 context("forward");
  // cudnn don't support peepholes
-  bool run_on_gpu = false;
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data, nullptr,
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data, nullptr,
                  use_bias, use_peepholes, clip, input_forget);
 }

@ -760,8 +741,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMForwardClip) {
  std::vector<float> Y_c_data = {-0.07415761f, 0.07395997f};

  LstmOpContext2x1x2x2 context("forward");
-  bool run_on_gpu = false;
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data, nullptr,
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data, nullptr,
                  use_bias, use_peepholes, clip);
 }

@ -776,8 +756,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMBackward) {
  std::vector<float> Y_c_data = {-0.07536839f, 0.12079399f};

  LstmOpContext2x1x2x2 context("reverse");
-  bool run_on_gpu = false;
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data);
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data);
 }

 TEST(LSTMTest, ONNXRuntime_TestLSTMBackward_gpu) {
@ -791,9 +770,8 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMBackward_gpu) {
  std::vector<float> Y_c_data = {-0.076699793f, 0.11975205f};

  LstmOpContext2x1x2x2 context("reverse");
-  bool run_on_gpu = true;
  // Disable peephole since cudnn doesn't support it
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data, nullptr, true, false);
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data, nullptr, true, false);
 }

 TEST(LSTMTest, ONNXRuntime_TestLSTMForwardHiddenState) {
@ -811,8 +789,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMForwardHiddenState) {
  std::vector<float> Y_c_data = {-0.07285583f, -0.02545788f};

  LstmOpContext2x1x2x2 context("forward");
-  bool run_on_gpu = true;
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, &hidden_state, nullptr, Y_data, Y_h_data, Y_c_data,
+  context.RunTest(X_data, batch_size, seq_len, &hidden_state, nullptr, Y_data, Y_h_data, Y_c_data,
                  nullptr, use_bias, use_peepholes);
 }

@ -832,8 +809,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMForwardCellState) {
  std::vector<float> Y_c_data = {0.06408449f, 0.03139432f};

  LstmOpContext2x1x2x2 context("forward");
-  bool run_on_gpu = true;
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, &hidden_state, &cell_state, Y_data, Y_h_data, Y_c_data,
+  context.RunTest(X_data, batch_size, seq_len, &hidden_state, &cell_state, Y_data, Y_h_data, Y_c_data,
                  nullptr, use_bias, use_peepholes);
 }

@ -853,8 +829,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMActivation) {
  std::vector<float> Y_c_data = {0.1624992f, 0.04672481f};

  LstmOpContext2x1x2x2 context("forward", activations);
-  bool run_on_gpu = false;
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data,
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data,
                  nullptr, use_bias, use_peepholes);
 }

@ -882,8 +857,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMBatchReallocation) {
  std::vector<float> Y_c_data = {0.1624992f, 0.04672481f};

  LstmOpContext2x1x2x2 context(direction, activations);
-  bool run_on_gpu = false;
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data,
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data,
                  nullptr, use_bias, use_peepholes);

  batch_size = 3;
@ -912,7 +886,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMBatchReallocation) {
              0.23038f, -0.0239f,
              0.24572f, 0.051626f};

-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data,
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data,
                  nullptr, use_bias, use_peepholes);
 }

@ -945,8 +919,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMOutputWrite) {

  std::string direction = "bidirectional";
  LstmOpContext2x1x2x2 context(direction, activations);
-  bool run_on_gpu = false;
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data,
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data,
                  nullptr, use_bias, use_peepholes);

  batch_size = 3;
@ -992,7 +965,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMOutputWrite) {
              0.22469461f, -0.02200207f,
              0.18284359f, -0.01078442f};

-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data,
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data,
                  nullptr, use_bias, use_peepholes);
 }