From 900e69ceae1e6d3ebf6dffd47729eb3c43baf647 Mon Sep 17 00:00:00 2001
From: Hector Li <hecli@microsoft.com>
Date: Mon, 3 Dec 2018 14:21:23 -0800
Subject: [PATCH] Use lower case when comparing activation names, since their
 casing is not clear in the spec. (#77)

Fix the RNN/GRU/LSTM automatic CPU fallback bug: the supported activation lists must also cover bidirectional mode.
Update the tests to remove the run_on_gpu flag, since unsupported cases now fall back to the CPU provider automatically.
---
 .../providers/cuda/cuda_execution_provider.cc | 12 ++-
 .../providers/cpu/rnn/deep_cpu_gru_op_test.cc | 90 ++++++-----------
 .../cpu/rnn/deep_cpu_lstm_op_test.cc          | 97 +++++++------------
 3 files changed, 74 insertions(+), 125 deletions(-)
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index cdb7ad9a40..3132972643 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -784,7 +784,10 @@ bool CUDAExecutionProvider::RNNNeedFallbackToCPU(const onnxruntime::Node& node,
     if ("activations" == attr_name &&
         ::onnx::AttributeProto_AttributeType::AttributeProto_AttributeType_STRINGS == attr_value.type()) {
       for (int i = 0; i < attr_value.strings_size(); ++i) {
-        if (activations_supported[i] != attr_value.strings(i)) {
+        std::string activation_lowercase(attr_value.strings(i));
+        std::transform(activation_lowercase.begin(), activation_lowercase.end(), activation_lowercase.begin(),
+                       [](const unsigned char i) { return static_cast<char>(::tolower(i)); });
+        if (activations_supported[i] != activation_lowercase) {
           return true;
         }
       }
@@ -829,13 +832,14 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
   for (auto& node : graph.Nodes()) {
     bool fallback_to_cpu_provider = false;
     if ("LSTM" == node.OpType()) {
-      std::vector<std::string> activations_supported{"Sigmoid", "Tanh", "Tanh"};
+      // the supported activations also cover the bidirectional mode
+      std::vector<std::string> activations_supported{"sigmoid", "tanh", "tanh", "sigmoid", "tanh", "tanh"};
       fallback_to_cpu_provider = RNNNeedFallbackToCPU(node, activations_supported, node.OpType());
     } else if ("RNN" == node.OpType()) {
-      std::vector<std::string> activations_supported{"Tanh", "Tanh"};
+      std::vector<std::string> activations_supported{"tanh", "tanh"};
       fallback_to_cpu_provider = RNNNeedFallbackToCPU(node, activations_supported, node.OpType());
     } else if ("GRU" == node.OpType()) {
-      std::vector<std::string> activations_supported{"Sigmoid", "Tanh"};
+      std::vector<std::string> activations_supported{"sigmoid", "tanh", "sigmoid", "tanh"};
       fallback_to_cpu_provider = RNNNeedFallbackToCPU(node, activations_supported, node.OpType());
     }
 
diff --git a/onnxruntime/test/providers/cpu/rnn/deep_cpu_gru_op_test.cc b/onnxruntime/test/providers/cpu/rnn/deep_cpu_gru_op_test.cc
index 607b0fbc73..0e1cba2e71 100644
--- a/onnxruntime/test/providers/cpu/rnn/deep_cpu_gru_op_test.cc
+++ b/onnxruntime/test/providers/cpu/rnn/deep_cpu_gru_op_test.cc
@@ -12,8 +12,7 @@ using namespace std;
 namespace onnxruntime {
 namespace test {
 
-static void RunGruTest(bool run_on_gpu,
-                       const std::vector<float>& X_data,
+static void RunGruTest(const std::vector<float>& X_data,
                        const std::vector<float>& W_data,
                        const std::vector<float>& R_data,
                        const std::vector<float>& Y_data,
@@ -95,16 +94,10 @@ static void RunGruTest(bool run_on_gpu,
   } else {
     test.AddMissingOptionalOutput<float>();
   }
-
-  std::unordered_set<std::string> excluded_provider_types;
-  if (!run_on_gpu) {
-    excluded_provider_types.insert(kCudaExecutionProvider);
-  }
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", excluded_provider_types);
+  test.Run();
 }
 
-void DefaultActivationsSimpleWeightsNoBias(bool run_on_gpu,
-                                           std::string direction,
+void DefaultActivationsSimpleWeightsNoBias(std::string direction,
                                            const std::vector<float>& Y_data,
                                            const std::vector<float>& Y_h_data) {
   int64_t seq_length = 2;
@@ -129,13 +122,13 @@ void DefaultActivationsSimpleWeightsNoBias(bool run_on_gpu,
 
   std::vector<float> R_data(num_directions * 3 * hidden_size * hidden_size, 0.1f);
 
-  RunGruTest(run_on_gpu, X_data, W_data, R_data, Y_data, Y_h_data, input_size, batch_size, hidden_size, seq_length,
+  RunGruTest(X_data, W_data, R_data, Y_data, Y_h_data, input_size, batch_size, hidden_size, seq_length,
              nullptr, nullptr, nullptr, direction);
 
   // if Y_h_data is empty that tests Y_h not being returned. we need to have at least one output or
   // the node will get removed, so only test with output_sequence == false (no Y as output) if Y_h is not optional
   if (!Y_h_data.empty())
-    RunGruTest(run_on_gpu, X_data, W_data, R_data, Y_data, Y_h_data, input_size, batch_size, hidden_size, seq_length,
+    RunGruTest(X_data, W_data, R_data, Y_data, Y_h_data, input_size, batch_size, hidden_size, seq_length,
                nullptr, nullptr, nullptr, direction, 9999.0, /* output_sequence*/ false);
 }
 
@@ -151,11 +144,10 @@ TEST(GRUTest, ForwardDefaultActivationsSimpleWeightsNoBiasTwoRows) {
       0.6027093f, 0.5083023f, 0.44950223f,
       0.5754369f, 0.45485455f, 0.3747841f};
 
-  bool run_on_gpu = true;
-  DefaultActivationsSimpleWeightsNoBias(run_on_gpu, "forward", Y_data, Y_h_data);
+  DefaultActivationsSimpleWeightsNoBias("forward", Y_data, Y_h_data);
 
   // test Y_h not being returned
-  DefaultActivationsSimpleWeightsNoBias(run_on_gpu, "forward", Y_data, {});
+  DefaultActivationsSimpleWeightsNoBias("forward", Y_data, {});
 }
 
 TEST(GRUTest, ReverseDefaultActivationsSimpleWeightsNoBiasTwoRows) {
@@ -170,8 +162,7 @@ TEST(GRUTest, ReverseDefaultActivationsSimpleWeightsNoBiasTwoRows) {
       0.6082785f, 0.50623393f, 0.4426924f,
       0.5803454f, 0.4527356f, 0.36886263f};
 
-  bool run_on_gpu = false;  // cudnn implementation only support linear_before_reset = true
-  DefaultActivationsSimpleWeightsNoBias(run_on_gpu, "reverse", Y_data, Y_h_data);
+  DefaultActivationsSimpleWeightsNoBias("reverse", Y_data, Y_h_data);
 }
 
 TEST(GRUTest, BidirectionalDefaultActivationsSimpleWeightsNoBiasTwoRows) {
@@ -201,12 +192,10 @@ TEST(GRUTest, BidirectionalDefaultActivationsSimpleWeightsNoBiasTwoRows) {
       0.6082785f, 0.50623393f, 0.4426924f,
       0.5803454f, 0.4527356f, 0.36886263f};
 
-  bool run_on_gpu = true;
-  DefaultActivationsSimpleWeightsNoBias(run_on_gpu, "bidirectional", Y_data, Y_h_data);
+  DefaultActivationsSimpleWeightsNoBias("bidirectional", Y_data, Y_h_data);
 }
 
-void DefaultActivationsSimpleWeightsWithBias(bool run_on_gpu,
-                                             std::string direction,
+void DefaultActivationsSimpleWeightsWithBias(std::string direction,
                                              const std::vector<float>& Y_data,
                                              bool linear_before_reset = false,
                                              bool one_row = false) {
@@ -250,7 +239,7 @@ void DefaultActivationsSimpleWeightsWithBias(bool run_on_gpu,
 
   std::vector<float> R_data(num_directions * 3 * hidden_size * hidden_size, 0.1f);
 
-  RunGruTest(run_on_gpu, X_data, W_data, R_data, Y_data, {}, input_size, batch_size, hidden_size, seq_length,
+  RunGruTest(X_data, W_data, R_data, Y_data, {}, input_size, batch_size, hidden_size, seq_length,
              &B_data, nullptr, nullptr, direction, 999.f, /* output_sequence*/ true, linear_before_reset);
 }  // namespace test
 
@@ -262,8 +251,7 @@ TEST(GRUTest, ForwardDefaultActivationsSimpleWeightsWithBiasBatchParallel) {
       0.22688604f, -0.19698407f, 0.14017843f,
       0.33386092f, -0.15799662f, 0.2381169f};
 
-  bool run_on_gpu = false;
-  DefaultActivationsSimpleWeightsWithBias(run_on_gpu, "forward", Y_data);
+  DefaultActivationsSimpleWeightsWithBias("forward", Y_data);
 }
 
 TEST(GRUTest, ForwardDefaultActivationsSimpleWeightsWithBiasBatchParallelLinearBeforeReset) {
@@ -274,9 +262,8 @@ TEST(GRUTest, ForwardDefaultActivationsSimpleWeightsWithBiasBatchParallelLinearB
       0.19538902f, -0.19016478f, -0.05644283f,
       0.30856851f, -0.15190377f, 0.05999807f};
 
-  bool run_on_gpu = true;
   const bool linear_before_reset = true;
-  DefaultActivationsSimpleWeightsWithBias(run_on_gpu, "forward", Y_data, linear_before_reset);
+  DefaultActivationsSimpleWeightsWithBias("forward", Y_data, linear_before_reset);
 }
 
 TEST(GRUTest, ReverseDefaultActivationsSimpleWeightsWithBiasBatchParallelLinearBeforeReset) {
@@ -287,9 +274,8 @@ TEST(GRUTest, ReverseDefaultActivationsSimpleWeightsWithBiasBatchParallelLinearB
       0.12252139f, -0.12032216f, -0.05064924f,
       0.21249877f, -0.08884402f, 0.04751285f};
 
-  bool run_on_gpu = false;
   const bool linear_before_reset = true;
-  DefaultActivationsSimpleWeightsWithBias(run_on_gpu, "reverse", Y_data, linear_before_reset);
+  DefaultActivationsSimpleWeightsWithBias("reverse", Y_data, linear_before_reset);
 }
 
 // test forward !batch_parallel_ path with linear_before_reset
@@ -298,10 +284,9 @@ TEST(GRUTest, ForwardDefaultActivationsSimpleWeightsWithBiasLinearBeforeReset) {
       0.15024948f, -0.11097029f, -0.02121867f,
       0.19538902f, -0.19016478f, -0.05644283f};
 
-  bool run_on_gpu = true;
   const bool linear_before_reset = true;
   const bool one_row = true;
-  DefaultActivationsSimpleWeightsWithBias(run_on_gpu, "forward", Y_data, linear_before_reset, one_row);
+  DefaultActivationsSimpleWeightsWithBias("forward", Y_data, linear_before_reset, one_row);
 }
 
 // test reverse !batch_parallel_ path with linear_before_reset
@@ -310,10 +295,9 @@ TEST(GRUTest, ReverseDefaultActivationsSimpleWeightsWithBiasLinearBeforeReset) {
       0.20910699f, -0.18880953f, -0.04005555f,
       0.12252139f, -0.12032216f, -0.05064924f};
 
-  bool run_on_gpu = false;
   const bool linear_before_reset = true;
   const bool one_row = true;
-  DefaultActivationsSimpleWeightsWithBias(run_on_gpu, "reverse", Y_data, linear_before_reset, one_row);
+  DefaultActivationsSimpleWeightsWithBias("reverse", Y_data, linear_before_reset, one_row);
 }
 
 /*******************
@@ -331,8 +315,7 @@ class DeepCpuGruOpTestContext {
 
   ~DeepCpuGruOpTestContext() = default;
 
-  void RunTest(bool run_on_gpu,
-               const std::vector<float>& X,
+  void RunTest(const std::vector<float>& X,
                const int batch,
                const int seq_length,
                const std::vector<int>& sequence_length,
@@ -467,8 +450,7 @@ DeepCpuGruOpTestContext::DeepCpuGruOpTestContext(const std::string direction,
   }
 }
 
-void DeepCpuGruOpTestContext::RunTest(bool run_on_gpu,
-                                      const std::vector<float>& X,
+void DeepCpuGruOpTestContext::RunTest(const std::vector<float>& X,
                                       const int batch_size,
                                       const int seq_length,
                                       const std::vector<int>& sequence_lens,
@@ -476,7 +458,7 @@ void DeepCpuGruOpTestContext::RunTest(bool run_on_gpu,
                                       const std::vector<float>& expected_Y,
                                       const std::vector<float>& expected_Y_h) {
   // run with and without output_sequence
-  ::onnxruntime::test::RunGruTest(run_on_gpu, X, gru_input_weights_, gru_recurrent_weights_,
+  ::onnxruntime::test::RunGruTest(X, gru_input_weights_, gru_recurrent_weights_,
                                   expected_Y, expected_Y_h,
                                   input_size_, batch_size, hidden_dim_, seq_length,
                                   use_bias_ ? &gru_bias_ : nullptr,
@@ -490,7 +472,7 @@ void DeepCpuGruOpTestContext::RunTest(bool run_on_gpu,
                                   alphas_,
                                   betas_);
 
-  ::onnxruntime::test::RunGruTest(run_on_gpu, X, gru_input_weights_, gru_recurrent_weights_,
+  ::onnxruntime::test::RunGruTest(X, gru_input_weights_, gru_recurrent_weights_,
                                   expected_Y, expected_Y_h,
                                   input_size_, batch_size, hidden_dim_, seq_length,
                                   use_bias_ ? &gru_bias_ : nullptr,
@@ -520,8 +502,7 @@ TEST(GRUTest, ONNXRuntime_TestGRUOpForwardBasic) {
   std::vector<float> expected_Y = {-0.03255286f, 0.0774838f, -0.05556786f, 0.0785508f};
   std::vector<float> expected_Y_h = {-0.05556786f, 0.0785508f};
 
-  bool run_on_gpu = true;
-  ctx.RunTest(run_on_gpu, X, batch, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
+  ctx.RunTest(X, batch, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
 }
 
 TEST(GRUTest, ONNXRuntime_TestGRUOpBackwardBasic) {
@@ -540,8 +521,7 @@ TEST(GRUTest, ONNXRuntime_TestGRUOpBackwardBasic) {
                                    -0.03255286f, 0.0774838f};
   std::vector<float> expected_Y_h = {-0.05556786f, 0.0785508f};
 
-  bool run_on_gpu = true;
-  ctx.RunTest(run_on_gpu, X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
+  ctx.RunTest(X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
 }
 
 TEST(GRUTest, ONNXRuntime_TestGRUOpBidirectionalBasic) {
@@ -564,8 +544,7 @@ TEST(GRUTest, ONNXRuntime_TestGRUOpBidirectionalBasic) {
   std::vector<float> expected_Y_h = {-0.05556786f, 0.0785508f,
                                      -0.05469977f, 0.1004222f};
 
-  bool run_on_gpu = true;
-  ctx.RunTest(run_on_gpu, X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
+  ctx.RunTest(X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
 }
 
 TEST(GRUTest, ONNXRuntime_TestGRUOpForwardActivation) {
@@ -584,8 +563,7 @@ TEST(GRUTest, ONNXRuntime_TestGRUOpForwardActivation) {
                                    0.3810334f, 0.4944591f};
   std::vector<float> expected_Y_h = {0.3810334f, 0.4944591f};
 
-  bool run_on_gpu = false;  // cudnn only support activation {sigmoid, tanh}
-  ctx.RunTest(run_on_gpu, X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
+  ctx.RunTest(X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
 }
 
 TEST(GRUTest, ONNXRuntime_TestGRUOpForwardInitialHiddenState) {
@@ -604,8 +582,7 @@ TEST(GRUTest, ONNXRuntime_TestGRUOpForwardInitialHiddenState) {
                                    0.07378622f, -0.02782359f};
   std::vector<float> expected_Y_h = {0.07378622f, -0.02782359f};
 
-  bool run_on_gpu = false;  // cudnn implementation only support linear_before_reset = true
-  ctx.RunTest(run_on_gpu, X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
+  ctx.RunTest(X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
 }
 
 TEST(GRUTest, ONNXRuntime_TestGRUOpForwardBatch) {
@@ -632,8 +609,7 @@ TEST(GRUTest, ONNXRuntime_TestGRUOpForwardBatch) {
   std::vector<float> expected_Y_h = {0.07378622f, -0.02782359f,
                                      -0.05556786f, 0.0785508f};
 
-  bool run_on_gpu = false;  // cudnn implementation only support linear_before_reset = true
-  ctx.RunTest(run_on_gpu, X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
+  ctx.RunTest(X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
 }
 
 TEST(GRUTest, ONNXRuntime_TestGRUOpGrowBatchSequenceLength) {
@@ -652,8 +628,7 @@ TEST(GRUTest, ONNXRuntime_TestGRUOpGrowBatchSequenceLength) {
                                    -0.05556786f, 0.0785508f};
   std::vector<float> expected_Y_h = {-0.05556786f, 0.0785508f};
 
-  bool run_on_gpu = false;
-  ctx.RunTest(run_on_gpu, X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
+  ctx.RunTest(X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
 
   const int batch2 = 2;
   const int seq_length2 = 2;
@@ -674,7 +649,7 @@ TEST(GRUTest, ONNXRuntime_TestGRUOpGrowBatchSequenceLength) {
   std::vector<float> expected_Y_h2 = {0.07378622f, -0.02782359f,
                                       -0.03255286f, 0.0774838f};
 
-  ctx.RunTest(run_on_gpu, X2, batch2, seq_length2, sequence_length2, &initial_h2, expected_Y2, expected_Y_h2);
+  ctx.RunTest(X2, batch2, seq_length2, sequence_length2, &initial_h2, expected_Y2, expected_Y_h2);
 }
 
 TEST(GRUTest, ONNXRuntime_TestGRUOpSingleBatchMultipleHiddenThreads) {
@@ -704,8 +679,7 @@ TEST(GRUTest, ONNXRuntime_TestGRUOpSingleBatchMultipleHiddenThreads) {
           0.437727744598091f, 0.451604294166264f, 0.40203814648622f, 0.416614999456787f};
   std::vector<float> expected_Y_h(expected_Y);
 
-  bool run_on_gpu = true;
-  ctx.RunTest(run_on_gpu, X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
+  ctx.RunTest(X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
 }
 
 TEST(GRUTest, ONNXRuntime_TestGRUPositiveActivationClipping) {
@@ -734,8 +708,7 @@ TEST(GRUTest, ONNXRuntime_TestGRUPositiveActivationClipping) {
 
   std::vector<float> expected_Y_h(expected_Y);
 
-  bool run_on_gpu = true;
-  ctx.RunTest(run_on_gpu, X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
+  ctx.RunTest(X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
 }
 
 TEST(GRUTest, ONNXRuntime_TestGRUPositiveActivationAlphaBeta) {
@@ -776,9 +749,8 @@ TEST(GRUTest, ONNXRuntime_TestGRUPositiveActivationAlphaBeta) {
 
   std::vector<float> expected_Y_h(expected_Y);
 
-  bool run_on_gpu = false;  // cudnn implementation don't support the alpha & beta and customized activations
   DeepCpuGruOpTestContext ctx(direction, activations, true, alpha, beta, /*large_hidden*/ true, input_size);
-  ctx.RunTest(run_on_gpu, X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
+  ctx.RunTest(X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h);
 }
 
 }  // namespace test
diff --git a/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc b/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc
index e90baafdae..153d4ef603 100644
--- a/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc
+++ b/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc
@@ -23,8 +23,7 @@ T DuplicateContainer(const T& container) {
   return doubled;
 }
 
-static void RunLstmTest(bool run_on_gpu,
-                        const std::vector<float>& X_data,
+static void RunLstmTest(const std::vector<float>& X_data,
                         const std::vector<float>& W_data,
                         const std::vector<float>& R_data,
                         const std::vector<float>& Y_data,
@@ -137,15 +136,10 @@ static void RunLstmTest(bool run_on_gpu,
     test.AddMissingOptionalOutput<float>();
   }
 
-  std::unordered_set<std::string> excluded_providers;
-  if (!run_on_gpu) {
-    excluded_providers.insert(kCudaExecutionProvider);
-  }
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", excluded_providers);
+  test.Run();
 }
 
-void SimpleWeightsNoBiasTwoRows(bool run_on_gpu,
-                                std::string direction,
+void SimpleWeightsNoBiasTwoRows(std::string direction,
                                 const std::vector<float>& Y_data,
                                 const std::vector<float>& Y_h_data,
                                 const std::vector<float>& Y_c_data,
@@ -171,14 +165,14 @@ void SimpleWeightsNoBiasTwoRows(bool run_on_gpu,
     W_data = DuplicateContainer(W_data);
   }
 
-  RunLstmTest(run_on_gpu, X_data, W_data, R_data, Y_data, Y_h_data, Y_c_data,
+  RunLstmTest(X_data, W_data, R_data, Y_data, Y_h_data, Y_c_data,
               input_size, batch_size, hidden_size, seq_length,
               nullptr, nullptr, nullptr, nullptr, seq_lengths, direction);
 
   // need at least one output, so we need Y_h or Y_c to be requested (non-empty output to compare against) in order
   // to test Y not being returned (output_sequence == false)
   if (!Y_h_data.empty() || !Y_c_data.empty())
-    RunLstmTest(run_on_gpu, X_data, W_data, R_data, Y_data, Y_h_data, Y_c_data,
+    RunLstmTest(X_data, W_data, R_data, Y_data, Y_h_data, Y_c_data,
                 input_size, batch_size, hidden_size, seq_length,
                 nullptr, nullptr, nullptr, nullptr, seq_lengths, direction, 999.f, /* output_sequence*/ false);
 }
@@ -199,11 +193,10 @@ TEST(LSTMTest, ForwardSimpleWeightsNoBiasTwoRows) {
       1.27731147f, 1.44181041f, 1.53179041f,
       1.3249796f, 1.51063104f, 1.61451544f};
 
-  bool run_on_gpu = true;
-  SimpleWeightsNoBiasTwoRows(run_on_gpu, "forward", Y_data, Y_h_data, Y_c_data);
+  SimpleWeightsNoBiasTwoRows("forward", Y_data, Y_h_data, Y_c_data);
 
   // test Y_h and Y_c being optional
-  SimpleWeightsNoBiasTwoRows(run_on_gpu, "forward", Y_data, {}, {});
+  SimpleWeightsNoBiasTwoRows("forward", Y_data, {}, {});
 }
 
 TEST(LSTMTest, ReverseSimpleWeightsNoBiasTwoRows) {
@@ -222,8 +215,7 @@ TEST(LSTMTest, ReverseSimpleWeightsNoBiasTwoRows) {
       1.27850552f, 1.46799496f, 1.57641257f,
       1.34960834f, 1.54772296f, 1.65633056f};
 
-  bool run_on_gpu = true;
-  SimpleWeightsNoBiasTwoRows(run_on_gpu, "reverse", Y_data, Y_h_data, Y_c_data);
+  SimpleWeightsNoBiasTwoRows("reverse", Y_data, Y_h_data, Y_c_data);
 }
 
 TEST(LSTMTest, BidirectionalSimpleWeightsNoBiasTwoRows) {
@@ -257,8 +249,7 @@ TEST(LSTMTest, BidirectionalSimpleWeightsNoBiasTwoRows) {
       1.34960834f, 1.54772296f, 1.65633056f};
 
   // cudnn don't support customized activation
-  bool run_on_gpu = true;
-  SimpleWeightsNoBiasTwoRows(run_on_gpu, "bidirectional", Y_data, Y_h_data, Y_c_data);
+  SimpleWeightsNoBiasTwoRows("bidirectional", Y_data, Y_h_data, Y_c_data);
 }
 
 TEST(LSTMTest, MixedSequenceLengths) {
@@ -282,8 +273,7 @@ TEST(LSTMTest, MixedSequenceLengths) {
       1.3249796f, 1.51063104f, 1.61451544f};
 
   // Not able to mask on Y_c for CUDA using cudnn lib
-  bool run_on_gpu = false;
-  SimpleWeightsNoBiasTwoRows(run_on_gpu, "forward", Y_data, Y_h_data, Y_c_data, &seq_lengths);
+  SimpleWeightsNoBiasTwoRows("forward", Y_data, Y_h_data, Y_c_data, &seq_lengths);
 
   // swap which one is short
   seq_lengths = {2, 1};
@@ -303,7 +293,7 @@ TEST(LSTMTest, MixedSequenceLengths) {
       1.27731147f, 1.44181041f, 1.53179041f,
       0.54983425f, 0.59868795f, 0.64565659f};
 
-  SimpleWeightsNoBiasTwoRows(run_on_gpu, "forward", Y_data, Y_h_data, Y_c_data, &seq_lengths);
+  SimpleWeightsNoBiasTwoRows("forward", Y_data, Y_h_data, Y_c_data, &seq_lengths);
 }
 
 TEST(LSTMTest, MixedSequenceLengthsReverse) {
@@ -326,8 +316,7 @@ TEST(LSTMTest, MixedSequenceLengthsReverse) {
       0.52497941f, 0.54983425f, 0.5744428f,
       1.34960834f, 1.54772296f, 1.65633056f};
 
-  bool run_on_gpu = false;
-  SimpleWeightsNoBiasTwoRows(run_on_gpu, "reverse", Y_data, Y_h_data, Y_c_data, &seq_lengths);
+  SimpleWeightsNoBiasTwoRows("reverse", Y_data, Y_h_data, Y_c_data, &seq_lengths);
 
   // swap which one is short
   seq_lengths = {2, 1};
@@ -347,7 +336,7 @@ TEST(LSTMTest, MixedSequenceLengthsReverse) {
       1.27850552f, 1.46799496f, 1.57641257f,
       0.54983425f, 0.59868795f, 0.64565659f};
 
-  SimpleWeightsNoBiasTwoRows(run_on_gpu, "reverse", Y_data, Y_h_data, Y_c_data, &seq_lengths);
+  SimpleWeightsNoBiasTwoRows("reverse", Y_data, Y_h_data, Y_c_data, &seq_lengths);
 }
 
 // test path in LSTM model where batch_parallel_ is false and there are multiple steps (seq_length > 1)
@@ -374,13 +363,12 @@ TEST(LSTMTest, BatchParallelFalseSeqLengthGreaterThanOne) {
   std::vector<float> Y_c_data{
       1.02721067f, 1.15254318f};
 
-  bool run_on_gpu = true;
-  RunLstmTest(run_on_gpu, X_data, W_data, R_data, Y_data, {}, Y_c_data,
+  RunLstmTest(X_data, W_data, R_data, Y_data, {}, Y_c_data,
               input_size, batch_size, hidden_size, seq_length);
 }
 
 // make sure GateComputations works correctly if batch_parallel_ is true due to large batch size
-static void LargeBatchWithClip(bool run_on_gpu, const std::vector<float>& Y_h_data, float clip = 9999.0) {
+static void LargeBatchWithClip(const std::vector<float>& Y_h_data, float clip = 9999.0) {
   int64_t seq_length = 2;
   int batch_size = 32;
   int64_t input_size = 1;
@@ -401,7 +389,7 @@ static void LargeBatchWithClip(bool run_on_gpu, const std::vector<float>& Y_h_da
 
   std::vector<float> R_data(num_directions * 4 * hidden_size * hidden_size, 0.1f);
 
-  RunLstmTest(run_on_gpu, X_data, W_data, R_data, {}, Y_h_data, {},
+  RunLstmTest(X_data, W_data, R_data, {}, Y_h_data, {},
               input_size, batch_size, hidden_size, seq_length,
               nullptr, nullptr, nullptr, nullptr, nullptr, direction, clip);
 }
@@ -441,8 +429,7 @@ TEST(LSTMTest, LargeBatchNoClipping) {
       0.96073964f, 0.96388402f, 0.96402112f,
       0.96105254f, 0.96391004f, 0.96402279f};
 
-  bool run_on_gpu = true;
-  LargeBatchWithClip(run_on_gpu, Y_h_data);
+  LargeBatchWithClip(Y_h_data);
 }
 
 // make sure GateComputations with clipping works correctly if batch_parallel_ is true due to large batch size
@@ -481,8 +468,7 @@ TEST(LSTMTest, LargeBatchWithClip) {
       0.94072091f, 0.94266769f, 0.94266769f,
       0.94103248f, 0.94266769f, 0.94266769f};
 
-  bool run_on_gpu = false;
-  LargeBatchWithClip(run_on_gpu, Y_h_data, 4.f);
+  LargeBatchWithClip(Y_h_data, 4.f);
 }
 
 // ONNXRuntime tests
@@ -608,8 +594,7 @@ class LstmOpContext2x1x2x2 {
     // RunTest(seq_len, batch_size, num_direction, Y_data, output_first);
   }
 
-  void RunTest(bool run_on_gpu,
-               const std::vector<float>& X,
+  void RunTest(const std::vector<float>& X,
                const int batch_size,
                const int seq_length,
                const std::vector<float>* initial_h,
@@ -623,7 +608,7 @@ class LstmOpContext2x1x2x2 {
                float clip = 9999.f,
                bool input_forget = false) {
     // run with and without output_sequence to test UniDirectionalLstm handling when Y isn't returned
-    ::onnxruntime::test::RunLstmTest(run_on_gpu, X, input_weights_, recurrent_weights_,
+    ::onnxruntime::test::RunLstmTest(X, input_weights_, recurrent_weights_,
                                      expected_Y, expected_Y_h, expected_Y_c,
                                      input_size_, batch_size, hidden_size_, seq_length,
                                      use_bias ? &bias_ : nullptr,
@@ -638,7 +623,7 @@ class LstmOpContext2x1x2x2 {
                                      activation_alphas_,
                                      activation_betas_);
 
-    ::onnxruntime::test::RunLstmTest(run_on_gpu, X, input_weights_, recurrent_weights_,
+    ::onnxruntime::test::RunLstmTest(X, input_weights_, recurrent_weights_,
                                      expected_Y, expected_Y_h, expected_Y_c,
                                      input_size_, batch_size, hidden_size_, seq_length,
                                      use_bias ? &bias_ : nullptr,
@@ -681,8 +666,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMForwardPeepHole) {
 
   //Run Test
   LstmOpContext2x1x2x2 context(direction);
-  bool run_on_gpu = false;
-  context.RunTest(run_on_gpu, input, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data);
+  context.RunTest(input, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data);
 }
 
 TEST(LSTMTest, ONNXRuntime_TestLSTMBidirectionalBasic) {
@@ -700,8 +684,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMBidirectionalBasic) {
                                  -0.0753684f, 0.120794f};
 
   LstmOpContext2x1x2x2 context("bidirectional");
-  bool run_on_gpu = false;
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data);
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data);
 }
 
 TEST(LSTMTest, ONNXRuntime_TestLSTMForwardNoBiasUsePeepholes) {
@@ -718,8 +701,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMForwardNoBiasUsePeepholes) {
   std::vector<float> Y_c_data = {0.11169686f, 0.00625722f};
 
   LstmOpContext2x1x2x2 context("forward");
-  bool run_on_gpu = false;
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data, nullptr,
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data, nullptr,
                   use_bias, use_peepholes);
 }
 
@@ -740,8 +722,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMForwardInputForget) {
 
   LstmOpContext2x1x2x2 context("forward");
   // cudnn don't support peepholes
-  bool run_on_gpu = false;
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data, nullptr,
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data, nullptr,
                   use_bias, use_peepholes, clip, input_forget);
 }
 
@@ -760,8 +741,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMForwardClip) {
   std::vector<float> Y_c_data = {-0.07415761f, 0.07395997f};
 
   LstmOpContext2x1x2x2 context("forward");
-  bool run_on_gpu = false;
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data, nullptr,
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data, nullptr,
                   use_bias, use_peepholes, clip);
 }
 
@@ -776,8 +756,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMBackward) {
   std::vector<float> Y_c_data = {-0.07536839f, 0.12079399f};
 
   LstmOpContext2x1x2x2 context("reverse");
-  bool run_on_gpu = false;
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data);
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data);
 }
 
 TEST(LSTMTest, ONNXRuntime_TestLSTMBackward_gpu) {
@@ -791,9 +770,8 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMBackward_gpu) {
   std::vector<float> Y_c_data = {-0.076699793f, 0.11975205f};
 
   LstmOpContext2x1x2x2 context("reverse");
-  bool run_on_gpu = true;
   // Disable peephole since cudnn doesn't support it
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data, nullptr, true, false);
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data, nullptr, true, false);
 }
 
 TEST(LSTMTest, ONNXRuntime_TestLSTMForwardHiddenState) {
@@ -811,8 +789,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMForwardHiddenState) {
   std::vector<float> Y_c_data = {-0.07285583f, -0.02545788f};
 
   LstmOpContext2x1x2x2 context("forward");
-  bool run_on_gpu = true;
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, &hidden_state, nullptr, Y_data, Y_h_data, Y_c_data,
+  context.RunTest(X_data, batch_size, seq_len, &hidden_state, nullptr, Y_data, Y_h_data, Y_c_data,
                   nullptr, use_bias, use_peepholes);
 }
 
@@ -832,8 +809,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMForwardCellState) {
   std::vector<float> Y_c_data = {0.06408449f, 0.03139432f};
 
   LstmOpContext2x1x2x2 context("forward");
-  bool run_on_gpu = true;
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, &hidden_state, &cell_state, Y_data, Y_h_data, Y_c_data,
+  context.RunTest(X_data, batch_size, seq_len, &hidden_state, &cell_state, Y_data, Y_h_data, Y_c_data,
                   nullptr, use_bias, use_peepholes);
 }
 
@@ -853,8 +829,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMActivation) {
   std::vector<float> Y_c_data = {0.1624992f, 0.04672481f};
 
   LstmOpContext2x1x2x2 context("forward", activations);
-  bool run_on_gpu = false;
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data,
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data,
                   nullptr, use_bias, use_peepholes);
 }
 
@@ -882,8 +857,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMBatchReallocation) {
   std::vector<float> Y_c_data = {0.1624992f, 0.04672481f};
 
   LstmOpContext2x1x2x2 context(direction, activations);
-  bool run_on_gpu = false;
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data,
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data,
                   nullptr, use_bias, use_peepholes);
 
   batch_size = 3;
@@ -912,7 +886,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMBatchReallocation) {
               0.23038f, -0.0239f,
               0.24572f, 0.051626f};
 
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data,
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data,
                   nullptr, use_bias, use_peepholes);
 }
 
@@ -945,8 +919,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMOutputWrite) {
 
   std::string direction = "bidirectional";
   LstmOpContext2x1x2x2 context(direction, activations);
-  bool run_on_gpu = false;
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data,
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data,
                   nullptr, use_bias, use_peepholes);
 
   batch_size = 3;
@@ -992,7 +965,7 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMOutputWrite) {
               0.22469461f, -0.02200207f,
               0.18284359f, -0.01078442f};
 
-  context.RunTest(run_on_gpu, X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data,
+  context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data,
                   nullptr, use_bias, use_peepholes);
 }