From ef88dc912c6a6f192e82d2c4936f094857ec34dc Mon Sep 17 00:00:00 2001
From: Weixing Zhang <weixingzhang@users.noreply.github.com>
Date: Fri, 2 Apr 2021 15:57:08 -0700
Subject: [PATCH] enable more unit tests for ROCM EP (#7222)

---
 cmake/onnxruntime_unittests.cmake                |  2 +-
 .../test/contrib_ops/bias_dropout_op_test.cc     |  4 ++--
 .../providers/cpu/math/element_wise_ops_test.cc  | 10 +++++++++-
 onnxruntime/test/providers/cpu/math/gemm_test.cc |  4 +++-
 .../cpu/reduction/reduction_ops_test.cc          |  4 ++--
 .../test/providers/cpu/tensor/gather_op_test.cc  |  2 +-
 .../test/gradient/gradient_ops_test.cc           |  4 ++--
 .../test/graph/gradient_graph_builder_test.cc    | 16 ++++++++++++----
 .../training_ops/cpu/math/isfinite_ops_test.cc   |  2 +-
 .../cpu/reduction/reduction_ops_test.cc          |  6 +++---
 .../cpu/tensor/gather_grad_op_test.cc            |  5 +++--
 11 files changed, 39 insertions(+), 20 deletions(-)
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index 9486356930..82a1000e15 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -636,7 +636,7 @@ if (onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS)
 endif()
 
 if (onnxruntime_USE_ROCM)
-  target_include_directories(onnxruntime_test_all PRIVATE ${onnxruntime_ROCM_HOME}/include/hiprand ${onnxruntime_ROCM_HOME}/include/rocrand)
+  target_include_directories(onnxruntime_test_all PRIVATE  ${onnxruntime_ROCM_HOME}/hipfft/include ${onnxruntime_ROCM_HOME}/include ${onnxruntime_ROCM_HOME}/hiprand/include ${onnxruntime_ROCM_HOME}/rocrand/include ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining)
 endif()
 
 set(test_data_target onnxruntime_test_all)
diff --git a/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc b/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc
index f28e313be1..243e3f2caf 100644
--- a/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc
+++ b/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc
@@ -25,8 +25,8 @@ using namespace onnxruntime::test;
 
 enum TrainingMode { TrainingFalse, TrainingTrue, NoTraining };
 
-// BiasDropout kernel is only implemented for CUDA
-#ifdef USE_CUDA
+// BiasDropout kernel is only implemented for CUDA/ROCM
+#if defined(USE_CUDA) || defined(USE_ROCM)
 namespace {
 void RunBiasDropoutTest(const bool use_mask, const std::vector<int64_t>& input_shape, float ratio = -1.0f,
                         TrainingMode training_mode = TrainingTrue, bool use_float16_ratio = false, bool has_residual = true) {
diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
index 7d713357f5..e923e16854 100644
--- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
@@ -777,7 +777,7 @@ TEST(MathOpTest, Pow_double_int64) {
   test.Run();
 }
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(MathOpTest, Pow_float16_float16) {
   OpTester test("Pow", 12);
   std::vector<int64_t> dims{4};
@@ -787,7 +787,11 @@ TEST(MathOpTest, Pow_float16_float16) {
   test.AddOutput<MLFloat16>("Z", dims, MakeMLFloat16({1.0f, 256.0f, 2.0f, 1.0f}));
 
   std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+#ifdef USE_CUDA
   execution_providers.push_back(DefaultCudaExecutionProvider());
+#elif defined(USE_ROCM)
+  execution_providers.push_back(DefaultRocmExecutionProvider());
+#endif  // USE_CUDA / USE_ROCM
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
 
@@ -800,7 +804,11 @@ TEST(MathOpTest, Pow_float_float16) {
   test.AddOutput<MLFloat16>("Z", dims, MakeMLFloat16({1.0f, 256.0f, 2.0f, 1.0f}));
 
   std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+#ifdef USE_CUDA
   execution_providers.push_back(DefaultCudaExecutionProvider());
+#elif defined(USE_ROCM)
+  execution_providers.push_back(DefaultRocmExecutionProvider());
+#endif  // USE_CUDA / USE_ROCM
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
 #endif
diff --git a/onnxruntime/test/providers/cpu/math/gemm_test.cc b/onnxruntime/test/providers/cpu/math/gemm_test.cc
index 3c9859d72b..bc86b3b23b 100644
--- a/onnxruntime/test/providers/cpu/math/gemm_test.cc
+++ b/onnxruntime/test/providers/cpu/math/gemm_test.cc
@@ -43,13 +43,15 @@ TEST(GemmOpTest, GemmNoTransBIsInitializer) {
 }
 
-// Only CUDA kernel has float 16 support
+// Only CUDA and ROCM kernels have float16 support
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(GemmOpTest, GemmNoTrans_f16) {
+#ifdef USE_CUDA
   int min_cuda_architecture = 530;
   if (!HasCudaEnvironment(min_cuda_architecture)) {
     LOGS_DEFAULT(WARNING) << "Hardware NOT support FP16";
     return;
   }
+#endif
   OpTester test("Gemm");
 
   test.AddAttribute("transA", (int64_t)0);
diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
index 062f7b2821..8f3596dea2 100644
--- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
@@ -1311,7 +1311,7 @@ TEST(ReductionOpTest, ReduceSum_int32) {
   test.Run();
 }
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(ReductionOpTest, ReduceSumHalfHalf) {
   OpTester test("ReduceSum");
   test.AddAttribute("keepdims", (int64_t)0);
@@ -1465,7 +1465,7 @@ TEST(ReductionOpTest, ReduceSum_batch_by_seq_by_128) {
   }
 }
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(ReductionOpTest, ReduceSum_batch_by_seq_by_30528) {
   test_apex_reduce_sum(4 * 128, 30528);
   test_apex_reduce_sum(4 * 512, 30528);
diff --git a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
index 8a8d74487c..fd83d41f6c 100644
--- a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
@@ -82,7 +82,7 @@ TEST(GatherOpTest, Gather_invalid_index_cpu) {
            {kCudaExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider, kNupharExecutionProvider, kTensorrtExecutionProvider});
 }
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(GatherOpTest, Gather_invalid_index_gpu) {
   OpTester test("Gather");
   // Invalid index 3. data[3] does not exist.
diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc
index 75eb95f5a4..1057df0d03 100644
--- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc
+++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc
@@ -2082,7 +2082,7 @@ TEST(GradientUtilsTest, InPlaceAccumulatorFloat32) {
   test.Run();
 }
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(GradientUtilsTest, InPlaceAccumulatorFloat16) {
   OpTester test("InPlaceAccumulator", 1, onnxruntime::kMSDomain);
 
@@ -2113,7 +2113,7 @@ TEST(GradientUtilsTest, ZeroGradientFloat32) {
   test.Run();
 }
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(GradientUtilsTest, ZeroGradientFloat16) {
   OpTester test("ZeroGradient", 1, onnxruntime::kMSDomain);
 
diff --git a/orttraining/orttraining/test/graph/gradient_graph_builder_test.cc b/orttraining/orttraining/test/graph/gradient_graph_builder_test.cc
index fdb2cb036f..c54a9aa51a 100644
--- a/orttraining/orttraining/test/graph/gradient_graph_builder_test.cc
+++ b/orttraining/orttraining/test/graph/gradient_graph_builder_test.cc
@@ -15,9 +15,13 @@
 
 #include "orttraining/training_ops/cpu/controlflow/event_pool.h"  // TODO: move with PipelineBatchPlanner
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 #include "bert_toy_fetches.h"
+#ifdef USE_CUDA
 #include "core/providers/cuda/cuda_execution_provider.h"
+#elif defined(USE_ROCM)
+#include "core/providers/rocm/rocm_execution_provider.h"
+#endif  // USE_CUDA / USE_ROCM
 #endif
 
 using namespace onnxruntime::logging;
@@ -299,7 +303,7 @@ TEST(GradientGraphBuilderTest, TrainingSession_WithProfiler) {
   ASSERT_TRUE(count > 1);
 }
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 static void RunBertTrainingWithChecks(
     const SessionOptions& so,
     const PathString& backprop_model_file) {
@@ -316,9 +320,13 @@ static void RunBertTrainingWithChecks(
   auto model_metadata = res.second;
   std::cout << "Loaded " << model_metadata->graph_name << '\n';
 
+#ifdef USE_CUDA
   CUDAExecutionProviderInfo xp_info;
   ASSERT_STATUS_OK(training_session->RegisterExecutionProvider(onnxruntime::make_unique<CUDAExecutionProvider>(xp_info)));
-
+#elif defined(USE_ROCM)
+  ROCMExecutionProviderInfo xp_info;
+  ASSERT_STATUS_OK(training_session->RegisterExecutionProvider(onnxruntime::make_unique<ROCMExecutionProvider>(xp_info)));
+#endif  // USE_CUDA / USE_ROCM
   ASSERT_STATUS_OK(training_session->Initialize());
 
   RunOptions run_options;
@@ -494,7 +502,7 @@ TEST(GradientGraphBuilderTest, TrainingSession_BertToy) {
   PathString backprop_model_file;
   ASSERT_STATUS_OK(BuildBackPropGraph(model_path, config, backprop_model_file));
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
   SessionOptions so;
   RunBertTrainingWithChecks(so, backprop_model_file);
 #endif
diff --git a/orttraining/orttraining/test/training_ops/cpu/math/isfinite_ops_test.cc b/orttraining/orttraining/test/training_ops/cpu/math/isfinite_ops_test.cc
index 65cb80a25a..0819f76a93 100644
--- a/orttraining/orttraining/test/training_ops/cpu/math/isfinite_ops_test.cc
+++ b/orttraining/orttraining/test/training_ops/cpu/math/isfinite_ops_test.cc
@@ -10,7 +10,7 @@
 namespace onnxruntime {
 namespace test {
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(IsFiniteTest, Float) {
   OpTester test("IsFinite", 1, kMSDomain);
 
diff --git a/orttraining/orttraining/test/training_ops/cpu/reduction/reduction_ops_test.cc b/orttraining/orttraining/test/training_ops/cpu/reduction/reduction_ops_test.cc
index 18218b76f2..b5af3b54e2 100644
--- a/orttraining/orttraining/test/training_ops/cpu/reduction/reduction_ops_test.cc
+++ b/orttraining/orttraining/test/training_ops/cpu/reduction/reduction_ops_test.cc
@@ -14,7 +14,7 @@
 namespace onnxruntime {
 namespace test {
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 
 void test_all_1d_true(size_t size) {
   std::unique_ptr<bool[]> p_data(new bool[size]);
@@ -103,7 +103,7 @@ TEST_P(ReductionOpTest, ReduceAllL2) {
   test.Run();
 }
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST_P(ReductionOpTest, ReduceAllL2HalfHalf) {
   OpTester test("ReduceAllL2", 1, onnxruntime::kMSDomain, true);
   test.SetDeterminism(GetParam());
@@ -345,7 +345,7 @@ TEST(ReductionOpTest, ReduceSumTraining_neg_axis) {
   test.Run();
 }
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(ReductionOpTest, ReduceSumTrainingHalfHalf) {
   OpTester test("ReduceSumTraining", 1, onnxruntime::kMSDomain);
   test.AddAttribute("keepdims", (int64_t)0);
diff --git a/orttraining/orttraining/test/training_ops/cpu/tensor/gather_grad_op_test.cc b/orttraining/orttraining/test/training_ops/cpu/tensor/gather_grad_op_test.cc
index fe093b7bc9..c6052a8015 100644
--- a/orttraining/orttraining/test/training_ops/cpu/tensor/gather_grad_op_test.cc
+++ b/orttraining/orttraining/test/training_ops/cpu/tensor/gather_grad_op_test.cc
@@ -97,13 +97,14 @@ void RunGatherGradTestWithRandomData(
 }
 }  // namespace
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 //TODO: Currently this cannot pass CI, due to GPU architecture problem
 TEST(GatherOpTest, Gather_axis0_indices2d_half) {
+#ifdef USE_CUDA
   if (NeedSkipIfCudaArchLowerThan(700)) {
     return;
   }
-
+#endif
   OpTester test("Gather");
   test.AddAttribute<int64_t>("axis", 0LL);
   test.AddInput<MLFloat16>("data", {3, 3},