enable more unit tests for ROCM EP (#7222)

2026-07-24 19:43:35 +00:00 · 2021-04-02 15:57:08 -07:00 · 2021-04-02 15:57:08 -07:00 · ef88dc912c
commit ef88dc912c
parent afbbeaa30a
11 changed files with 39 additions and 20 deletions
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@ -636,7 +636,7 @@ if (onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS)
 endif()

 if (onnxruntime_USE_ROCM)
-  target_include_directories(onnxruntime_test_all PRIVATE ${onnxruntime_ROCM_HOME}/include/hiprand ${onnxruntime_ROCM_HOME}/include/rocrand)
+  target_include_directories(onnxruntime_test_all PRIVATE  ${onnxruntime_ROCM_HOME}/hipfft/include ${onnxruntime_ROCM_HOME}/include ${onnxruntime_ROCM_HOME}/hiprand/include ${onnxruntime_ROCM_HOME}/rocrand/include ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining)
 endif()

 set(test_data_target onnxruntime_test_all)
--- a/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc
+++ b/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc
@ -25,8 +25,8 @@ using namespace onnxruntime::test;

 enum TrainingMode { TrainingFalse, TrainingTrue, NoTraining };

-// BiasDropout kernel is only implemented for CUDA
-#ifdef USE_CUDA
+// BiasDropout kernel is only implemented for CUDA/ROCM
+#if defined(USE_CUDA) || defined(USE_ROCM)
 namespace {
 void RunBiasDropoutTest(const bool use_mask, const std::vector<int64_t>& input_shape, float ratio = -1.0f,
                        TrainingMode training_mode = TrainingTrue, bool use_float16_ratio = false, bool has_residual = true) {
--- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
@ -777,7 +777,7 @@ TEST(MathOpTest, Pow_double_int64) {
  test.Run();
 }

-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(MathOpTest, Pow_float16_float16) {
  OpTester test("Pow", 12);
  std::vector<int64_t> dims{4};
@ -787,7 +787,11 @@ TEST(MathOpTest, Pow_float16_float16) {
  test.AddOutput<MLFloat16>("Z", dims, MakeMLFloat16({1.0f, 256.0f, 2.0f, 1.0f}));

  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+#ifdef USE_CUDA
  execution_providers.push_back(DefaultCudaExecutionProvider());
+#elif defined(USE_ROCM)
+  execution_providers.push_back(DefaultRocmExecutionProvider());
+#endif  // USE_CUDA / USE_ROCM
  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }

@ -800,7 +804,11 @@ TEST(MathOpTest, Pow_float_float16) {
  test.AddOutput<MLFloat16>("Z", dims, MakeMLFloat16({1.0f, 256.0f, 2.0f, 1.0f}));

  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+#ifdef USE_CUDA
  execution_providers.push_back(DefaultCudaExecutionProvider());
+#elif defined(USE_ROCM)
+  execution_providers.push_back(DefaultRocmExecutionProvider());
+#endif  // USE_CUDA / USE_ROCM
  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
 #endif
--- a/onnxruntime/test/providers/cpu/math/gemm_test.cc
+++ b/onnxruntime/test/providers/cpu/math/gemm_test.cc
@ -43,13 +43,15 @@ TEST(GemmOpTest, GemmNoTransBIsInitializer) {
 }

-// Only CUDA kernel has float 16 support
+// Only CUDA/ROCM kernels have float 16 support
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(GemmOpTest, GemmNoTrans_f16) {
+#ifdef USE_CUDA
  int min_cuda_architecture = 530;
  if (!HasCudaEnvironment(min_cuda_architecture)) {
    LOGS_DEFAULT(WARNING) << "Hardware NOT support FP16";
    return;
  }
+#endif  // USE_CUDA
  OpTester test("Gemm");

  test.AddAttribute("transA", (int64_t)0);
--- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
@ -1311,7 +1311,7 @@ TEST(ReductionOpTest, ReduceSum_int32) {
  test.Run();
 }

-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(ReductionOpTest, ReduceSumHalfHalf) {
  OpTester test("ReduceSum");
  test.AddAttribute("keepdims", (int64_t)0);
@ -1465,7 +1465,7 @@ TEST(ReductionOpTest, ReduceSum_batch_by_seq_by_128) {
  }
 }

-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(ReductionOpTest, ReduceSum_batch_by_seq_by_30528) {
  test_apex_reduce_sum(4 * 128, 30528);
  test_apex_reduce_sum(4 * 512, 30528);
--- a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
@ -82,7 +82,7 @@ TEST(GatherOpTest, Gather_invalid_index_cpu) {
           {kCudaExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider, kNupharExecutionProvider, kTensorrtExecutionProvider});
 }

-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(GatherOpTest, Gather_invalid_index_gpu) {
  OpTester test("Gather");
  // Invalid index 3. data[3] does not exist.
--- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc
+++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc
@ -2082,7 +2082,7 @@ TEST(GradientUtilsTest, InPlaceAccumulatorFloat32) {
  test.Run();
 }

-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(GradientUtilsTest, InPlaceAccumulatorFloat16) {
  OpTester test("InPlaceAccumulator", 1, onnxruntime::kMSDomain);

@ -2113,7 +2113,7 @@ TEST(GradientUtilsTest, ZeroGradientFloat32) {
  test.Run();
 }

-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(GradientUtilsTest, ZeroGradientFloat16) {
  OpTester test("ZeroGradient", 1, onnxruntime::kMSDomain);

--- a/orttraining/orttraining/test/graph/gradient_graph_builder_test.cc
+++ b/orttraining/orttraining/test/graph/gradient_graph_builder_test.cc
@ -15,9 +15,13 @@

 #include "orttraining/training_ops/cpu/controlflow/event_pool.h"  // TODO: move with PipelineBatchPlanner

-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 #include "bert_toy_fetches.h"
+#ifdef USE_CUDA
 #include "core/providers/cuda/cuda_execution_provider.h"
+#elif defined(USE_ROCM)
+#include "core/providers/rocm/rocm_execution_provider.h"
+#endif  // USE_CUDA / USE_ROCM provider header
 #endif

 using namespace onnxruntime::logging;
@ -299,7 +303,7 @@ TEST(GradientGraphBuilderTest, TrainingSession_WithProfiler) {
  ASSERT_TRUE(count > 1);
 }

-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 static void RunBertTrainingWithChecks(
    const SessionOptions& so,
    const PathString& backprop_model_file) {
@ -316,9 +320,13 @@ static void RunBertTrainingWithChecks(
  auto model_metadata = res.second;
  std::cout << "Loaded " << model_metadata->graph_name << '\n';

+#ifdef USE_CUDA
  CUDAExecutionProviderInfo xp_info;
  ASSERT_STATUS_OK(training_session->RegisterExecutionProvider(onnxruntime::make_unique<CUDAExecutionProvider>(xp_info)));
-
+#elif defined(USE_ROCM)
+  ROCMExecutionProviderInfo xp_info;
+  ASSERT_STATUS_OK(training_session->RegisterExecutionProvider(onnxruntime::make_unique<ROCMExecutionProvider>(xp_info)));
+#endif  // USE_CUDA / USE_ROCM
  ASSERT_STATUS_OK(training_session->Initialize());

  RunOptions run_options;
@ -494,7 +502,7 @@ TEST(GradientGraphBuilderTest, TrainingSession_BertToy) {
  PathString backprop_model_file;
  ASSERT_STATUS_OK(BuildBackPropGraph(model_path, config, backprop_model_file));

-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
  SessionOptions so;
  RunBertTrainingWithChecks(so, backprop_model_file);
 #endif
--- a/orttraining/orttraining/test/training_ops/cpu/math/isfinite_ops_test.cc
+++ b/orttraining/orttraining/test/training_ops/cpu/math/isfinite_ops_test.cc
@ -10,7 +10,7 @@
 namespace onnxruntime {
 namespace test {

-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(IsFiniteTest, Float) {
  OpTester test("IsFinite", 1, kMSDomain);

--- a/orttraining/orttraining/test/training_ops/cpu/reduction/reduction_ops_test.cc
+++ b/orttraining/orttraining/test/training_ops/cpu/reduction/reduction_ops_test.cc
@ -14,7 +14,7 @@
 namespace onnxruntime {
 namespace test {

-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)

 void test_all_1d_true(size_t size) {
  std::unique_ptr<bool[]> p_data(new bool[size]);
@ -103,7 +103,7 @@ TEST_P(ReductionOpTest, ReduceAllL2) {
  test.Run();
 }

-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST_P(ReductionOpTest, ReduceAllL2HalfHalf) {
  OpTester test("ReduceAllL2", 1, onnxruntime::kMSDomain, true);
  test.SetDeterminism(GetParam());
@ -345,7 +345,7 @@ TEST(ReductionOpTest, ReduceSumTraining_neg_axis) {
  test.Run();
 }

-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(ReductionOpTest, ReduceSumTrainingHalfHalf) {
  OpTester test("ReduceSumTraining", 1, onnxruntime::kMSDomain);
  test.AddAttribute("keepdims", (int64_t)0);
--- a/orttraining/orttraining/test/training_ops/cpu/tensor/gather_grad_op_test.cc
+++ b/orttraining/orttraining/test/training_ops/cpu/tensor/gather_grad_op_test.cc
@ -97,13 +97,14 @@ void RunGatherGradTestWithRandomData(
 }
 }  // namespace

-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 //TODO: Currently this cannot pass CI, due to GPU architecture problem
 TEST(GatherOpTest, Gather_axis0_indices2d_half) {
+#ifdef USE_CUDA
  if (NeedSkipIfCudaArchLowerThan(700)) {
    return;
  }
-
+#endif  // USE_CUDA
  OpTester test("Gather");
  test.AddAttribute<int64_t>("axis", 0LL);
  test.AddInput<MLFloat16>("data", {3, 3},