From ef88dc912c6a6f192e82d2c4936f094857ec34dc Mon Sep 17 00:00:00 2001 From: Weixing Zhang Date: Fri, 2 Apr 2021 15:57:08 -0700 Subject: [PATCH] enable more unit tests for ROCM EP (#7222) --- cmake/onnxruntime_unittests.cmake | 2 +- .../test/contrib_ops/bias_dropout_op_test.cc | 4 ++-- .../providers/cpu/math/element_wise_ops_test.cc | 10 +++++++++- onnxruntime/test/providers/cpu/math/gemm_test.cc | 4 +++- .../cpu/reduction/reduction_ops_test.cc | 4 ++-- .../test/providers/cpu/tensor/gather_op_test.cc | 2 +- .../test/gradient/gradient_ops_test.cc | 4 ++-- .../test/graph/gradient_graph_builder_test.cc | 16 ++++++++++++---- .../training_ops/cpu/math/isfinite_ops_test.cc | 2 +- .../cpu/reduction/reduction_ops_test.cc | 6 +++--- .../cpu/tensor/gather_grad_op_test.cc | 5 +++-- 11 files changed, 39 insertions(+), 20 deletions(-) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 9486356930..82a1000e15 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -636,7 +636,7 @@ if (onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS) endif() if (onnxruntime_USE_ROCM) - target_include_directories(onnxruntime_test_all PRIVATE ${onnxruntime_ROCM_HOME}/include/hiprand ${onnxruntime_ROCM_HOME}/include/rocrand) + target_include_directories(onnxruntime_test_all PRIVATE ${onnxruntime_ROCM_HOME}/hipfft/include ${onnxruntime_ROCM_HOME}/include ${onnxruntime_ROCM_HOME}/hiprand/include ${onnxruntime_ROCM_HOME}/rocrand/include ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining) endif() set(test_data_target onnxruntime_test_all) diff --git a/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc b/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc index f28e313be1..243e3f2caf 100644 --- a/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc +++ b/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc @@ -25,8 +25,8 @@ using namespace onnxruntime::test; enum TrainingMode { TrainingFalse, TrainingTrue, NoTraining }; -// BiasDropout kernel is only implemented for CUDA -#ifdef USE_CUDA +// BiasDropout kernel is only implemented for CUDA/ROCM +#if defined(USE_CUDA) || defined(USE_ROCM) namespace { void RunBiasDropoutTest(const bool use_mask, const std::vector& input_shape, float ratio = -1.0f, TrainingMode training_mode = TrainingTrue, bool use_float16_ratio = false, bool has_residual = true) { diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc index 7d713357f5..e923e16854 100644 --- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc +++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc @@ -777,7 +777,7 @@ TEST(MathOpTest, Pow_double_int64) { test.Run(); } -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_ROCM) TEST(MathOpTest, Pow_float16_float16) { OpTester test("Pow", 12); std::vector dims{4}; @@ -787,7 +787,11 @@ TEST(MathOpTest, Pow_float16_float16) { test.AddOutput("Z", dims, MakeMLFloat16({1.0f, 256.0f, 2.0f, 1.0f})); std::vector> execution_providers; +#ifdef USE_CUDA execution_providers.push_back(DefaultCudaExecutionProvider()); +#elif USE_ROCM + execution_providers.push_back(DefaultRocmExecutionProvider()); +#endif test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } @@ -800,7 +804,11 @@ TEST(MathOpTest, Pow_float_float16) { test.AddOutput("Z", dims, MakeMLFloat16({1.0f, 256.0f, 2.0f, 1.0f})); std::vector> execution_providers; +#ifdef USE_CUDA execution_providers.push_back(DefaultCudaExecutionProvider()); +#elif USE_ROCM + execution_providers.push_back(DefaultRocmExecutionProvider()); +#endif test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } #endif diff --git a/onnxruntime/test/providers/cpu/math/gemm_test.cc b/onnxruntime/test/providers/cpu/math/gemm_test.cc index 3c9859d72b..bc86b3b23b 100644 --- a/onnxruntime/test/providers/cpu/math/gemm_test.cc +++ b/onnxruntime/test/providers/cpu/math/gemm_test.cc @@ -43,13 +43,15 @@ TEST(GemmOpTest, GemmNoTransBIsInitializer) { } // Only CUDA kernel has float 16 support -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_ROCM) TEST(GemmOpTest, GemmNoTrans_f16) { +#ifdef USE_CUDA int min_cuda_architecture = 530; if (!HasCudaEnvironment(min_cuda_architecture)) { LOGS_DEFAULT(WARNING) << "Hardware NOT support FP16"; return; } +#endif OpTester test("Gemm"); test.AddAttribute("transA", (int64_t)0); diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index 062f7b2821..8f3596dea2 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -1311,7 +1311,7 @@ TEST(ReductionOpTest, ReduceSum_int32) { test.Run(); } -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_ROCM) TEST(ReductionOpTest, ReduceSumHalfHalf) { OpTester test("ReduceSum"); test.AddAttribute("keepdims", (int64_t)0); @@ -1465,7 +1465,7 @@ TEST(ReductionOpTest, ReduceSum_batch_by_seq_by_128) { } } -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_ROCM) TEST(ReductionOpTest, ReduceSum_batch_by_seq_by_30528) { test_apex_reduce_sum(4 * 128, 30528); test_apex_reduce_sum(4 * 512, 30528); diff --git a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc index 8a8d74487c..fd83d41f6c 100644 --- a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc @@ -82,7 +82,7 @@ TEST(GatherOpTest, Gather_invalid_index_cpu) { {kCudaExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider, kNupharExecutionProvider, kTensorrtExecutionProvider}); } -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_ROCM) TEST(GatherOpTest, Gather_invalid_index_gpu) { OpTester test("Gather"); // Invalid index 3. data[3] does not exist. diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc index 75eb95f5a4..1057df0d03 100644 --- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc +++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc @@ -2082,7 +2082,7 @@ TEST(GradientUtilsTest, InPlaceAccumulatorFloat32) { test.Run(); } -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_ROCM) TEST(GradientUtilsTest, InPlaceAccumulatorFloat16) { OpTester test("InPlaceAccumulator", 1, onnxruntime::kMSDomain); @@ -2113,7 +2113,7 @@ TEST(GradientUtilsTest, ZeroGradientFloat32) { test.Run(); } -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_ROCM) TEST(GradientUtilsTest, ZeroGradientFloat16) { OpTester test("ZeroGradient", 1, onnxruntime::kMSDomain); diff --git a/orttraining/orttraining/test/graph/gradient_graph_builder_test.cc b/orttraining/orttraining/test/graph/gradient_graph_builder_test.cc index fdb2cb036f..c54a9aa51a 100644 --- a/orttraining/orttraining/test/graph/gradient_graph_builder_test.cc +++ b/orttraining/orttraining/test/graph/gradient_graph_builder_test.cc @@ -15,9 +15,13 @@ #include "orttraining/training_ops/cpu/controlflow/event_pool.h" // TODO: move with PipelineBatchPlanner -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_ROCM) #include "bert_toy_fetches.h" +#ifdef USE_CUDA #include "core/providers/cuda/cuda_execution_provider.h" +#elif USE_ROCM +#include "core/providers/rocm/rocm_execution_provider.h" +#endif #endif using namespace onnxruntime::logging; @@ -299,7 +303,7 @@ TEST(GradientGraphBuilderTest, TrainingSession_WithProfiler) { ASSERT_TRUE(count > 1); } -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_ROCM) static void RunBertTrainingWithChecks( const SessionOptions& so, const PathString& backprop_model_file) { @@ -316,9 +320,13 @@ static void RunBertTrainingWithChecks( auto model_metadata = res.second; std::cout << "Loaded " << model_metadata->graph_name << '\n'; +#ifdef USE_CUDA CUDAExecutionProviderInfo xp_info; ASSERT_STATUS_OK(training_session->RegisterExecutionProvider(onnxruntime::make_unique(xp_info))); - +#elif USE_ROCM + ROCMExecutionProviderInfo xp_info; + ASSERT_STATUS_OK(training_session->RegisterExecutionProvider(onnxruntime::make_unique(xp_info))); +#endif ASSERT_STATUS_OK(training_session->Initialize()); RunOptions run_options; @@ -494,7 +502,7 @@ TEST(GradientGraphBuilderTest, TrainingSession_BertToy) { PathString backprop_model_file; ASSERT_STATUS_OK(BuildBackPropGraph(model_path, config, backprop_model_file)); -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_ROCM) SessionOptions so; RunBertTrainingWithChecks(so, backprop_model_file); #endif diff --git a/orttraining/orttraining/test/training_ops/cpu/math/isfinite_ops_test.cc b/orttraining/orttraining/test/training_ops/cpu/math/isfinite_ops_test.cc index 65cb80a25a..0819f76a93 100644 --- a/orttraining/orttraining/test/training_ops/cpu/math/isfinite_ops_test.cc +++ b/orttraining/orttraining/test/training_ops/cpu/math/isfinite_ops_test.cc @@ -10,7 +10,7 @@ namespace onnxruntime { namespace test { -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_ROCM) TEST(IsFiniteTest, Float) { OpTester test("IsFinite", 1, kMSDomain); diff --git a/orttraining/orttraining/test/training_ops/cpu/reduction/reduction_ops_test.cc b/orttraining/orttraining/test/training_ops/cpu/reduction/reduction_ops_test.cc index 18218b76f2..b5af3b54e2 100644 --- a/orttraining/orttraining/test/training_ops/cpu/reduction/reduction_ops_test.cc +++ b/orttraining/orttraining/test/training_ops/cpu/reduction/reduction_ops_test.cc @@ -14,7 +14,7 @@ namespace onnxruntime { namespace test { -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_ROCM) void test_all_1d_true(size_t size) { std::unique_ptr p_data(new bool[size]); @@ -103,7 +103,7 @@ TEST_P(ReductionOpTest, ReduceAllL2) { test.Run(); } -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_ROCM) TEST_P(ReductionOpTest, ReduceAllL2HalfHalf) { OpTester test("ReduceAllL2", 1, onnxruntime::kMSDomain, true); test.SetDeterminism(GetParam()); @@ -345,7 +345,7 @@ TEST(ReductionOpTest, ReduceSumTraining_neg_axis) { test.Run(); } -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_ROCM) TEST(ReductionOpTest, ReduceSumTrainingHalfHalf) { OpTester test("ReduceSumTraining", 1, onnxruntime::kMSDomain); test.AddAttribute("keepdims", (int64_t)0); diff --git a/orttraining/orttraining/test/training_ops/cpu/tensor/gather_grad_op_test.cc b/orttraining/orttraining/test/training_ops/cpu/tensor/gather_grad_op_test.cc index fe093b7bc9..c6052a8015 100644 --- a/orttraining/orttraining/test/training_ops/cpu/tensor/gather_grad_op_test.cc +++ b/orttraining/orttraining/test/training_ops/cpu/tensor/gather_grad_op_test.cc @@ -97,13 +97,14 @@ void RunGatherGradTestWithRandomData( } } // namespace -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_ROCM) //TODO: Currently this cannot pass CI, due to GPU architecture problem TEST(GatherOpTest, Gather_axis0_indices2d_half) { +#ifdef USE_CUDA if (NeedSkipIfCudaArchLowerThan(700)) { return; } - +#endif OpTester test("Gather"); test.AddAttribute("axis", 0LL); test.AddInput("data", {3, 3},