diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 1e20226232..d6430d81ac 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -31,6 +31,8 @@ #include "core/providers/cpu/math/element_wise_ops.h" #ifdef USE_CUDA #include "core/providers/cuda/gpu_data_transfer.h" +#elif USE_ROCM +#include "core/providers/rocm/gpu_data_transfer.h" #endif #include "core/session/environment.h" #include "core/session/IOBinding.h" @@ -293,6 +295,11 @@ void RunModelWithBindingMatMul(InferenceSession& session_object, #ifdef USE_CUDA AllocateMLValue(TestCudaExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), expected_output_dims, &output_ml_value); +#endif + } else if (allocation_provider == kRocmExecutionProvider) { +#ifdef USE_ROCM + AllocateMLValue(TestRocmExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), expected_output_dims, + &output_ml_value); #endif } else { ORT_THROW("Unsupported provider"); @@ -317,9 +324,9 @@ void RunModelWithBindingMatMul(InferenceSession& session_object, std::cout << "Run returned status: " << st.ErrorMessage() << std::endl; ASSERT_TRUE(st.IsOK()); - if ((is_preallocate_output_vec && allocation_provider == kCudaExecutionProvider) || + if ((is_preallocate_output_vec && (allocation_provider == kCudaExecutionProvider || allocation_provider == kRocmExecutionProvider)) || (output_device && output_device->Type() == OrtDevice::GPU)) { -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_ROCM) // in this case we need to copy the tensor from cuda to cpu vector& outputs = io_binding->GetOutputs(); ASSERT_EQ(1, outputs.size()); @@ -330,7 +337,11 @@ void RunModelWithBindingMatMul(InferenceSession& session_object, std::unique_ptr cpu_tensor = onnxruntime::make_unique(element_type, shape, cpu_allocator); +#ifdef USE_CUDA cudaStream_t stream = static_cast(static_cast(TestCudaExecutionProvider())->GetComputeStream()); +#elif USE_ROCM + hipStream_t stream = static_cast(static_cast(TestRocmExecutionProvider())->GetComputeStream()); +#endif st = GPUDataTransfer(stream).CopyTensor(rtensor, *cpu_tensor.get(), 0); ASSERT_TRUE(st.IsOK()); OrtValue ml_value; @@ -343,6 +354,10 @@ void RunModelWithBindingMatMul(InferenceSession& session_object, if (allocation_provider == kCudaExecutionProvider) { #ifdef USE_CUDA TestCudaExecutionProvider()->Sync(); +#endif + } else if (allocation_provider == kRocmExecutionProvider) { +#ifdef USE_ROCM + TestRocmExecutionProvider()->Sync(); #endif } VerifyOutputs(io_binding->GetOutputs(), expected_output_dims, expected_values_mul_y); @@ -832,11 +847,15 @@ static void TestBindHelper(const std::string& log_str, InferenceSession session_object{so, GetEnvironment()}; - if (bind_provider_type == kCudaExecutionProvider || run_provider_type == kCudaExecutionProvider) { + if (bind_provider_type == kCudaExecutionProvider || bind_provider_type == kRocmExecutionProvider) { #ifdef USE_CUDA CUDAExecutionProviderInfo epi; epi.device_id = 0; EXPECT_TRUE(session_object.RegisterExecutionProvider(onnxruntime::make_unique(epi)).IsOK()); +#elif USE_ROCM + ROCMExecutionProviderInfo epi; + epi.device_id = 0; + EXPECT_TRUE(session_object.RegisterExecutionProvider(onnxruntime::make_unique(epi)).IsOK()); #endif } @@ -944,34 +963,38 @@ TEST(InferenceSessionTests, InvalidInputTypeOfTensorElement) { ASSERT_TRUE(!st.IsOK()); } -#ifdef USE_CUDA - +#if defined(USE_CUDA) || defined(USE_ROCM) +#if USE_CUDA +constexpr const char* kGpuExecutionProvider = kCudaExecutionProvider; +#elif USE_ROCM +constexpr const char* kGpuExecutionProvider = kRocmExecutionProvider; +#endif TEST(InferenceSessionTests, TestBindCuda) { TestBindHelper("TestBindCuda", - kCudaExecutionProvider, - kCudaExecutionProvider, + kGpuExecutionProvider, + kGpuExecutionProvider, false /* don't preallocate output */); } TEST(InferenceSessionTests, TestBindCudaPreallocateOutputOnCuda) { TestBindHelper("TestBindCudaPreallocateOutputOnCuda", - kCudaExecutionProvider, - kCudaExecutionProvider, + kGpuExecutionProvider, + kGpuExecutionProvider, true /* preallocate output on GPU */, - kCudaExecutionProvider); + kGpuExecutionProvider); } TEST(InferenceSessionTests, TestBindCudaPreallocateOutputOnCpu) { TestBindHelper("TestBindCudaPreallocateOutputOnCpu", - kCudaExecutionProvider, - kCudaExecutionProvider, + kGpuExecutionProvider, + kGpuExecutionProvider, true /* preallocate output on CPU */, kCpuExecutionProvider); } TEST(InferenceSessionTests, TestBindCudaPreallocateOutputOnCpu2) { TestBindHelper("TestBindCudaPreallocateOutputOnCpu2", - kCudaExecutionProvider, + kGpuExecutionProvider, kCpuExecutionProvider, true /* preallocate output on CPU */, kCpuExecutionProvider); @@ -981,10 +1004,10 @@ TEST(InferenceSessionTests, TestBindCudaSpecifyOutputDeviceOnCuda) { OrtDevice device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0); TestBindHelper("TestBindCudaPreallocateOutputOnCuda", - kCudaExecutionProvider, - kCudaExecutionProvider, + kGpuExecutionProvider, + kGpuExecutionProvider, false /* preallocate output on GPU */, - kCudaExecutionProvider, + kGpuExecutionProvider, &device /* specify output device */); } @@ -1390,6 +1413,10 @@ TEST(InferenceSessionTests, Test3LayerNestedSubgraph) { CUDAExecutionProviderInfo epi; epi.device_id = 0; EXPECT_TRUE(session_object.RegisterExecutionProvider(onnxruntime::make_unique(epi)).IsOK()); +#elif USE_ROCM + ROCMExecutionProviderInfo epi; + epi.device_id = 0; + EXPECT_TRUE(session_object.RegisterExecutionProvider(onnxruntime::make_unique(epi)).IsOK()); #endif status = session_object.Load(model_file_name); @@ -1526,6 +1553,10 @@ TEST(InferenceSessionTests, Test2LayerNestedSubgraph) { CUDAExecutionProviderInfo epi; epi.device_id = 0; EXPECT_TRUE(session_object.RegisterExecutionProvider(onnxruntime::make_unique(epi)).IsOK()); +#elif USE_ROCM + ROCMExecutionProviderInfo epi; + epi.device_id = 0; + EXPECT_TRUE(session_object.RegisterExecutionProvider(onnxruntime::make_unique(epi)).IsOK()); #endif status = session_object.Load(model_file_name); diff --git a/onnxruntime/test/framework/test_utils.cc b/onnxruntime/test/framework/test_utils.cc index b1da9ec940..228c565ab5 100644 --- a/onnxruntime/test/framework/test_utils.cc +++ b/onnxruntime/test/framework/test_utils.cc @@ -20,6 +20,14 @@ IExecutionProvider* TestCudaExecutionProvider() { } #endif +#ifdef USE_ROCM +IExecutionProvider* TestRocmExecutionProvider() { + static ROCMExecutionProviderInfo info; + static ROCMExecutionProvider rocm_provider(info); + return &rocm_provider; +} +#endif + #ifdef USE_TENSORRT #if 0 // TODO: TensorRT is shared, can't access these directly anymore IExecutionProvider* TestTensorrtExecutionProvider() { diff --git a/onnxruntime/test/framework/test_utils.h b/onnxruntime/test/framework/test_utils.h index 8d8fd8294d..09c3337c0e 100644 --- a/onnxruntime/test/framework/test_utils.h +++ b/onnxruntime/test/framework/test_utils.h @@ -15,6 +15,9 @@ #ifdef USE_CUDA #include "core/providers/cuda/cuda_execution_provider.h" #endif +#ifdef USE_ROCM +#include "core/providers/rocm/rocm_execution_provider.h" +#endif #ifdef USE_NNAPI #include "core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.h" #endif @@ -37,6 +40,10 @@ IExecutionProvider* TestCPUExecutionProvider(); IExecutionProvider* TestCudaExecutionProvider(); #endif +#ifdef USE_ROCM +IExecutionProvider* TestRocmExecutionProvider(); +#endif + #ifdef USE_TENSORRT // Doesn't work with ExecutionProviders class and KernelRegistryManager IExecutionProvider* TestTensorrtExecutionProvider(); diff --git a/orttraining/orttraining/test/session/training_session_test.cc b/orttraining/orttraining/test/session/training_session_test.cc index 4d9b4a9b96..578d4ce1ce 100644 --- a/orttraining/orttraining/test/session/training_session_test.cc +++ b/orttraining/orttraining/test/session/training_session_test.cc @@ -17,6 +17,8 @@ #ifdef USE_CUDA #include "core/providers/cuda/cuda_execution_provider.h" +#elif USE_ROCM +#include "core/providers/rocm/rocm_execution_provider.h" #endif using namespace onnxruntime::logging; @@ -62,7 +64,7 @@ TEST(TrainingSessionTest, LoadOptimState_FullPrecision_FP32Moments_Adam) { RunTrainingSessionLoadOptimTests(k_adam_optimizer_op_name, false, false); } -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_ROCM) TEST(TrainingSessionTest, LoadOptimState_MixedPrecision_FP32Moments_Adam) { RunTrainingSessionLoadOptimTests(k_adam_optimizer_op_name, true, false); } diff --git a/orttraining/orttraining/test/session/training_session_test_utils.cc b/orttraining/orttraining/test/session/training_session_test_utils.cc index bb2dd5d14d..d5bf7e7cf9 100644 --- a/orttraining/orttraining/test/session/training_session_test_utils.cc +++ b/orttraining/orttraining/test/session/training_session_test_utils.cc @@ -99,7 +99,7 @@ void VerifyState(const DataTransferManager& data_transfer_mgr, const NameMLValMa const auto& e_state_it = expected_state.find(key); ORT_ENFORCE(e_state_it != expected_state.end()); auto& expected_tensor = e_state_it->second.Get(); -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_ROCM) auto& actual_gpu_tensor = a_state_it.second.Get(); // Copying tensor to CPU when cuda is enabled. @@ -182,6 +182,9 @@ std::unique_ptr BuildAndRunTrainingSessionWithChecks( #ifdef USE_CUDA CUDAExecutionProviderInfo xp_info; ORT_THROW_IF_ERROR(training_session->RegisterExecutionProvider(onnxruntime::make_unique(xp_info))); +#elif USE_ROCM + ROCMExecutionProviderInfo xp_info; + ORT_THROW_IF_ERROR(training_session->RegisterExecutionProvider(onnxruntime::make_unique(xp_info))); #endif ORT_THROW_IF_ERROR(training_session->Initialize()); diff --git a/orttraining/orttraining/test/session/training_session_test_utils.h b/orttraining/orttraining/test/session/training_session_test_utils.h index e4b971c9db..008947ddf9 100644 --- a/orttraining/orttraining/test/session/training_session_test_utils.h +++ b/orttraining/orttraining/test/session/training_session_test_utils.h @@ -18,6 +18,8 @@ #ifdef USE_CUDA #include "core/providers/cuda/cuda_execution_provider.h" +#elif USE_ROCM +#include "core/providers/rocm/rocm_execution_provider.h" #endif namespace onnxruntime { diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu index f2138d82f7..160777bc77 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu +++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu @@ -322,7 +322,7 @@ __global__ void LambMultiTensorComputeDirectionImpl( const float alpha, const float beta, const float epsilon, - const T1 max_norm, + const float max_norm, const float alpha_correction, const float beta_correction) { const int group_index = chunk_group.block_index_to_tensor_group_index[blockIdx.x]; diff --git a/tools/ci_build/github/pai/pai-excluded-tests.txt b/tools/ci_build/github/pai/pai-excluded-tests.txt index 0564f28d22..37d0470127 100644 --- a/tools/ci_build/github/pai/pai-excluded-tests.txt +++ b/tools/ci_build/github/pai/pai-excluded-tests.txt @@ -39,23 +39,13 @@ ReductionOpTest.ReduceMin_int32 ReductionOpTest.ReduceMin_int8 ReductionOpTest.ReduceSum_double ReductionOpTest.ReduceSumSquare_double -ReductionOpTest.ReduceInfMax ReductionOpTest.ReduceInfMax_double -ReductionOpTest.ReduceInfMin ReductionOpTest.ReduceInfMin_double -ReductionOpTest.ReduceInfSum -ReductionOpTest.ReduceInfLogSum -ReductionOpTest.ReduceInfLogSumExp ReductionOpTest.ReduceInfLogSumExp_double GatherOpTest.Gather_invalid_index_cpu -Scatter.InvalidIndex -LogSoftmaxOperator.LargeNumber -MathOpTest.Pow_int64_float -MathOpTest.Pow_int32_float -MathOpTest.Pow_int64_double +Scatter.InvalidIndex GradientCheckerTest.AddGrad GradientCheckerTest.SubGrad GradientCheckerTest.MulGrad -GradientCheckerTest.ReduceMeanGrad GradientCheckerTest.ReduceL2Grad GradientCheckerTest.DivGrad