Re-enable some of the recently disabled cuda tests (#7873)

2026-07-21 19:18:55 +00:00 · 2021-06-03 14:28:30 -07:00 · 2021-06-03 14:28:30 -07:00 · 8f8b9302a2
commit 8f8b9302a2
parent a118da160d
6 changed files with 66 additions and 97 deletions
--- a/onnxruntime/test/framework/TestAllocatorManager.cc
+++ b/onnxruntime/test/framework/TestAllocatorManager.cc
@ -1,13 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

-#undef USE_CUDA  // TODO: Cuda is a shared library, so can't call any Cuda provider methods directly from here
-
 #include "test/framework/TestAllocatorManager.h"
 #include "core/framework/allocatormgr.h"
-#ifdef USE_CUDA
-#include "core/providers/cuda/cuda_allocator.h"
-#endif  //  USE_CUDA

 namespace onnxruntime {
 namespace test {
@ -99,14 +94,6 @@ AllocatorManager::AllocatorManager() {
 Status AllocatorManager::InitializeAllocators() {
  auto cpu_alocator = std::make_unique<CPUAllocator>();
  ORT_RETURN_IF_ERROR(RegisterAllocator(map_, std::move(cpu_alocator), std::numeric_limits<size_t>::max(), true));
-#ifdef USE_CUDA
-  auto cuda_alocator = std::make_unique<CUDAAllocator>(static_cast<OrtDevice::DeviceId>(0), CUDA);
-  ORT_RETURN_IF_ERROR(RegisterAllocator(map_, std::move(cuda_alocator), std::numeric_limits<size_t>::max(), true));
-
-  auto cuda_pinned_alocator = std::make_unique<CUDAPinnedAllocator>(static_cast<OrtDevice::DeviceId>(0), CUDA_PINNED);
-  ORT_RETURN_IF_ERROR(RegisterAllocator(map_, std::move(cuda_pinned_alocator), std::numeric_limits<size_t>::max(), true));
-#endif  // USE_CUDA
-
  return Status::OK();
 }

--- a/onnxruntime/test/framework/cuda/fence_cuda_test.cc
+++ b/onnxruntime/test/framework/cuda/fence_cuda_test.cc
@ -1,6 +1,5 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-#if 0  // TODO: Can't call these directly from external code as Cuda is now a shared library
 #include "core/graph/onnx_protobuf.h"

 #include "core/session/inference_session.h"
@ -15,24 +14,25 @@
 #include "core/framework/execution_provider.h"
 #include "core/framework/op_kernel.h"
 #include "core/framework/session_state.h"
+#include "core/framework/tensorprotoutils.h"
 #include "core/graph/graph_viewer.h"
 #include "core/graph/model.h"
 #include "core/graph/op.h"
-#include "core/providers/cuda/cuda_execution_provider.h"
 #include "core/providers/cpu/math/element_wise_ops.h"
-#include "core/framework/tensorprotoutils.h"
 #include "test/capturing_sink.h"
 #include "test/test_environment.h"
 #include "test/framework/test_utils.h"
 #include "gtest/gtest.h"
 #include "core/util/protobuf_parsing_utils.h"
 #include "test/providers/provider_test_utils.h"
+#include "default_providers.h"
 #include "asserts.h"

 using namespace std;
 using namespace ONNX_NAMESPACE;
 using namespace onnxruntime::logging;

+namespace onnxruntime {
 namespace test {

 typedef std::vector<onnxruntime::NodeArg*> ArgMap;
@ -121,8 +121,7 @@ TEST(CUDAFenceTests, DISABLED_PartOnCPU) {
  SessionOptions so;
  FenceCudaTestInferenceSession session(so, GetEnvironment());
  LoadInferenceSessionFromModel(session, *model);
-  CUDAExecutionProviderInfo xp_info;
-  ASSERT_STATUS_OK(session.RegisterExecutionProvider(std::make_unique<CUDAExecutionProvider>(xp_info)));
+  ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
  ASSERT_TRUE(session.Initialize().IsOK());
  ASSERT_TRUE(1 == CountCopyNodes(graph));

@ -176,8 +175,7 @@ TEST(CUDAFenceTests, TileWithInitializer) {
  SessionOptions so;
  FenceCudaTestInferenceSession session(so, GetEnvironment());
  LoadInferenceSessionFromModel(session, *model);
-  CUDAExecutionProviderInfo xp_info;
-  ASSERT_STATUS_OK(session.RegisterExecutionProvider(std::make_unique<CUDAExecutionProvider>(xp_info)));
+  ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
  ASSERT_STATUS_OK(session.Initialize());

  vector<OrtValue> outputs;
@ -242,8 +240,7 @@ TEST(CUDAFenceTests, TileWithComputedInput) {
  SessionOptions so;
  FenceCudaTestInferenceSession session(so, GetEnvironment());
  LoadInferenceSessionFromModel(session, *model);
-  CUDAExecutionProviderInfo xp_info;
-  ASSERT_STATUS_OK(session.RegisterExecutionProvider(std::make_unique<CUDAExecutionProvider>(xp_info)));
+  ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
  ASSERT_TRUE(session.Initialize().IsOK());

  vector<OrtValue> outputs;
@ -263,4 +260,3 @@ TEST(CUDAFenceTests, TileWithComputedInput) {

 }  // namespace test
 }  // namespace onnxruntime
-#endif
--- a/onnxruntime/test/framework/inference_session_test.cc
+++ b/onnxruntime/test/framework/inference_session_test.cc
@ -1,6 +1,5 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-#undef USE_CUDA  // TODO: Cuda is a shared library, so can't call any Cuda provider methods directly from here

 #include "core/graph/onnx_protobuf.h"
 #include "core/session/inference_session.h"
@ -32,6 +31,7 @@
 #include "core/platform/env.h"
 #include "core/providers/cpu/cpu_execution_provider.h"
 #include "core/providers/cpu/math/element_wise_ops.h"
+#include "core/providers/cuda/cuda_provider_factory.h"
 #ifdef USE_CUDA
 #include "core/providers/cuda/gpu_data_transfer.h"
 #elif USE_ROCM
@ -66,6 +66,11 @@ struct KernelRegistryAndStatus {
 };
 }  // namespace
 namespace onnxruntime {
+
+#ifdef USE_CUDA
+ProviderInfo_CUDA* GetProviderInfo_CUDA();
+#endif
+
 class FuseAdd : public OpKernel {
 public:
  explicit FuseAdd(const OpKernelInfo& info) : OpKernel(info) {
@ -260,6 +265,7 @@ void RunModelWithBindingMatMul(InferenceSession& session_object,
                               ProviderType bind_provider_type,
                               bool is_preallocate_output_vec,
                               ProviderType allocation_provider,
+                               IExecutionProvider *gpu_provider,
                               OrtDevice* output_device) {
  unique_ptr<IOBinding> io_binding;
  Status st = session_object.NewIOBinding(&io_binding);
@ -307,16 +313,8 @@ void RunModelWithBindingMatMul(InferenceSession& session_object,
    if (allocation_provider == kCpuExecutionProvider) {
      AllocateMLValue<float>(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), expected_output_dims,
                             &output_ml_value);
-    } else if (allocation_provider == kCudaExecutionProvider) {
-#ifdef USE_CUDA
-      AllocateMLValue<float>(TestCudaExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), expected_output_dims,
-                             &output_ml_value);
-#endif
-    } else if (allocation_provider == kRocmExecutionProvider) {
-#ifdef USE_ROCM
-      AllocateMLValue<float>(TestRocmExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), expected_output_dims,
-                             &output_ml_value);
-#endif
+    } else if (allocation_provider == kCudaExecutionProvider || allocation_provider == kRocmExecutionProvider) {
+      AllocateMLValue<float>(gpu_provider->GetAllocator(0, OrtMemTypeDefault), expected_output_dims, &output_ml_value);
    } else {
      ORT_THROW("Unsupported provider");
    }
@ -354,11 +352,12 @@ void RunModelWithBindingMatMul(InferenceSession& session_object,
                                                                  shape,
                                                                  cpu_allocator);
 #ifdef USE_CUDA
-    cudaStream_t stream = static_cast<cudaStream_t>(static_cast<const onnxruntime::CUDAExecutionProvider*>(TestCudaExecutionProvider())->GetComputeStream());
+    cudaStream_t stream = static_cast<cudaStream_t>(gpu_provider->GetComputeStream());
+    st = GetProviderInfo_CUDA()->CreateGPUDataTransfer(stream)->CopyTensor(rtensor, *cpu_tensor.get(), 0);
 #elif USE_ROCM
-    hipStream_t stream = static_cast<hipStream_t>(static_cast<const onnxruntime::ROCMExecutionProvider*>(TestRocmExecutionProvider())->GetComputeStream());
-#endif
+    hipStream_t stream = static_cast<hipStream_t>(gpu_provider->GetComputeStream());
    st = GPUDataTransfer(stream).CopyTensor(rtensor, *cpu_tensor.get(), 0);
+#endif
    ASSERT_TRUE(st.IsOK());
    OrtValue ml_value;
    ml_value.Init(cpu_tensor.release(),
@ -367,14 +366,8 @@ void RunModelWithBindingMatMul(InferenceSession& session_object,
    VerifyOutputs({ml_value}, expected_output_dims, expected_values_mul_y);
 #endif
  } else {
-    if (allocation_provider == kCudaExecutionProvider) {
-#ifdef USE_CUDA
-      TestCudaExecutionProvider()->Sync();
-#endif
-    } else if (allocation_provider == kRocmExecutionProvider) {
-#ifdef USE_ROCM
-      TestRocmExecutionProvider()->Sync();
-#endif
+    if (allocation_provider == kCudaExecutionProvider || allocation_provider == kRocmExecutionProvider) {
+      gpu_provider->Sync();
    }
    VerifyOutputs(io_binding->GetOutputs(), expected_output_dims, expected_values_mul_y);
  }
@ -622,9 +615,7 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) {

  InferenceSession session_object(so, GetEnvironment());
 #ifdef USE_CUDA
-  CUDAExecutionProviderInfo epi;
-  epi.device_id = 0;
-  EXPECT_TRUE(session_object.RegisterExecutionProvider(std::make_unique<CUDAExecutionProvider>(epi)).IsOK());
+  ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
 #endif
  ASSERT_STATUS_OK(session_object.Load(MODEL_URI));
  ASSERT_STATUS_OK(session_object.Initialize());
@ -858,16 +849,21 @@ static void TestBindHelper(const std::string& log_str,
  so.session_log_verbosity_level = 1;  // change to 1 for detailed logging

  InferenceSession session_object{so, GetEnvironment()};
+  IExecutionProvider *gpu_provider{};

  if (bind_provider_type == kCudaExecutionProvider || bind_provider_type == kRocmExecutionProvider) {
 #ifdef USE_CUDA
-    CUDAExecutionProviderInfo epi;
-    epi.device_id = 0;
-    EXPECT_TRUE(session_object.RegisterExecutionProvider(std::make_unique<CUDAExecutionProvider>(epi)).IsOK());
+    auto provider = DefaultCudaExecutionProvider(); 
+    gpu_provider = provider.get();
+    ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(std::move(provider)));
 #elif USE_ROCM
    ROCMExecutionProviderInfo epi;
    epi.device_id = 0;
-    EXPECT_TRUE(session_object.RegisterExecutionProvider(std::make_unique<ROCMExecutionProvider>(epi)).IsOK());
+
+    auto provider = std::make_unique<ROCMExecutionProvider>(epi);
+    gpu_provider = provider.get();
+
+    ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(std::move(provider)));
 #endif
  }

@ -889,6 +885,7 @@ static void TestBindHelper(const std::string& log_str,
                            bind_provider_type,
                            preallocate_output,
                            allocation_provider,
+                            gpu_provider,
                            output_device);
 }

@ -1481,13 +1478,11 @@ TEST(InferenceSessionTests, Test3LayerNestedSubgraph) {
  InferenceSession session_object{so, GetEnvironment()};

 #ifdef USE_CUDA
-  CUDAExecutionProviderInfo epi;
-  epi.device_id = 0;
-  EXPECT_TRUE(session_object.RegisterExecutionProvider(std::make_unique<CUDAExecutionProvider>(epi)).IsOK());
+  ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
 #elif USE_ROCM
  ROCMExecutionProviderInfo epi;
  epi.device_id = 0;
-  EXPECT_TRUE(session_object.RegisterExecutionProvider(std::make_unique<ROCMExecutionProvider>(epi)).IsOK());
+  ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(std::make_unique<ROCMExecutionProvider>(epi)));
 #endif

  status = session_object.Load(model_file_name);
@ -1621,13 +1616,11 @@ TEST(InferenceSessionTests, Test2LayerNestedSubgraph) {
  InferenceSession session_object{so, GetEnvironment()};

 #ifdef USE_CUDA
-  CUDAExecutionProviderInfo epi;
-  epi.device_id = 0;
-  EXPECT_TRUE(session_object.RegisterExecutionProvider(std::make_unique<CUDAExecutionProvider>(epi)).IsOK());
+  ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
 #elif USE_ROCM
  ROCMExecutionProviderInfo epi;
  epi.device_id = 0;
-  EXPECT_TRUE(session_object.RegisterExecutionProvider(std::make_unique<ROCMExecutionProvider>(epi)).IsOK());
+  ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(std::make_unique<ROCMExecutionProvider>(epi)));
 #endif

  status = session_object.Load(model_file_name);
@ -1989,9 +1982,7 @@ TEST(InferenceSessionTests, TestParallelExecutionWithCudaProvider) {
  so.session_logid = "InferenceSessionTests.TestParallelExecutionWithCudaProvider";
  InferenceSession session_object{so, GetEnvironment()};

-  CUDAExecutionProviderInfo epi;
-  epi.device_id = 0;
-  EXPECT_TRUE(session_object.RegisterExecutionProvider(std::make_unique<CUDAExecutionProvider>(epi)).IsOK());
+  ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider()));

  ASSERT_STATUS_OK(session_object.Load(model_uri));

@ -2012,12 +2003,13 @@ TEST(InferenceSessionTests, TestArenaShrinkageAfterRun) {

  SessionOptions so;
  InferenceSession session_object{so, GetEnvironment()};
-  CUDAExecutionProviderInfo epi;
-  epi.default_memory_arena_cfg = &arena_cfg;
+  OrtCUDAProviderOptions provider_options{};
+  provider_options.default_memory_arena_cfg = &arena_cfg;
+  provider_options.device_id = 0;
+  auto factory = CreateExecutionProviderFactory_Cuda(&provider_options);

-  epi.device_id = 0;
  ASSERT_STATUS_OK(session_object.Load(MODEL_URI));
-  EXPECT_TRUE(session_object.RegisterExecutionProvider(std::make_unique<CUDAExecutionProvider>(epi)).IsOK());
+  ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(factory->CreateProvider()));
  ASSERT_STATUS_OK(session_object.Initialize());

  // Fetch the CUDA allocator to analyze its stats
--- a/onnxruntime/test/framework/memcpy_transformer_test.cc
+++ b/onnxruntime/test/framework/memcpy_transformer_test.cc
@ -1,12 +1,12 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-#undef USE_CUDA  // TODO: Cuda is a shared library, so can't call any Cuda provider methods directly from here

 #include <iterator>

 #include "core/framework/execution_providers.h"
 #include "core/optimizer/transformer_memcpy.h"
 #include "core/graph/model.h"
+#include "default_providers.h"
 #include "gtest/gtest.h"
 #include "test_utils.h"
 #include "test/test_environment.h"
@ -106,8 +106,7 @@ TEST(TransformerTest, MemcpyTransformerTest) {

  KernelRegistryManager kernel_registry_manager;
  ExecutionProviders execution_providers;
-  execution_providers.Add(onnxruntime::kCudaExecutionProvider,
-                          std::make_unique<CUDAExecutionProvider>(CUDAExecutionProviderInfo()));
+  execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider());
  execution_providers.Add(onnxruntime::kCpuExecutionProvider,
                          std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo()));
  KernelRegistryManager test_registry_manager;
@ -162,8 +161,7 @@ TEST(TransformerTest, MemcpyTransformerTestCudaFirst) {

  KernelRegistryManager kernel_registry_manager;
  ExecutionProviders execution_providers;
-  execution_providers.Add(onnxruntime::kCudaExecutionProvider,
-                          std::make_unique<CUDAExecutionProvider>(CUDAExecutionProviderInfo()));
+  execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider());
  execution_providers.Add(onnxruntime::kCpuExecutionProvider,
                          std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo()));
  KernelRegistryManager test_registry_manager;
@ -277,8 +275,7 @@ TEST(TransformerTest, TestCopyNodeInsertionInitializerInSubgraph) {

  KernelRegistryManager kernel_registry_manager;
  ExecutionProviders execution_providers;
-  execution_providers.Add(onnxruntime::kCudaExecutionProvider,
-                          std::make_unique<CUDAExecutionProvider>(CUDAExecutionProviderInfo()));
+  execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider());
  execution_providers.Add(onnxruntime::kCpuExecutionProvider,
                          std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo()));
  KernelRegistryManager test_registry_manager;
--- a/onnxruntime/test/util/default_providers.cc
+++ b/onnxruntime/test/util/default_providers.cc
@ -1,12 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

+#include <memory>
 #include "default_providers.h"
 #include "providers.h"
 #include "core/providers/cpu/cpu_provider_factory_creator.h"
-#ifdef USE_CUDA
-#include "core/providers/cuda/cuda_provider_factory_creator.h"
-#endif
 #ifdef USE_ROCM
 #include "core/providers/rocm/rocm_provider_factory_creator.h"
 #endif
@ -16,26 +14,6 @@
 #include "core/session/onnxruntime_cxx_api.h"

 namespace onnxruntime {
-
-std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_OpenVINO(
-    const char* device_type, bool enable_vpu_fast_compile, const char* device_id, size_t num_of_threads, bool use_compiled_network, const char* blob_dump_path);
-
-std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Cuda(const OrtCUDAProviderOptions* provider_options);
-std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Dnnl(int use_arena);
-std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_OpenVINO(const OrtOpenVINOProviderOptions* params);
-std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Nuphar(bool, const char*);
-std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Nnapi(uint32_t);
-std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Rknpu();
-std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Tensorrt(const OrtTensorRTProviderOptions* params);
-std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_MIGraphX(int device_id);
-std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_ACL(int use_arena);
-std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_ArmNN(int use_arena);
-std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_CoreML(uint32_t);
-
-// EP for internal testing
-std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_InternalTesting(
-    const std::unordered_set<std::string>& supported_ops);
-
 namespace test {

 std::unique_ptr<IExecutionProvider> DefaultCpuExecutionProvider(bool enable_arena) {
--- a/onnxruntime/test/util/include/default_providers.h
+++ b/onnxruntime/test/util/include/default_providers.h
@ -1,9 +1,28 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 #pragma once
+#include "core/providers/providers.h"
 #include "core/framework/execution_provider.h"

 namespace onnxruntime {
+
+std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_ACL(int use_arena);
+std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_ArmNN(int use_arena);
+std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_CoreML(uint32_t);
+std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Cuda(const OrtCUDAProviderOptions* provider_options);
+std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Dnnl(int use_arena);
+std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_MIGraphX(int device_id);
+std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Nnapi(uint32_t);
+std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Nuphar(bool, const char*);
+std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_OpenVINO(
+    const char* device_type, bool enable_vpu_fast_compile, const char* device_id, size_t num_of_threads, bool use_compiled_network, const char* blob_dump_path);
+std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_OpenVINO(const OrtOpenVINOProviderOptions* params);
+std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Rknpu();
+std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Tensorrt(const OrtTensorRTProviderOptions* params);
+
+// EP for internal testing
+std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_InternalTesting(const std::unordered_set<std::string>& supported_ops);
+
 namespace test {

 // unique_ptr providers with default values for session registration