support MIGraphXEP to work with ROCMEP for inference on AMD GPU (#10368)

Co-authored-by: Weixing Zhang <wezhan@microsoft.com> Support MIGraphXEP to work with ROCMEP for inference on AMD GPU
2026-05-16 21:00:14 +00:00 · 2022-01-26 15:52:56 -08:00 · 2022-01-26 15:52:56 -08:00 · ea9c8a7cdc
commit ea9c8a7cdc
parent 389d2db1ce
14 changed files with 29 additions and 81 deletions
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@ -715,6 +715,7 @@ if (onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS)
  target_link_libraries(onnxruntime_test_all PRIVATE onnxruntime_language_interop onnxruntime_pyop)
 endif()
 if (onnxruntime_USE_ROCM)
+  target_compile_options(onnxruntime_test_all PRIVATE -D__HIP_PLATFORM_HCC__=1)
  target_include_directories(onnxruntime_test_all PRIVATE  ${onnxruntime_ROCM_HOME}/hipfft/include ${onnxruntime_ROCM_HOME}/include ${onnxruntime_ROCM_HOME}/hiprand/include ${onnxruntime_ROCM_HOME}/rocrand/include ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining)
 endif()
 if (onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
--- a/include/onnxruntime/core/framework/allocator.h
+++ b/include/onnxruntime/core/framework/allocator.h
@ -38,8 +38,6 @@ constexpr const char* CPU = "Cpu";
 constexpr const char* CUDA = "Cuda";
 constexpr const char* CUDA_PINNED = "CudaPinned";
 constexpr const char* DML = "DML";
-constexpr const char* MIGRAPHX = "MIGraphX";
-constexpr const char* MIGRAPHX_PINNED = "MIGraphXPinned";
 constexpr const char* OpenVINO_CPU = "OpenVINO_CPU";
 constexpr const char* OpenVINO_GPU = "OpenVINO_GPU";

--- a/onnxruntime/core/optimizer/transformer_memcpy.cc
+++ b/onnxruntime/core/optimizer/transformer_memcpy.cc
@ -236,7 +236,8 @@ void TransformerMemcpyImpl::ProcessDefs(onnxruntime::Node& node, const KernelReg
      else
        provider_output_defs_.insert(arg);
    }
-  } else if (node_provider_type != kCudaExecutionProvider && node_provider_type != kTensorrtExecutionProvider) {
+  } else if (node_provider_type != kCudaExecutionProvider && node_provider_type != kTensorrtExecutionProvider &&
+             node_provider_type != kRocmExecutionProvider && node_provider_type != kMIGraphXExecutionProvider) {
    // TODO: copy between devices? i.e. multiple GPUs
    if (node_provider_type != onnxruntime::kCpuExecutionProvider &&
        node_provider_type != onnxruntime::kVitisAIExecutionProvider &&
--- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
+++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
@ -102,9 +102,9 @@ std::shared_ptr<KernelRegistry> MIGraphXExecutionProvider::GetKernelRegistry() c
 }

 MIGraphXExecutionProvider::MIGraphXExecutionProvider(const MIGraphXExecutionProviderInfo& info)
-    : IExecutionProvider{onnxruntime::kMIGraphXExecutionProvider, true} {
+    : IExecutionProvider{onnxruntime::kMIGraphXExecutionProvider, true}, device_id_(info.device_id) {
  // Set GPU device to be used
-  HIP_CALL_THROW(hipSetDevice(info.device_id));
+  HIP_CALL_THROW(hipSetDevice(device_id_));
  t_ = migraphx::target(info.target_device.c_str());

  // whether fp16 is enable
@ -128,7 +128,7 @@ void MIGraphXExecutionProvider::RegisterAllocator(std::shared_ptr<AllocatorManag
  allocator_ = allocator_manager->GetAllocator(device_id_, OrtMemTypeDefault);
  if (nullptr == allocator_) {
    AllocatorCreationInfo default_memory_info(
-        [](OrtDevice::DeviceId device_id) { return CreateHIPAllocator(device_id, onnxruntime::MIGRAPHX); }, device_id_);
+        [](OrtDevice::DeviceId device_id) { return CreateROCMAllocator(device_id, onnxruntime::CUDA); }, device_id_);
    allocator_ = CreateAllocator(default_memory_info);
    allocator_manager->InsertAllocator(allocator_);
  }
@ -141,7 +141,7 @@ void MIGraphXExecutionProvider::RegisterAllocator(std::shared_ptr<AllocatorManag
  if (nullptr == hip_pinned_alloc) {
    AllocatorCreationInfo pinned_allocator_info(
        [](OrtDevice::DeviceId device_id) {
-          return CreateHIPPinnedAllocator(device_id, onnxruntime::MIGRAPHX_PINNED);
+          return CreateROCMPinnedAllocator(device_id, onnxruntime::CUDA_PINNED);
        },
        DEFAULT_CPU_ALLOCATOR_DEVICE_ID);
    hip_pinned_alloc = CreateAllocator(pinned_allocator_info);
--- a/onnxruntime/core/providers/migraphx/migraphx_provider_factory.cc
+++ b/onnxruntime/core/providers/migraphx/migraphx_provider_factory.cc
@ -32,34 +32,11 @@ std::unique_ptr<IExecutionProvider> MIGraphXProviderFactory::CreateProvider() {
  return std::make_unique<MIGraphXExecutionProvider>(info_);
 }

-// std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_MIGraphX(int device_id) {
-//   MIGraphXExecutionProviderInfo info;
-//   info.device_id = device_id;
-//   return std::make_shared<onnxruntime::MIGraphXProviderFactory>(info);
-// }
-
 std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_MIGraphX(const MIGraphXExecutionProviderInfo& info) {
  return std::make_shared<onnxruntime::MIGraphXProviderFactory>(info);
 }

-
-struct ProviderInfo_MIGRAPHX_Impl : ProviderInfo_MIGRAPHX {
-  std::unique_ptr<IAllocator> CreateHIPAllocator(int16_t device_id, const char* name) override {
-    return std::make_unique<HIPAllocator>(device_id, name);
-  }
-
-  std::unique_ptr<IAllocator> CreateHIPPinnedAllocator(int16_t device_id, const char* name) override {
-    return std::make_unique<HIPPinnedAllocator>(device_id, name);
-  }
-
-  std::unique_ptr<IDataTransfer> CreateGPUDataTransfer(void* stream) override {
-    return std::make_unique<GPUDataTransfer>(static_cast<hipStream_t>(stream));
-  }
-} g_info;
-
 struct MIGraphX_Provider : Provider {
-  void* GetInfo() override { return &g_info; }
-
  std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory(int device_id) override {
    MIGraphXExecutionProviderInfo info;
    info.device_id = device_id;
--- a/onnxruntime/core/providers/migraphx/migraphx_provider_factory.h
+++ b/onnxruntime/core/providers/migraphx/migraphx_provider_factory.h
@ -10,11 +10,5 @@ struct IExecutionProviderFactory;
 struct MIGraphXExecutionProviderInfo;
 enum class ArenaExtendStrategy : int32_t;
 struct MIGraphXExecutionProviderExternalAllocatorInfo;
-
-struct ProviderInfo_MIGRAPHX {
-  virtual std::unique_ptr<onnxruntime::IAllocator> CreateHIPAllocator(int16_t device_id, const char* name) = 0;
-  virtual std::unique_ptr<onnxruntime::IAllocator> CreateHIPPinnedAllocator(int16_t device_id, const char* name) = 0;
-  virtual std::unique_ptr<onnxruntime::IDataTransfer> CreateGPUDataTransfer(void* stream) = 0;
-};
 }

--- a/onnxruntime/core/providers/shared_library/provider_api.h
+++ b/onnxruntime/core/providers/shared_library/provider_api.h
@ -243,8 +243,8 @@ std::unique_ptr<IAllocator> CreateCPUAllocator(const OrtMemoryInfo& memory_info)
 std::unique_ptr<IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name);
 std::unique_ptr<IAllocator> CreateCUDAPinnedAllocator(int16_t device_id, const char* name);

-std::unique_ptr<IAllocator> CreateHIPAllocator(int16_t device_id, const char* name);
-std::unique_ptr<IAllocator> CreateHIPPinnedAllocator(int16_t device_id, const char* name);
+std::unique_ptr<IAllocator> CreateROCMAllocator(int16_t device_id, const char* name);
+std::unique_ptr<IAllocator> CreateROCMPinnedAllocator(int16_t device_id, const char* name);

 std::unique_ptr<IDataTransfer> CreateGPUDataTransfer(void* stream);

--- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
+++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
@ -323,12 +323,12 @@ std::unique_ptr<IDataTransfer> CreateGPUDataTransfer(void* stream) {
 #endif

 #ifdef USE_MIGRAPHX
-std::unique_ptr<IAllocator> CreateHIPAllocator(int16_t device_id, const char* name) {
-  return g_host->CreateHIPAllocator(device_id, name);
+std::unique_ptr<IAllocator> CreateROCMAllocator(int16_t device_id, const char* name) {
+  return g_host->CreateROCMAllocator(device_id, name);
 }

-std::unique_ptr<IAllocator> CreateHIPPinnedAllocator(int16_t device_id, const char* name) {
-  return g_host->CreateHIPPinnedAllocator(device_id, name);
+std::unique_ptr<IAllocator> CreateROCMPinnedAllocator(int16_t device_id, const char* name) {
+  return g_host->CreateROCMPinnedAllocator(device_id, name);
 }

 std::unique_ptr<IDataTransfer> CreateGPUDataTransfer(void* stream) {
--- a/onnxruntime/core/providers/shared_library/provider_interfaces.h
+++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h
@ -151,12 +151,6 @@ struct ProviderHost {
  virtual bool CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) = 0;
 #endif

-#ifdef USE_MIGRAPHX
-  virtual std::unique_ptr<IAllocator> CreateHIPAllocator(int16_t device_id, const char* name) = 0;
-  virtual std::unique_ptr<IAllocator> CreateHIPPinnedAllocator(int16_t device_id, const char* name) = 0;
-  virtual std::unique_ptr<IDataTransfer> CreateGPUDataTransfer(void* stream) = 0;
-#endif
-
 #ifdef USE_ROCM
  virtual std::unique_ptr<IAllocator> CreateROCMAllocator(int16_t device_id, const char* name) = 0;
  virtual std::unique_ptr<IAllocator> CreateROCMPinnedAllocator(int16_t device_id, const char* name) = 0;
--- a/onnxruntime/core/session/provider_bridge_ort.cc
+++ b/onnxruntime/core/session/provider_bridge_ort.cc
@ -91,8 +91,6 @@ namespace onnxruntime {

 ProviderInfo_CUDA* TryGetProviderInfo_CUDA();
 ProviderInfo_CUDA& GetProviderInfo_CUDA();
-ProviderInfo_MIGRAPHX* TryGetProviderInfo_MIGRAPHX();
-ProviderInfo_MIGRAPHX& GetProviderInfo_MIGRAPHX();
 ProviderInfo_ROCM* TryGetProviderInfo_ROCM();
 ProviderInfo_ROCM& GetProviderInfo_ROCM();
 ProviderHostCPU& GetProviderHostCPU();
@ -188,12 +186,6 @@ struct ProviderHostImpl : ProviderHost {
  bool CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return GetProviderInfo_CUDA().CudaCall_true(retCode, exprString, libName, successCode, msg); }
 #endif

-#ifdef USE_MIGRAPHX
-  std::unique_ptr<IAllocator> CreateHIPAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_MIGRAPHX().CreateHIPAllocator(device_id, name); }
-  std::unique_ptr<IAllocator> CreateHIPPinnedAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_MIGRAPHX().CreateHIPPinnedAllocator(device_id, name); }
-  std::unique_ptr<IDataTransfer> CreateGPUDataTransfer(void* stream) override { return GetProviderInfo_MIGRAPHX().CreateGPUDataTransfer(stream); }
-#endif
-
 #ifdef USE_ROCM
  std::unique_ptr<IAllocator> CreateROCMAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_ROCM().CreateROCMAllocator(device_id, name); }
  std::unique_ptr<IAllocator> CreateROCMPinnedAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_ROCM().CreateROCMPinnedAllocator(device_id, name); }
@ -1084,12 +1076,6 @@ std::unique_ptr<IAllocator> CreateCUDAPinnedAllocator(int16_t device_id, const c
  return nullptr;
 }

-std::unique_ptr<IAllocator> CreateHIPPinnedAllocator(int16_t device_id, const char* name) {
-  if (auto* info = onnxruntime::TryGetProviderInfo_MIGRAPHX())
-    return info->CreateHIPPinnedAllocator(device_id, name);
-
-  return nullptr;
-}
 std::unique_ptr<IAllocator> CreateROCMPinnedAllocator(int16_t device_id, const char* name) {
  if (auto* info = onnxruntime::TryGetProviderInfo_ROCM())
    return info->CreateROCMPinnedAllocator(device_id, name);
@ -1199,13 +1185,6 @@ ProviderInfo_CUDA& GetProviderInfo_CUDA() {
  ORT_THROW("CUDA Provider not available, can't get interface for it");
 }

-ProviderInfo_MIGRAPHX* TryGetProviderInfo_MIGRAPHX() {
-  if (auto* provider = s_library_migraphx.Get())
-    return reinterpret_cast<ProviderInfo_MIGRAPHX*>(provider->GetInfo());
-
-  return nullptr;
-}
-
 ProviderInfo_ROCM* TryGetProviderInfo_ROCM() {
  if (auto* provider = s_library_rocm.Get())
    return reinterpret_cast<ProviderInfo_ROCM*>(provider->GetInfo());
@ -1213,13 +1192,6 @@ ProviderInfo_ROCM* TryGetProviderInfo_ROCM() {
  return nullptr;
 }

-ProviderInfo_MIGRAPHX& GetProviderInfo_MIGRAPHX() {
-  if (auto* info = TryGetProviderInfo_MIGRAPHX())
-    return *info;
-
-  ORT_THROW("MIGRAPHX Provider not available, can't get interface for it");
-}
-
 ProviderInfo_ROCM& GetProviderInfo_ROCM() {
  if (auto* info = TryGetProviderInfo_ROCM())
    return *info;
--- a/onnxruntime/python/onnxruntime_inference_collection.py
+++ b/onnxruntime/python/onnxruntime_inference_collection.py
@ -349,6 +349,8 @@ class InferenceSession(Session):
        # Tensorrt can fall back to CUDA. All others fall back to CPU.
        if 'TensorrtExecutionProvider' in available_providers:
            self._fallback_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        elif 'MIGraphXExecutionProvider' in available_providers:
+            self._fallback_providers = ['ROCMExecutionProvider', 'CPUExecutionProvider']
        else:
            self._fallback_providers = ['CPUExecutionProvider']

--- a/onnxruntime/test/perftest/ort_test_session.cc
+++ b/onnxruntime/test/perftest/ort_test_session.cc
@ -385,6 +385,12 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
  } else if (provider_name == onnxruntime::kMIGraphXExecutionProvider) {
 #ifdef USE_MIGRAPHX
    Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(session_options, 0));
+
+    OrtROCMProviderOptions rocm_options;
+    rocm_options.miopen_conv_exhaustive_search = performance_test_config.run_config.cudnn_conv_algo;
+    rocm_options.do_copy_in_default_stream = !performance_test_config.run_config.do_cuda_copy_in_separate_stream;
+    // TODO: Support arena configuration for users of perf test
+    session_options.AppendExecutionProvider_ROCM(rocm_options);
 #else
    ORT_THROW("MIGraphX is not supported in this build\n");
 #endif
--- a/setup.py
+++ b/setup.py
@ -216,7 +216,7 @@ try:
                    if len(args) > 3:
                        subprocess.run(args, check=True, stdout=subprocess.PIPE)

-                dest = 'onnxruntime/capi/libonnxruntime_providers_tensorrt.so'
+                dest = 'onnxruntime/capi/libonnxruntime_providers_' + ('migraphx.so' if is_rocm else 'tensorrt.so')
                if path.isfile(dest):
                    result = subprocess.run(['patchelf', '--print-needed', dest],
                                            check=True, stdout=subprocess.PIPE, universal_newlines=True)
@ -253,21 +253,21 @@ except ImportError as error:
    print(error)
    bdist_wheel = None

-providers_cuda_or_rocm = 'libonnxruntime_providers_rocm.so' if is_rocm else 'libonnxruntime_providers_cuda.so'
-
+providers_cuda_or_rocm = 'libonnxruntime_providers_' + ('rocm.so' if is_rocm else 'cuda.so')
+providers_tensorrt_or_migraphx = 'libonnxruntime_providers_' + ('migraphx.so' if is_rocm else 'tensorrt.so')
 # Additional binaries
 if platform.system() == 'Linux':
    libs = ['onnxruntime_pybind11_state.so', 'libdnnl.so.2', 'libmklml_intel.so', 'libmklml_gnu.so', 'libiomp5.so',
            'mimalloc.so']
    dl_libs = ['libonnxruntime_providers_shared.so']
    dl_libs.append(providers_cuda_or_rocm)
-    dl_libs.append('libonnxruntime_providers_tensorrt.so')
+    dl_libs.append(providers_tensorrt_or_migraphx)
    # DNNL, TensorRT & OpenVINO EPs are built as shared libs
    libs.extend(['libonnxruntime_providers_shared.so'])
    libs.extend(['libonnxruntime_providers_dnnl.so'])
-    libs.extend(['libonnxruntime_providers_tensorrt.so'])
    libs.extend(['libonnxruntime_providers_openvino.so'])
    libs.append(providers_cuda_or_rocm)
+    libs.append(providers_tensorrt_or_migraphx)
    # Nuphar Libs
    libs.extend(['libtvm.so.0.5.1'])
    if nightly_build:
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@ -2038,6 +2038,9 @@ def main():
    if args.use_tensorrt:
        args.use_cuda = True

+    if args.use_migraphx:
+        args.use_rocm = True
+
    if args.build_wheel or args.gen_doc:
        args.enable_pybind = True