Ryanunderhill/backout 5014 (#5167)

* Revert 5014
2026-06-21 02:18:09 +00:00 · 2020-09-14 22:48:00 -07:00 · 2020-09-14 22:48:00 -07:00 · 8fa427b264
commit 8fa427b264
parent 089789c135
17 changed files with 142 additions and 541 deletions
--- a/cmake/onnxruntime_framework.cmake
+++ b/cmake/onnxruntime_framework.cmake
@ -24,7 +24,12 @@ add_library(onnxruntime_framework ${onnxruntime_framework_srcs})
 if(onnxruntime_ENABLE_INSTRUMENT)
  target_compile_definitions(onnxruntime_framework PRIVATE ONNXRUNTIME_ENABLE_INSTRUMENT)
 endif()
+if(onnxruntime_USE_TENSORRT)
+# TODO: for now, core framework depends on CUDA. It should be moved to TensorRT EP
+target_include_directories(onnxruntime_framework PRIVATE ${ONNXRUNTIME_ROOT} ${onnxruntime_CUDNN_HOME}/include PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+else()
 target_include_directories(onnxruntime_framework PRIVATE ${ONNXRUNTIME_ROOT} PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
+endif()
 onnxruntime_add_include_to_target(onnxruntime_framework onnxruntime_common onnx onnx_proto protobuf::libprotobuf flatbuffers)
 set_target_properties(onnxruntime_framework PROPERTIES FOLDER "ONNXRuntime")
 # need onnx to build to create headers that this project includes
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@ -311,7 +311,6 @@ if (onnxruntime_USE_TENSORRT OR onnxruntime_USE_DNNL)
  if(APPLE)
    set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/shared/exported_symbols.lst")
  elseif(UNIX)
-    set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations")
    set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds -Xlinker --gc-sections")
  elseif(WIN32)
    set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/shared/symbols.def")
@ -338,7 +337,6 @@ if (onnxruntime_USE_DNNL)
  add_dependencies(onnxruntime_providers_dnnl onnxruntime_providers_shared project_dnnl ${onnxruntime_EXTERNAL_DEPENDENCIES})
  target_include_directories(onnxruntime_providers_dnnl PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${DNNL_INCLUDE_DIR})
  # ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
-
  target_link_libraries(onnxruntime_providers_dnnl PRIVATE dnnl onnxruntime_providers_shared)
  install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/dnnl  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers)
  set_target_properties(onnxruntime_providers_dnnl PROPERTIES FOLDER "ONNXRuntime")
@ -376,6 +374,7 @@ if (onnxruntime_USE_TENSORRT)
    endif()
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -include algorithm")
    set(PROTOBUF_LIBRARY libprotobuf)
+    set(DISABLED_WARNINGS_FOR_TRT /wd4267 /wd4244 /wd4996 /wd4456)
  endif()
  if ( CMAKE_COMPILER_IS_GNUCC )
    set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -Wno-unused-parameter -Wno-missing-field-initializers")
@ -406,11 +405,6 @@ if (onnxruntime_USE_TENSORRT)
  file(GLOB_RECURSE onnxruntime_providers_tensorrt_cc_srcs CONFIGURE_DEPENDS
    "${ONNXRUNTIME_ROOT}/core/providers/tensorrt/*.h"
    "${ONNXRUNTIME_ROOT}/core/providers/tensorrt/*.cc"
-    "${ONNXRUNTIME_ROOT}/core/providers/cuda/cu_inc/common.cuh"
-    "${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops_impl.cuh"
-    "${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops.h"
-    "${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops_impl.cu"
-    "${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops_impl.h"
    "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h"
    "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc"
  )
@ -424,15 +418,17 @@ if (onnxruntime_USE_TENSORRT)
  else()
    target_link_directories(onnxruntime_providers_tensorrt PRIVATE ${onnxruntime_CUDA_HOME}/lib64)
  endif()
-  target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} cudart onnxruntime_providers_shared protobuf::libprotobuf flatbuffers)
+  target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} cudart onnxruntime_providers_shared protobuf::libprotobuf)
  target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${onnxruntime_CUDNN_HOME}/include ${eigen_INCLUDE_DIRS} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
  # ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
-
  install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/tensorrt  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers)
-  set_target_properties(onnxruntime_providers_tensorrt PROPERTIES LINKER_LANGUAGE CUDA)
+  set_target_properties(onnxruntime_providers_tensorrt PROPERTIES LINKER_LANGUAGE CXX)
  set_target_properties(onnxruntime_providers_tensorrt PROPERTIES FOLDER "ONNXRuntime")
  target_compile_definitions(onnxruntime_providers_tensorrt PRIVATE ONNXIFI_BUILD_LIBRARY=1)
  target_compile_options(onnxruntime_providers_tensorrt PRIVATE ${DISABLED_WARNINGS_FOR_TRT})
+  if (WIN32)
+    target_compile_options(onnxruntime_providers_tensorrt INTERFACE /wd4267 /wd4244 /wd4996 /wd4456)
+  endif()

  if(APPLE)
    set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/tensorrt/exported_symbols.lst")
--- a/onnxruntime/core/framework/data_transfer.h
+++ b/onnxruntime/core/framework/data_transfer.h
@ -26,9 +26,6 @@ class IDataTransfer {

  // batched copy. default implementation copies each entry sequentially, and returns on first failure.
  virtual common::Status CopyTensors(const std::vector<SrcDstPair>& src_dst_pairs) const;
-
-  // If this is really a Provider_IDataTransfer, this returns true. Used to convert back & forth with providers efficiently
-  virtual bool IsProviderInterface() const { return false; }
 };

 class CPUDataTransfer : public IDataTransfer {
--- a/onnxruntime/core/framework/provider_bridge_ort.cc
+++ b/onnxruntime/core/framework/provider_bridge_ort.cc
@ -15,6 +15,15 @@
 #include "core/session/inference_session.h"
 #include "core/session/abi_session_options_impl.h"
 #include "core/session/ort_apis.h"
+
+#ifdef USE_TENSORRT
+#include "core/providers/tensorrt/tensorrt_provider_factory.h"
+#include "core/providers/cuda/cuda_allocator.h"
+#include "core/providers/cuda/gpu_data_transfer.h"
+#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
+#include "core/providers/cuda/cuda_common.h"
+#endif
+
 #define PROVIDER_BRIDGE_ORT
 #include "core/providers/shared_library/provider_interfaces.h"
 #include "onnx/common/stl_backports.h"
@ -60,8 +69,6 @@ struct Provider_IAllocator_Impl : Provider_IAllocator {
  void* Alloc(size_t size) override { return p_->Alloc(size); }
  void Free(void* p) override { return p_->Free(p); }

-  FencePtr CreateFence(const Provider_SessionState* session_state) override { return p_->CreateFence(reinterpret_cast<const SessionState*>(session_state)); }
-
  bool IsProviderInterface() const override { return false; }

  std::unique_ptr<IAllocator> p_;
@ -74,22 +81,9 @@ struct ProviderAllocator : IAllocator {
  void* Alloc(size_t size) override { return p_->Alloc(size); }
  void Free(void* p) override { return p_->Free(p); }

-  FencePtr CreateFence(const SessionState* session_state) override { return p_->CreateFence(reinterpret_cast<const Provider_SessionState*>(session_state)); }
-
  std::shared_ptr<Provider_IAllocator> p_;
 };

-struct IDataTransfer_Wrapper : IDataTransfer {
-  IDataTransfer_Wrapper(std::unique_ptr<Provider_IDataTransfer> p) : p_{std::move(p)} {}
-
-  bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override { return p_->CanCopy(src_device, dst_device); }
-  common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const override { return p_->CopyTensor(*reinterpret_cast<const Provider_Tensor*>(&src), *reinterpret_cast<Provider_Tensor*>(&dst), exec_queue_id); }
-
-  bool IsProviderInterface() const override { return true; }
-
-  std::unique_ptr<Provider_IDataTransfer> p_;
-};
-
 struct Provider_TensorShapeProto_Dimension_Iterator_Impl : Provider_TensorShapeProto_Dimension_Iterator {
  Provider_TensorShapeProto_Dimension_Iterator_Impl(google::protobuf::internal::RepeatedPtrIterator<const onnx::TensorShapeProto_Dimension>&& v) : v_{std::move(v)} {}

@ -210,11 +204,7 @@ struct Provider_IExecutionProvider_Router_Impl : Provider_IExecutionProvider_Rou
  }

  std::unique_ptr<IDataTransfer> GetDataTransfer() const override {
-    auto internal = outer_->Provider_GetDataTransfer();
-    if (internal)
-      return onnxruntime::make_unique<IDataTransfer_Wrapper>(std::move(internal));
-    else
-      return nullptr;
+    return std::unique_ptr<IDataTransfer>(reinterpret_cast<IDataTransfer*>(outer_->Provider_GetDataTransfer().release()));
  }

  void Provider_InsertAllocator(Provider_AllocatorPtr allocator) override {
@ -237,8 +227,8 @@ struct ProviderHostImpl : ProviderHost {
    AllocatorCreationInfo info_real{
        [&info](int value) -> std::unique_ptr<IAllocator> {
          auto allocator = info.factory(value);
-          // If the allocator is a provider interface, we need to wrap it with ProviderAllocator to turn it into an IDeviceAllocator
-          // Otherwise it's really a Provider_IDeviceAllocator_Impl, so we can just unwrap it to get back to the IDeviceAllocator inside
+          // If the allocator is a provider interface, we need to wrap it with ProviderAllocator to turn it into an IAllocator
+          // Otherwise it's really a Provider_IAllocator_Impl, so we can just unwrap it to get back to the IAllocator inside
          if (allocator->IsProviderInterface())
            return onnxruntime::make_unique<ProviderAllocator>(std::move(allocator));
          else
@ -264,6 +254,31 @@ struct ProviderHostImpl : ProviderHost {
    return onnxruntime::make_unique<Provider_IExecutionProvider_Router_Impl>(outer, type);
  };

+#ifdef USE_TENSORRT
+  std::unique_ptr<Provider_IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name) override {
+    return onnxruntime::make_unique<Provider_IAllocator_Impl>(onnxruntime::make_unique<CUDAAllocator>(device_id, name));
+  }
+
+  std::unique_ptr<Provider_IAllocator> CreateCUDAPinnedAllocator(int16_t device_id, const char* name) override {
+    return onnxruntime::make_unique<Provider_IAllocator_Impl>(onnxruntime::make_unique<CUDAPinnedAllocator>(device_id, name));
+  }
+
+  std::unique_ptr<Provider_IDataTransfer> CreateGPUDataTransfer() override {
+    return std::unique_ptr<Provider_IDataTransfer>(reinterpret_cast<Provider_IDataTransfer*>(new GPUDataTransfer()));
+  }
+
+  void cuda__Impl_Cast(const int64_t* input_data, int32_t* output_data, size_t count) override {
+    return cuda::Impl_Cast(input_data, output_data, count);
+  }
+
+  void cuda__Impl_Cast(const int32_t* input_data, int64_t* output_data, size_t count) override {
+    return cuda::Impl_Cast(input_data, output_data, count);
+  }
+
+  bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return CudaCall<cudaError, false>(cudaError(retCode), exprString, libName, cudaError(successCode), msg); }
+  bool CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return CudaCall<cudaError, true>(cudaError(retCode), exprString, libName, cudaError(successCode), msg); }
+#endif
+
  std::string GetEnvironmentVar(const std::string& var_name) override {
    return Env::Default().GetEnvironmentVar(var_name);
  }
@ -379,12 +394,9 @@ struct ProviderHostImpl : ProviderHost {

  // Provider_DataTransferManager
  Status Provider_DataTransferManager__CopyTensor(const Provider_DataTransferManager* p, const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) override { return reinterpret_cast<const DataTransferManager*>(p)->CopyTensor(*reinterpret_cast<const Tensor*>(&src), *reinterpret_cast<Tensor*>(&dst), exec_queue_id); }
-  const Provider_IDataTransfer* Provider_DataTransferManager__GetProviderDataTransfer(const Provider_DataTransferManager* p, const OrtDevice& src_device, const OrtDevice& dst_device) override {
-    auto* data_transfer = reinterpret_cast<const DataTransferManager*>(p)->GetDataTransfer(src_device, dst_device);
-    if (data_transfer->IsProviderInterface())
-      return reinterpret_cast<const IDataTransfer_Wrapper*>(data_transfer)->p_.get();
-    return nullptr;
-  }
+
+  // Provider_IDataTransfer
+  void Provider_IDataTransfer__operator_delete(Provider_IDataTransfer* p) override { delete reinterpret_cast<Provider_IDataTransfer*>(p); }

  // Provider_IndexedSubGraph_MetaDef
  std::unique_ptr<Provider_IndexedSubGraph_MetaDef> Provider_IndexedSubGraph_MetaDef__construct() override { return std::unique_ptr<Provider_IndexedSubGraph_MetaDef>(reinterpret_cast<Provider_IndexedSubGraph_MetaDef*>(new IndexedSubGraph::MetaDef())); }
@ -585,9 +597,6 @@ struct ProviderHostImpl : ProviderHost {
  const Provider_DataTransferManager& Provider_OpKernelInfo__GetDataTransferManager(const Provider_OpKernelInfo* p) noexcept override { return *reinterpret_cast<const Provider_DataTransferManager*>(&reinterpret_cast<const OpKernelInfo*>(p)->GetDataTransferManager()); }
  int Provider_OpKernelInfo__GetKernelDef_ExecQueueId(const Provider_OpKernelInfo* p) noexcept override { return reinterpret_cast<const OpKernelInfo*>(p)->GetKernelDef().ExecQueueId(); }

-  // Provider_SessionState
-  const Provider_DataTransferManager& Provider_SessionState__GetDataTransferManager(const Provider_SessionState* p) override { return *reinterpret_cast<const Provider_DataTransferManager*>(&reinterpret_cast<const SessionState*>(p)->GetDataTransferMgr()); }
-
  // Provider_Tensor
  float* Provider_Tensor__MutableData_float(Provider_Tensor* p) override { return reinterpret_cast<Tensor*>(p)->MutableData<float>(); }
  const float* Provider_Tensor__Data_float(const Provider_Tensor* p) override { return reinterpret_cast<const Tensor*>(p)->Data<float>(); }
--- a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc
+++ b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc
@ -29,16 +29,16 @@ constexpr const char* DNNL_CPU = "DnnlCpu";
 DNNLExecutionProvider::DNNLExecutionProvider(const DNNLExecutionProviderInfo& info)
    : Provider_IExecutionProvider{onnxruntime::kDnnlExecutionProvider} {
  Provider_AllocatorCreationInfo default_memory_info(
-      [](int) {
+      {[](int) {
        return onnxruntime::Provider_CreateCPUAllocator(OrtMemoryInfo(DNNL, OrtAllocatorType::OrtDeviceAllocator));
-      },
+      }},
      0, info.create_arena);

  Provider_AllocatorCreationInfo cpu_memory_info(
-      [](int) {
+      {[](int) {
        return onnxruntime::Provider_CreateCPUAllocator(OrtMemoryInfo(DNNL_CPU, OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), 0,
                                                                      OrtMemTypeCPUOutput));
-      },
+      }},
      0, info.create_arena);

  Provider_InsertAllocator(CreateAllocator(default_memory_info));
--- a/onnxruntime/core/providers/shared_library/provider_api.h
+++ b/onnxruntime/core/providers/shared_library/provider_api.h
@ -88,6 +88,13 @@ constexpr const char* kMSDomain = "com.microsoft";
 constexpr const char* kDnnlExecutionProvider = "DnnlExecutionProvider";
 constexpr const char* kTensorrtExecutionProvider = "TensorrtExecutionProvider";

+enum CUDAStreamType : int {
+  kCudaStreamDefault = 0,
+  kCudaStreamCopyIn,
+  kCudaStreamCopyOut,
+  kTotalCudaStreams,
+};
+
 class DataTypeImpl {
 public:
  virtual ~DataTypeImpl() = default;
@ -104,8 +111,12 @@ template <typename T>
 using IAllocatorUniquePtr = std::unique_ptr<T, std::function<void(T*)>>;

 std::unique_ptr<Provider_IAllocator> Provider_CreateCPUAllocator(const OrtMemoryInfo& memory_info);
+std::unique_ptr<Provider_IAllocator> Provider_CreateCUDAAllocator(int16_t device_id, const char* name);
+std::unique_ptr<Provider_IAllocator> Provider_CreateCUDAPinnedAllocator(int16_t device_id, const char* name);
 Provider_AllocatorPtr CreateAllocator(const Provider_AllocatorCreationInfo& info);

+std::unique_ptr<Provider_IDataTransfer> Provider_CreateGPUDataTransfer();
+
 std::string GetEnvironmentVar(const std::string& var_name);

 class CPUIDInfo {
--- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
+++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
@ -146,6 +146,20 @@ std::unique_ptr<Provider_IAllocator> Provider_CreateCPUAllocator(const OrtMemory
  return g_host->CreateCPUAllocator(info);
 }

+#ifdef USE_TENSORRT
+std::unique_ptr<Provider_IAllocator> Provider_CreateCUDAAllocator(int16_t device_id, const char* name) {
+  return g_host->CreateCUDAAllocator(device_id, name);
+}
+
+std::unique_ptr<Provider_IAllocator> Provider_CreateCUDAPinnedAllocator(int16_t device_id, const char* name) {
+  return g_host->CreateCUDAPinnedAllocator(device_id, name);
+}
+
+std::unique_ptr<Provider_IDataTransfer> Provider_CreateGPUDataTransfer() {
+  return g_host->CreateGPUDataTransfer();
+}
+#endif
+
 std::string GetEnvironmentVar(const std::string& var_name) {
  return g_host->GetEnvironmentVar(var_name);
 }
--- a/onnxruntime/core/providers/shared_library/provider_interfaces.h
+++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h
@ -5,7 +5,6 @@
 // In the future the internal implementations could derive from these to remove the need for the wrapper implementations

 #include "core/framework/func_api.h"
-#include "core/framework/fence.h"  // For FencePtr

 #define PROVIDER_DISALLOW_ALL(TypeName)     \
  TypeName() = delete;                      \
@ -63,7 +62,6 @@ struct Provider_NodeAttributes;
 struct Provider_OpKernel_Base;
 struct Provider_OpKernelContext;
 struct Provider_OpKernelInfo;
-struct Provider_SessionState;
 struct Provider_Tensor;
 class TensorShape;

@ -119,8 +117,6 @@ struct Provider_IAllocator {
  virtual void Free(void* p) = 0;
  const OrtMemoryInfo& Info() const { return memory_info_; };

-  virtual FencePtr CreateFence(const Provider_SessionState* /*session_state*/) { return nullptr; }
-
  virtual bool IsProviderInterface() const { return true; }

  template <typename T>
@ -259,17 +255,6 @@ struct Provider_IExecutionProvider {
  void operator=(const Provider_IExecutionProvider&) = delete;
 };

-struct Provider_IDataTransfer {
-  Provider_IDataTransfer() = default;
-  virtual ~Provider_IDataTransfer() {}
-
-  virtual bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const = 0;
-  virtual common::Status CopyTensor(const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) const = 0;
-
-  Provider_IDataTransfer(const Provider_IDataTransfer&) = delete;
-  void operator=(const Provider_IDataTransfer&) = delete;
-};
-
 struct Provider {
  virtual std::shared_ptr<Provider_IExecutionProviderFactory> CreateExecutionProviderFactory(int device_id) = 0;
 };
@ -286,6 +271,18 @@ struct ProviderHost {

  virtual std::unique_ptr<Provider_IAllocator> CreateCPUAllocator(const OrtMemoryInfo& memory_info) = 0;

+#ifdef USE_TENSORRT
+  virtual std::unique_ptr<Provider_IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name) = 0;
+  virtual std::unique_ptr<Provider_IAllocator> CreateCUDAPinnedAllocator(int16_t device_id, const char* name) = 0;
+  virtual std::unique_ptr<Provider_IDataTransfer> CreateGPUDataTransfer() = 0;
+
+  virtual void cuda__Impl_Cast(const int64_t* input_data, int32_t* output_data, size_t count) = 0;
+  virtual void cuda__Impl_Cast(const int32_t* input_data, int64_t* output_data, size_t count) = 0;
+
+  virtual bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) = 0;
+  virtual bool CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) = 0;
+#endif
+
  virtual std::unique_ptr<Provider_IExecutionProvider_Router> Create_IExecutionProvider_Router(Provider_IExecutionProvider* outer, const std::string& type) = 0;

  virtual std::string GetEnvironmentVar(const std::string& var_name) = 0;
@ -384,7 +381,9 @@ struct ProviderHost {

  // Provider_DataTransferManager
  virtual Status Provider_DataTransferManager__CopyTensor(const Provider_DataTransferManager* p, const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) = 0;
-  virtual const Provider_IDataTransfer* Provider_DataTransferManager__GetProviderDataTransfer(const Provider_DataTransferManager* p, const OrtDevice& src_device, const OrtDevice& dst_device) = 0;
+
+  // Provider_IDataTransfer
+  virtual void Provider_IDataTransfer__operator_delete(Provider_IDataTransfer* p) = 0;

  // Provider_IndexedSubGraph_MetaDef
  virtual std::unique_ptr<Provider_IndexedSubGraph_MetaDef> Provider_IndexedSubGraph_MetaDef__construct() = 0;
@ -541,9 +540,6 @@ struct ProviderHost {
  virtual const Provider_DataTransferManager& Provider_OpKernelInfo__GetDataTransferManager(const Provider_OpKernelInfo* p) noexcept = 0;
  virtual int Provider_OpKernelInfo__GetKernelDef_ExecQueueId(const Provider_OpKernelInfo* p) noexcept = 0;

-  // Provider_SessionState
-  virtual const Provider_DataTransferManager& Provider_SessionState__GetDataTransferManager(const Provider_SessionState* p) = 0;
-
  // Provider_Tensor
  virtual float* Provider_Tensor__MutableData_float(Provider_Tensor* p) = 0;
  virtual const float* Provider_Tensor__Data_float(const Provider_Tensor* p) = 0;
@ -686,11 +682,18 @@ struct Provider_ComputeCapability {

 struct Provider_DataTransferManager {
  Status CopyTensor(const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) const { return g_host->Provider_DataTransferManager__CopyTensor(this, src, dst, exec_queue_id); }
-  const Provider_IDataTransfer* GetProviderDataTransfer(const OrtDevice& src_device, const OrtDevice& dst_device) const { return g_host->Provider_DataTransferManager__GetProviderDataTransfer(this, src_device, dst_device); }

  PROVIDER_DISALLOW_ALL(Provider_DataTransferManager)
 };

+struct Provider_IDataTransfer {
+  static void operator delete(void* p) { g_host->Provider_IDataTransfer__operator_delete(reinterpret_cast<Provider_IDataTransfer*>(p)); }
+
+  Provider_IDataTransfer() = delete;
+  Provider_IDataTransfer(const Provider_IDataTransfer&) = delete;
+  void operator=(const Provider_IDataTransfer&) = delete;
+};
+
 struct Provider_IndexedSubGraph_MetaDef {
  static std::unique_ptr<Provider_IndexedSubGraph_MetaDef> Create() { return g_host->Provider_IndexedSubGraph_MetaDef__construct(); }
  static void operator delete(void* p) { g_host->Provider_IndexedSubGraph_MetaDef__operator_delete(reinterpret_cast<Provider_IndexedSubGraph_MetaDef*>(p)); }
@ -1015,10 +1018,6 @@ inline Status Provider_OpKernelInfo::GetAttr<float>(const std::string& name, flo
  return GetAttr(name, value);
 }

-struct Provider_SessionState {
-  const Provider_DataTransferManager& GetDataTransferManager() const { return g_host->Provider_SessionState__GetDataTransferManager(this); }
-};
-
 struct Provider_Tensor {
  float* MutableData_float() { return g_host->Provider_Tensor__MutableData_float(this); }
  const float* Data_float() const { return g_host->Provider_Tensor__Data_float(this); }
--- a/onnxruntime/core/providers/tensorrt/cuda.cc
+++ b/onnxruntime/core/providers/tensorrt/cuda.cc
@ -1,146 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "core/providers/shared_library/provider_api.h"
-#include "core/providers/cuda/shared_inc/cuda_call.h"
-
-#ifdef _WIN32
-#else  // POSIX
-#include <unistd.h>
-#include <string.h>
-#endif
-
-namespace onnxruntime {
-
-using namespace common;
-
-template <typename ERRTYPE>
-const char* CudaErrString(ERRTYPE x) {
-  ORT_NOT_IMPLEMENTED();
-}
-
-#define CASE_ENUM_TO_STR(x) \
-  case x:                   \
-    return #x
-
-template <>
-const char* CudaErrString<cudaError_t>(cudaError_t x) {
-  cudaDeviceSynchronize();
-  return cudaGetErrorString(x);
-}
-
-template <>
-const char* CudaErrString<cublasStatus_t>(cublasStatus_t e) {
-  cudaDeviceSynchronize();
-
-  switch (e) {
-    CASE_ENUM_TO_STR(CUBLAS_STATUS_SUCCESS);
-    CASE_ENUM_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED);
-    CASE_ENUM_TO_STR(CUBLAS_STATUS_ALLOC_FAILED);
-    CASE_ENUM_TO_STR(CUBLAS_STATUS_INVALID_VALUE);
-    CASE_ENUM_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH);
-    CASE_ENUM_TO_STR(CUBLAS_STATUS_MAPPING_ERROR);
-    CASE_ENUM_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED);
-    CASE_ENUM_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR);
-    CASE_ENUM_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED);
-    CASE_ENUM_TO_STR(CUBLAS_STATUS_LICENSE_ERROR);
-    default:
-      return "(look for CUBLAS_STATUS_xxx in cublas_api.h)";
-  }
-}
-
-template <>
-const char* CudaErrString<curandStatus>(curandStatus) {
-  cudaDeviceSynchronize();
-  return "(see curand.h & look for curandStatus or CURAND_STATUS_xxx)";
-}
-
-template <>
-const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t e) {
-  cudaDeviceSynchronize();
-  return cudnnGetErrorString(e);
-}
-
-template <>
-const char* CudaErrString<cufftResult>(cufftResult e) {
-  cudaDeviceSynchronize();
-  switch (e) {
-    CASE_ENUM_TO_STR(CUFFT_SUCCESS);
-    CASE_ENUM_TO_STR(CUFFT_ALLOC_FAILED);
-    CASE_ENUM_TO_STR(CUFFT_INVALID_VALUE);
-    CASE_ENUM_TO_STR(CUFFT_INTERNAL_ERROR);
-    CASE_ENUM_TO_STR(CUFFT_SETUP_FAILED);
-    CASE_ENUM_TO_STR(CUFFT_INVALID_SIZE);
-    default:
-      return "Unknown cufft error status";
-  }
-}
-
-#ifdef USE_NCCL
-template <>
-const char* CudaErrString<ncclResult_t>(ncclResult_t e) {
-  cudaDeviceSynchronize();
-  return ncclGetErrorString(e);
-}
-#endif
-
-template <typename ERRTYPE, bool THRW>
-bool CudaCall(ERRTYPE retCode, const char* exprString, const char* libName, ERRTYPE successCode, const char* msg) {
-  if (retCode != successCode) {
-    try {
-#ifdef _WIN32
-      auto del = [](char* p) { free(p); };
-      std::unique_ptr<char, decltype(del)> hostname_ptr(nullptr, del);
-      size_t hostname_len = 0;
-      char* hostname = nullptr;
-      if (-1 == _dupenv_s(&hostname, &hostname_len, "COMPUTERNAME"))
-        hostname = "?";
-      else
-        hostname_ptr.reset(hostname);
-#else
-      char hostname[HOST_NAME_MAX];
-      if (gethostname(hostname, HOST_NAME_MAX) != 0)
-        strcpy(hostname, "?");
-#endif
-      int currentCudaDevice;
-      cudaGetDevice(&currentCudaDevice);
-      cudaGetLastError();  // clear last CUDA error
-      static char str[1024];
-      snprintf(str, 1024, "%s failure %d: %s ; GPU=%d ; hostname=%s ; expr=%s; %s",
-               libName, (int)retCode, CudaErrString(retCode), currentCudaDevice,
-               hostname,
-               exprString, msg);
-      if (THRW) {
-        // throw an exception with the error info
-        ORT_THROW(str);
-      } else {
-        LOGS_DEFAULT(ERROR) << str;
-      }
-    } catch (const std::exception& e) {  // catch, log, and rethrow since CUDA code sometimes hangs in destruction, so we'd never get to see the error
-      if (THRW) {
-        ORT_THROW(e.what());
-      } else {
-        LOGS_DEFAULT(ERROR) << e.what();
-      }
-    }
-    return false;
-  }
-  return true;
-}
-
-template bool CudaCall<cudaError, false>(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg);
-template bool CudaCall<cudaError, true>(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg);
-template bool CudaCall<cublasStatus_t, false>(cublasStatus_t retCode, const char* exprString, const char* libName, cublasStatus_t successCode, const char* msg);
-template bool CudaCall<cublasStatus_t, true>(cublasStatus_t retCode, const char* exprString, const char* libName, cublasStatus_t successCode, const char* msg);
-template bool CudaCall<cudnnStatus_t, false>(cudnnStatus_t retCode, const char* exprString, const char* libName, cudnnStatus_t successCode, const char* msg);
-template bool CudaCall<cudnnStatus_t, true>(cudnnStatus_t retCode, const char* exprString, const char* libName, cudnnStatus_t successCode, const char* msg);
-template bool CudaCall<curandStatus_t, false>(curandStatus_t retCode, const char* exprString, const char* libName, curandStatus_t successCode, const char* msg);
-template bool CudaCall<curandStatus_t, true>(curandStatus_t retCode, const char* exprString, const char* libName, curandStatus_t successCode, const char* msg);
-template bool CudaCall<cufftResult, false>(cufftResult retCode, const char* exprString, const char* libName, cufftResult successCode, const char* msg);
-template bool CudaCall<cufftResult, true>(cufftResult retCode, const char* exprString, const char* libName, cufftResult successCode, const char* msg);
-
-#ifdef USE_NCCL
-template bool CudaCall<ncclResult_t, false>(ncclResult_t retCode, const char* exprString, const char* libName, ncclResult_t successCode, const char* msg);
-#endif
-
-}  // namespace onnxruntime
--- a/onnxruntime/core/providers/tensorrt/cuda_allocator.cc
+++ b/onnxruntime/core/providers/tensorrt/cuda_allocator.cc
@ -1,70 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "core/providers/shared_library/provider_api.h"
-#include "core/providers/cuda/shared_inc/cuda_call.h"
-
-#include "cuda_allocator.h"
-#include "cuda_fence.h"
-#include "gpu_data_transfer.h"
-
-namespace onnxruntime {
-
-static const GPUDataTransfer* GetGPUDataTransfer(const Provider_SessionState* session_state) {
-  OrtDevice gpu_device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0);
-  OrtDevice cpu_device;
-  return static_cast<const GPUDataTransfer*>(session_state->GetDataTransferManager().GetProviderDataTransfer(gpu_device, cpu_device));
-}
-
-void CUDAAllocator::CheckDevice(bool throw_when_fail) const {
-#ifndef NDEBUG
-  // check device to match at debug build
-  // if it's expected to change, call cudaSetDevice instead of the check
-  int current_device;
-  auto cuda_err = cudaGetDevice(&current_device);
-  if (cuda_err == cudaSuccess) {
-    ORT_ENFORCE(current_device == Info().id);
-  } else if (throw_when_fail) {
-    CUDA_CALL_THROW(cuda_err);
-  }
-#else
-  ORT_UNUSED_PARAMETER(throw_when_fail);
-#endif
-}
-
-void* CUDAAllocator::Alloc(size_t size) {
-  CheckDevice(true);
-  void* p = nullptr;
-  if (size > 0) {
-    //BFCArena was updated recently to handle the exception and adjust the request size
-    CUDA_CALL_THROW(cudaMalloc((void**)&p, size));
-  }
-  return p;
-}
-
-void CUDAAllocator::Free(void* p) {
-  CheckDevice(false);  // ignore CUDA failure when free
-  cudaFree(p);         // do not throw error since it's OK for cudaFree to fail during shutdown
-}
-
-FencePtr CUDAAllocator::CreateFence(const Provider_SessionState* session_state) {
-  return std::make_shared<CUDAFence>(GetGPUDataTransfer(session_state));
-}
-
-void* CUDAPinnedAllocator::Alloc(size_t size) {
-  void* p = nullptr;
-  if (size > 0) {
-    CUDA_CALL_THROW(cudaMallocHost((void**)&p, size));
-  }
-  return p;
-}
-
-void CUDAPinnedAllocator::Free(void* p) {
-  CUDA_CALL_THROW(cudaFreeHost(p));
-}
-
-FencePtr CUDAPinnedAllocator::CreateFence(const Provider_SessionState* session_state) {
-  return std::make_shared<CUDAFence>(GetGPUDataTransfer(session_state));
-}
-
-}  // namespace onnxruntime
--- a/onnxruntime/core/providers/tensorrt/cuda_allocator.h
+++ b/onnxruntime/core/providers/tensorrt/cuda_allocator.h
@ -1,36 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#pragma once
-
-namespace onnxruntime {
-
-class CUDAAllocator : public Provider_IAllocator {
- public:
-  CUDAAllocator(OrtDevice::DeviceId device_id, const char* name)
-      : Provider_IAllocator(
-            OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator,
-                          OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, device_id),
-                          device_id, OrtMemTypeDefault)) {}
-  void* Alloc(size_t size) override;
-  void Free(void* p) override;
-  FencePtr CreateFence(const Provider_SessionState* session_state) override;
-
- private:
-  void CheckDevice(bool throw_when_fail) const;
-};
-
-//TODO: add a default constructor
-class CUDAPinnedAllocator : public Provider_IAllocator {
- public:
-  CUDAPinnedAllocator(OrtDevice::DeviceId device_id, const char* name)
-      : Provider_IAllocator(
-            OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator,
-                          OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, device_id),
-                          device_id, OrtMemTypeCPUOutput)) {}
-
-  void* Alloc(size_t size) override;
-  void Free(void* p) override;
-  FencePtr CreateFence(const Provider_SessionState* session_state) override;
-};
-}  // namespace onnxruntime
--- a/onnxruntime/core/providers/tensorrt/cuda_fence.cc
+++ b/onnxruntime/core/providers/tensorrt/cuda_fence.cc
@ -1,68 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "core/providers/shared_library/provider_api.h"
-#include "core/providers/cuda/shared_inc/cuda_call.h"
-
-#include "cuda_allocator.h"
-#include "cuda_fence.h"
-#include "gpu_data_transfer.h"
-
-namespace onnxruntime {
-
-constexpr const char* kCudaExecutionProvider = "CUDAExecutionProvider";
-
-CUDAFence::CUDAFence(const GPUDataTransfer* data_transfer) : data_transfer_(data_transfer) {
-  // NOTE: cudaEventBlockingSync may leads to longer wait time because of thread yield/switching in kernel
-  // if lower CPU usage is more important than latency, we should use this flag to avoid spin-loop in WaitOnCPU
-  int event_flags = /*cudaEventBlockingSync |*/ cudaEventDisableTiming;
-  CUDA_CALL_THROW(cudaEventCreate(&read_event_, event_flags));
-  CUDA_CALL_THROW(cudaEventCreate(&write_event_, event_flags));
-}
-
-CUDAFence::~CUDAFence() {
-  CUDA_CALL_THROW(cudaEventDestroy(read_event_));
-  CUDA_CALL_THROW(cudaEventDestroy(write_event_));
-}
-
-void CUDAFence::BeforeUsingAsInput(onnxruntime::ProviderType provider_type, int async_queue_id) {
-  if (provider_type == onnxruntime::kCudaExecutionProvider) {
-    // sync in GPU, the call is non-blocking on CPU
-    CUDA_CALL_THROW(cudaStreamWaitEvent(data_transfer_->GetStream(async_queue_id), write_event_, 0));
-  } else {
-    // sync on CPU for all other providers, this is blocking
-    CUDA_CALL_THROW(cudaEventSynchronize(write_event_));
-  }
-}
-
-void CUDAFence::BeforeUsingAsOutput(onnxruntime::ProviderType provider_type, int queue_id) {
-  if (provider_type == onnxruntime::kCudaExecutionProvider) {
-    // sync in GPU, the call is non-blocking on CPU
-    cudaStream_t stream = data_transfer_->GetStream(queue_id);
-    CUDA_CALL_THROW(cudaStreamWaitEvent(stream, read_event_, 0));
-    CUDA_CALL_THROW(cudaStreamWaitEvent(stream, write_event_, 0));
-  } else {
-    // sync on CPU for all other providers, this is blocking
-    CUDA_CALL_THROW(cudaEventSynchronize(read_event_));
-    CUDA_CALL_THROW(cudaEventSynchronize(write_event_));
-  }
-}
-
-bool CUDAFence::CanRelease() {
-  return cudaEventQuery(read_event_) == cudaSuccess &&
-         cudaEventQuery(write_event_) == cudaSuccess;
-}
-
-void CUDAFence::AfterUsedAsInput(int queue_id) {
-  // update read fence
-  cudaStream_t stream = data_transfer_->GetStream(queue_id);
-  CUDA_CALL_THROW(cudaEventRecord(read_event_, stream));
-}
-
-void CUDAFence::AfterUsedAsOutput(int queue_id) {
-  // update write fence
-  cudaStream_t stream = data_transfer_->GetStream(queue_id);
-  CUDA_CALL_THROW(cudaEventRecord(write_event_, stream));
-}
-
-}  // namespace onnxruntime
--- a/onnxruntime/core/providers/tensorrt/cuda_fence.h
+++ b/onnxruntime/core/providers/tensorrt/cuda_fence.h
@ -1,25 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#pragma once
-
-namespace onnxruntime {
-class GPUDataTransfer;
-
-class CUDAFence : public IFence {
- public:
-  CUDAFence(const GPUDataTransfer* data_transfer);
-  virtual ~CUDAFence();
-  virtual void BeforeUsingAsInput(onnxruntime::ProviderType provider_type, int queue_id) override;
-  virtual void BeforeUsingAsOutput(onnxruntime::ProviderType provider_type, int queue_id) override;
-  virtual void AfterUsedAsInput(int queue_id) override;
-  virtual void AfterUsedAsOutput(int queue_id) override;
-  virtual bool CanRelease() override;
-
- private:
-  cudaEvent_t read_event_;
-  cudaEvent_t write_event_;
-  const GPUDataTransfer* data_transfer_;
-};
-
-}  // namespace onnxruntime
--- a/onnxruntime/core/providers/tensorrt/gpu_data_transfer.cc
+++ b/onnxruntime/core/providers/tensorrt/gpu_data_transfer.cc
@ -1,68 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "core/providers/shared_library/provider_api.h"
-#include "core/providers/cuda/shared_inc/cuda_call.h"
-
-#include "gpu_data_transfer.h"
-
-#define CUDA_RETURN_IF_ERROR(expr)               \
-  ORT_RETURN_IF_ERROR(CUDA_CALL(expr)            \
-                          ? common::Status::OK() \
-                          : ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "CUDA error executing ", #expr))
-
-namespace onnxruntime {
-GPUDataTransfer::GPUDataTransfer() {
-  // create streams, default is nullptr
-  streams_[kCudaStreamDefault] = nullptr;
-  CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyIn], cudaStreamNonBlocking));
-  CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyOut], cudaStreamNonBlocking));
-}
-
-GPUDataTransfer::~GPUDataTransfer() {
-  CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyIn]));
-  CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyOut]));
-}
-
-bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
-  return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::CUDA_PINNED || dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED;
-}
-
-common::Status GPUDataTransfer::CopyTensor(const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) const {
-  size_t bytes = src.SizeInBytes();
-  const void* src_data = src.DataRaw();
-  void* dst_data = dst.MutableDataRaw();
-
-  auto& src_device = src.Location().device;
-  auto& dst_device = dst.Location().device;
-
-  if (dst_device.Type() == OrtDevice::GPU) {
-    if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
-      // copy from pinned memory to GPU, this is non-blocking
-      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, streams_[exec_queue_id]));
-    } else if (src_device.Type() == OrtDevice::GPU) {
-      // copying between GPU, this is non-blocking
-      // Copy only if the two addresses are different.
-      if (dst_data != src_data) {
-        CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice, streams_[kCudaStreamDefault]));
-      }
-    } else {
-      // copy from other CPU memory to GPU, this is blocking
-      CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyHostToDevice));
-    }
-  } else if (src_device.Type() == OrtDevice::GPU) {
-    if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
-      // copying from GPU to pinned memory, this is non-blocking
-      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, streams_[exec_queue_id]));
-    } else {
-      // copying from GPU to CPU memory, this is blocking
-      CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToHost));
-    }
-  } else {
-    // copying between cpu memory
-    memcpy(dst_data, src_data, bytes);
-  }
-
-  return Status::OK();
-}
-}  // namespace onnxruntime
--- a/onnxruntime/core/providers/tensorrt/gpu_data_transfer.h
+++ b/onnxruntime/core/providers/tensorrt/gpu_data_transfer.h
@ -1,35 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#pragma once
-
-namespace onnxruntime {
-
-enum CUDAStreamType : int {
-  kCudaStreamDefault = 0,
-  kCudaStreamCopyIn,
-  kCudaStreamCopyOut,
-  kTotalCudaStreams,
-};
-
-class GPUDataTransfer : public Provider_IDataTransfer {
- public:
-  GPUDataTransfer();
-  ~GPUDataTransfer();
-
-  bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;
-
-  // Suppress MSVC warning about not fully overriding
-  using Provider_IDataTransfer::CopyTensor;
-  common::Status CopyTensor(const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) const override;
-
-  cudaStream_t GetStream(int queue_id) const {
-    ORT_ENFORCE(queue_id >= 0 && queue_id < kTotalCudaStreams);
-    return streams_[queue_id];
-  }
-
- private:
-  cudaStream_t streams_[kTotalCudaStreams];
-};
-
-}  // namespace onnxruntime
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@ -2,25 +2,25 @@
 // Licensed under the MIT License.

 #include <fstream>
-#include <limits>
 #include <list>
-#include <map>
-#include <memory>
 #include <unordered_set>
-#include <unordered_map>
-#include <utility>
 #include "core/providers/shared_library/provider_api.h"
 #define ORT_API_MANUAL_INIT
 #include "core/session/onnxruntime_cxx_api.h"
 #include "core/common/safeint.h"

 #include "tensorrt_execution_provider.h"
-#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
+#include "core/providers/cuda/cuda_allocator.h"
 #include "core/providers/cuda/shared_inc/cuda_call.h"
-#include "gpu_data_transfer.h"
+#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
+#include "cuda_runtime_api.h"
 #include "gsl/gsl"
 #include <experimental/filesystem>
-#include "cuda_allocator.h"
+#include <unordered_map>
+#include <utility>
+#include <limits>
+#include <map>
+#include <memory>

 #define CUDA_RETURN_IF_ERROR(expr)               \
  ORT_RETURN_IF_ERROR(CUDA_CALL(expr)            \
@ -69,6 +69,33 @@ struct ShutdownProtobuf {

 namespace onnxruntime {

+namespace cuda {
+template <>
+void Impl_Cast(
+    const int64_t* input_data, int32_t* output_data,
+    size_t count) {
+  return g_host->cuda__Impl_Cast(input_data, output_data, count);
+}
+
+template <>
+void Impl_Cast(
+    const int32_t* input_data, int64_t* output_data,
+    size_t count) {
+  return g_host->cuda__Impl_Cast(input_data, output_data, count);
+}
+
+}  // namespace cuda
+
+template <>
+bool CudaCall<cudaError, false>(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg) {
+  return g_host->CudaCall_false(retCode, exprString, libName, successCode, msg);
+}
+
+template <>
+bool CudaCall<cudaError, true>(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg) {
+  return g_host->CudaCall_true(retCode, exprString, libName, successCode, msg);
+}
+
 constexpr const char* TRT = "Tensorrt";
 constexpr const char* TRT_PINNED = "TensorrtPinned";

@ -131,7 +158,7 @@ KernelRegistryAndStatus GetTensorrtKernelRegistry() {
 }

 std::shared_ptr<Provider_KernelRegistry> TensorrtExecutionProvider::Provider_GetKernelRegistry() const {
-  static KernelRegistryAndStatus k = GetTensorrtKernelRegistry();
+  static KernelRegistryAndStatus k = onnxruntime::GetTensorrtKernelRegistry();
  // throw if the registry failed to initialize
  ORT_THROW_IF_ERROR(k.st);
  return k.kernel_registry;
@ -148,12 +175,12 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
  CUDA_CALL_THROW(cudaSetDevice(device_id_));

  Provider_AllocatorCreationInfo default_memory_info(
-      [](int id) { return onnxruntime::make_unique<CUDAAllocator>(id, TRT); }, device_id_);
+      [](int id) { return Provider_CreateCUDAAllocator(id, TRT); }, device_id_);
  allocator_ = CreateAllocator(default_memory_info);
  Provider_InsertAllocator(allocator_);

  Provider_AllocatorCreationInfo pinned_allocator_info(
-      [](int) { return onnxruntime::make_unique<CUDAPinnedAllocator>(0, TRT_PINNED); }, device_id_);
+      [](int) { return Provider_CreateCUDAPinnedAllocator(0, TRT_PINNED); }, device_id_);
  Provider_InsertAllocator(CreateAllocator(pinned_allocator_info));

  // Get environment variables
@ -209,7 +236,7 @@ Provider_AllocatorPtr TensorrtExecutionProvider::Provider_GetAllocator(int id, O
 }

 std::unique_ptr<onnxruntime::Provider_IDataTransfer> TensorrtExecutionProvider::Provider_GetDataTransfer() const {
-  return onnxruntime::make_unique<GPUDataTransfer>();
+  return onnxruntime::Provider_CreateGPUDataTransfer();
 }

 // Convert GraphViewer graph to GraphProto
@ -1345,5 +1372,4 @@ common::Status TensorrtExecutionProvider::Provider_Compile(const std::vector<onn

  return Status::OK();
 }
-
 }  // namespace onnxruntime
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@ -2,14 +2,6 @@
 // Licensed under the MIT License.

 #pragma once
-
-#ifdef _WIN32
-#pragma warning(disable : 4244)  // '=': conversion from '_Ty2' to '_Ty', possible loss of data
-#pragma warning(disable : 4267)  // 'initializing': conversion from '<type1>' to '<type2>', possible loss of data
-#pragma warning(disable : 4996)  // 'gmtime': This function or variable may be unsafe. Consider using gmtime_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
-#pragma warning(disable : 4456)  // declaration of 'i' hides previous local declaration
-#endif
-
 #include <ctime>
 #include "NvInfer.h"
 #include "NvOnnxParser.h"