From 8fa427b264e00fb9e8a2f2b03ca78e808f0d0609 Mon Sep 17 00:00:00 2001
From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com>
Date: Mon, 14 Sep 2020 22:48:00 -0700
Subject: [PATCH] Ryanunderhill/backout 5014 (#5167)

* Revert 5014
---
 cmake/onnxruntime_framework.cmake             |   5 +
 cmake/onnxruntime_providers.cmake             |  16 +-
 onnxruntime/core/framework/data_transfer.h    |   3 -
 .../core/framework/provider_bridge_ort.cc     |  71 +++++----
 .../providers/dnnl/dnnl_execution_provider.cc |   8 +-
 .../providers/shared_library/provider_api.h   |  11 ++
 .../provider_bridge_provider.cc               |  14 ++
 .../shared_library/provider_interfaces.h      |  47 +++---
 onnxruntime/core/providers/tensorrt/cuda.cc   | 146 ------------------
 .../core/providers/tensorrt/cuda_allocator.cc |  70 ---------
 .../core/providers/tensorrt/cuda_allocator.h  |  36 -----
 .../core/providers/tensorrt/cuda_fence.cc     |  68 --------
 .../core/providers/tensorrt/cuda_fence.h      |  25 ---
 .../providers/tensorrt/gpu_data_transfer.cc   |  68 --------
 .../providers/tensorrt/gpu_data_transfer.h    |  35 -----
 .../tensorrt/tensorrt_execution_provider.cc   |  52 +++++--
 .../tensorrt/tensorrt_execution_provider.h    |   8 -
 17 files changed, 142 insertions(+), 541 deletions(-)
 delete mode 100644 onnxruntime/core/providers/tensorrt/cuda.cc
 delete mode 100644 onnxruntime/core/providers/tensorrt/cuda_allocator.cc
 delete mode 100644 onnxruntime/core/providers/tensorrt/cuda_allocator.h
 delete mode 100644 onnxruntime/core/providers/tensorrt/cuda_fence.cc
 delete mode 100644 onnxruntime/core/providers/tensorrt/cuda_fence.h
 delete mode 100644 onnxruntime/core/providers/tensorrt/gpu_data_transfer.cc
 delete mode 100644 onnxruntime/core/providers/tensorrt/gpu_data_transfer.h

diff --git a/cmake/onnxruntime_framework.cmake b/cmake/onnxruntime_framework.cmake
index 0afd668d23..b97cc0ed48 100644
--- a/cmake/onnxruntime_framework.cmake
+++ b/cmake/onnxruntime_framework.cmake
@@ -24,7 +24,12 @@ add_library(onnxruntime_framework ${onnxruntime_framework_srcs})
 if(onnxruntime_ENABLE_INSTRUMENT)
   target_compile_definitions(onnxruntime_framework PRIVATE ONNXRUNTIME_ENABLE_INSTRUMENT)
 endif()
+if(onnxruntime_USE_TENSORRT)
+# TODO: for now, core framework depends on CUDA. It should be moved to TensorRT EP
+target_include_directories(onnxruntime_framework PRIVATE ${ONNXRUNTIME_ROOT} ${onnxruntime_CUDNN_HOME}/include PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+else()
 target_include_directories(onnxruntime_framework PRIVATE ${ONNXRUNTIME_ROOT} PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
+endif()
 onnxruntime_add_include_to_target(onnxruntime_framework onnxruntime_common onnx onnx_proto protobuf::libprotobuf flatbuffers)
 set_target_properties(onnxruntime_framework PROPERTIES FOLDER "ONNXRuntime")
 # need onnx to build to create headers that this project includes
diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake
index 7f661a1965..4c1a74d273 100644
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@@ -311,7 +311,6 @@ if (onnxruntime_USE_TENSORRT OR onnxruntime_USE_DNNL)
   if(APPLE)
     set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/shared/exported_symbols.lst")
   elseif(UNIX)
-    set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations")
     set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds -Xlinker --gc-sections")
   elseif(WIN32)
     set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/shared/symbols.def")
@@ -338,7 +337,6 @@ if (onnxruntime_USE_DNNL)
   add_dependencies(onnxruntime_providers_dnnl onnxruntime_providers_shared project_dnnl ${onnxruntime_EXTERNAL_DEPENDENCIES})
   target_include_directories(onnxruntime_providers_dnnl PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${DNNL_INCLUDE_DIR})
   # ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
-
   target_link_libraries(onnxruntime_providers_dnnl PRIVATE dnnl onnxruntime_providers_shared)
   install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/dnnl  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers)
   set_target_properties(onnxruntime_providers_dnnl PROPERTIES FOLDER "ONNXRuntime")
@@ -376,6 +374,7 @@ if (onnxruntime_USE_TENSORRT)
     endif()
     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -include algorithm")
     set(PROTOBUF_LIBRARY libprotobuf)
+    set(DISABLED_WARNINGS_FOR_TRT /wd4267 /wd4244 /wd4996 /wd4456)
   endif()
   if ( CMAKE_COMPILER_IS_GNUCC )
     set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -Wno-unused-parameter -Wno-missing-field-initializers")
@@ -406,11 +405,6 @@ if (onnxruntime_USE_TENSORRT)
   file(GLOB_RECURSE onnxruntime_providers_tensorrt_cc_srcs CONFIGURE_DEPENDS
     "${ONNXRUNTIME_ROOT}/core/providers/tensorrt/*.h"
     "${ONNXRUNTIME_ROOT}/core/providers/tensorrt/*.cc"
-    "${ONNXRUNTIME_ROOT}/core/providers/cuda/cu_inc/common.cuh"
-    "${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops_impl.cuh"
-    "${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops.h"
-    "${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops_impl.cu"
-    "${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops_impl.h"
     "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h"
     "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc"
   )
@@ -424,15 +418,17 @@ if (onnxruntime_USE_TENSORRT)
   else()
     target_link_directories(onnxruntime_providers_tensorrt PRIVATE ${onnxruntime_CUDA_HOME}/lib64)
   endif()
-  target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} cudart onnxruntime_providers_shared protobuf::libprotobuf flatbuffers)
+  target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} cudart onnxruntime_providers_shared protobuf::libprotobuf)
   target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${onnxruntime_CUDNN_HOME}/include ${eigen_INCLUDE_DIRS} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
   # ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
-
   install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/tensorrt  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers)
-  set_target_properties(onnxruntime_providers_tensorrt PROPERTIES LINKER_LANGUAGE CUDA)
+  set_target_properties(onnxruntime_providers_tensorrt PROPERTIES LINKER_LANGUAGE CXX)
   set_target_properties(onnxruntime_providers_tensorrt PROPERTIES FOLDER "ONNXRuntime")
   target_compile_definitions(onnxruntime_providers_tensorrt PRIVATE ONNXIFI_BUILD_LIBRARY=1)
   target_compile_options(onnxruntime_providers_tensorrt PRIVATE ${DISABLED_WARNINGS_FOR_TRT})
+  if (WIN32)
+    target_compile_options(onnxruntime_providers_tensorrt INTERFACE /wd4267 /wd4244 /wd4996 /wd4456)
+  endif()
 
   if(APPLE)
     set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/tensorrt/exported_symbols.lst")
diff --git a/onnxruntime/core/framework/data_transfer.h b/onnxruntime/core/framework/data_transfer.h
index d2e4cf5f2a..e2093f0abf 100644
--- a/onnxruntime/core/framework/data_transfer.h
+++ b/onnxruntime/core/framework/data_transfer.h
@@ -26,9 +26,6 @@ class IDataTransfer {
 
   // batched copy. default implementation copies each entry sequentially, and returns on first failure.
   virtual common::Status CopyTensors(const std::vector<SrcDstPair>& src_dst_pairs) const;
-
-  // If this is really a Provider_IDataTransfer, this returns true. Used to convert back & forth with providers efficiently
-  virtual bool IsProviderInterface() const { return false; }
 };
 
 class CPUDataTransfer : public IDataTransfer {
diff --git a/onnxruntime/core/framework/provider_bridge_ort.cc b/onnxruntime/core/framework/provider_bridge_ort.cc
index 114b32dbc9..425d24b658 100644
--- a/onnxruntime/core/framework/provider_bridge_ort.cc
+++ b/onnxruntime/core/framework/provider_bridge_ort.cc
@@ -15,6 +15,15 @@
 #include "core/session/inference_session.h"
 #include "core/session/abi_session_options_impl.h"
 #include "core/session/ort_apis.h"
+
+#ifdef USE_TENSORRT
+#include "core/providers/tensorrt/tensorrt_provider_factory.h"
+#include "core/providers/cuda/cuda_allocator.h"
+#include "core/providers/cuda/gpu_data_transfer.h"
+#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
+#include "core/providers/cuda/cuda_common.h"
+#endif
+
 #define PROVIDER_BRIDGE_ORT
 #include "core/providers/shared_library/provider_interfaces.h"
 #include "onnx/common/stl_backports.h"
@@ -60,8 +69,6 @@ struct Provider_IAllocator_Impl : Provider_IAllocator {
   void* Alloc(size_t size) override { return p_->Alloc(size); }
   void Free(void* p) override { return p_->Free(p); }
 
-  FencePtr CreateFence(const Provider_SessionState* session_state) override { return p_->CreateFence(reinterpret_cast<const SessionState*>(session_state)); }
-
   bool IsProviderInterface() const override { return false; }
 
   std::unique_ptr<IAllocator> p_;
@@ -74,22 +81,9 @@ struct ProviderAllocator : IAllocator {
   void* Alloc(size_t size) override { return p_->Alloc(size); }
   void Free(void* p) override { return p_->Free(p); }
 
-  FencePtr CreateFence(const SessionState* session_state) override { return p_->CreateFence(reinterpret_cast<const Provider_SessionState*>(session_state)); }
-
   std::shared_ptr<Provider_IAllocator> p_;
 };
 
-struct IDataTransfer_Wrapper : IDataTransfer {
-  IDataTransfer_Wrapper(std::unique_ptr<Provider_IDataTransfer> p) : p_{std::move(p)} {}
-
-  bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override { return p_->CanCopy(src_device, dst_device); }
-  common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const override { return p_->CopyTensor(*reinterpret_cast<const Provider_Tensor*>(&src), *reinterpret_cast<Provider_Tensor*>(&dst), exec_queue_id); }
-
-  bool IsProviderInterface() const override { return true; }
-
-  std::unique_ptr<Provider_IDataTransfer> p_;
-};
-
 struct Provider_TensorShapeProto_Dimension_Iterator_Impl : Provider_TensorShapeProto_Dimension_Iterator {
   Provider_TensorShapeProto_Dimension_Iterator_Impl(google::protobuf::internal::RepeatedPtrIterator<const onnx::TensorShapeProto_Dimension>&& v) : v_{std::move(v)} {}
 
@@ -210,11 +204,7 @@ struct Provider_IExecutionProvider_Router_Impl : Provider_IExecutionProvider_Rou
   }
 
   std::unique_ptr<IDataTransfer> GetDataTransfer() const override {
-    auto internal = outer_->Provider_GetDataTransfer();
-    if (internal)
-      return onnxruntime::make_unique<IDataTransfer_Wrapper>(std::move(internal));
-    else
-      return nullptr;
+    return std::unique_ptr<IDataTransfer>(reinterpret_cast<IDataTransfer*>(outer_->Provider_GetDataTransfer().release()));
   }
 
   void Provider_InsertAllocator(Provider_AllocatorPtr allocator) override {
@@ -237,8 +227,8 @@ struct ProviderHostImpl : ProviderHost {
     AllocatorCreationInfo info_real{
         [&info](int value) -> std::unique_ptr<IAllocator> {
           auto allocator = info.factory(value);
-          // If the allocator is a provider interface, we need to wrap it with ProviderAllocator to turn it into an IDeviceAllocator
-          // Otherwise it's really a Provider_IDeviceAllocator_Impl, so we can just unwrap it to get back to the IDeviceAllocator inside
+          // If the allocator is a provider interface, we need to wrap it with ProviderAllocator to turn it into an IAllocator
+          // Otherwise it's really a Provider_IAllocator_Impl, so we can just unwrap it to get back to the IAllocator inside
           if (allocator->IsProviderInterface())
             return onnxruntime::make_unique<ProviderAllocator>(std::move(allocator));
           else
@@ -264,6 +254,31 @@ struct ProviderHostImpl : ProviderHost {
     return onnxruntime::make_unique<Provider_IExecutionProvider_Router_Impl>(outer, type);
   };
 
+#ifdef USE_TENSORRT
+  std::unique_ptr<Provider_IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name) override {
+    return onnxruntime::make_unique<Provider_IAllocator_Impl>(onnxruntime::make_unique<CUDAAllocator>(device_id, name));
+  }
+
+  std::unique_ptr<Provider_IAllocator> CreateCUDAPinnedAllocator(int16_t device_id, const char* name) override {
+    return onnxruntime::make_unique<Provider_IAllocator_Impl>(onnxruntime::make_unique<CUDAPinnedAllocator>(device_id, name));
+  }
+
+  std::unique_ptr<Provider_IDataTransfer> CreateGPUDataTransfer() override {
+    return std::unique_ptr<Provider_IDataTransfer>(reinterpret_cast<Provider_IDataTransfer*>(new GPUDataTransfer()));
+  }
+
+  void cuda__Impl_Cast(const int64_t* input_data, int32_t* output_data, size_t count) override {
+    return cuda::Impl_Cast(input_data, output_data, count);
+  }
+
+  void cuda__Impl_Cast(const int32_t* input_data, int64_t* output_data, size_t count) override {
+    return cuda::Impl_Cast(input_data, output_data, count);
+  }
+
+  bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return CudaCall<cudaError, false>(cudaError(retCode), exprString, libName, cudaError(successCode), msg); }
+  bool CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return CudaCall<cudaError, true>(cudaError(retCode), exprString, libName, cudaError(successCode), msg); }
+#endif
+
   std::string GetEnvironmentVar(const std::string& var_name) override {
     return Env::Default().GetEnvironmentVar(var_name);
   }
@@ -379,12 +394,9 @@ struct ProviderHostImpl : ProviderHost {
 
   // Provider_DataTransferManager
   Status Provider_DataTransferManager__CopyTensor(const Provider_DataTransferManager* p, const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) override { return reinterpret_cast<const DataTransferManager*>(p)->CopyTensor(*reinterpret_cast<const Tensor*>(&src), *reinterpret_cast<Tensor*>(&dst), exec_queue_id); }
-  const Provider_IDataTransfer* Provider_DataTransferManager__GetProviderDataTransfer(const Provider_DataTransferManager* p, const OrtDevice& src_device, const OrtDevice& dst_device) override {
-    auto* data_transfer = reinterpret_cast<const DataTransferManager*>(p)->GetDataTransfer(src_device, dst_device);
-    if (data_transfer->IsProviderInterface())
-      return reinterpret_cast<const IDataTransfer_Wrapper*>(data_transfer)->p_.get();
-    return nullptr;
-  }
+
+  // Provider_IDataTransfer
+  void Provider_IDataTransfer__operator_delete(Provider_IDataTransfer* p) override { delete reinterpret_cast<Provider_IDataTransfer*>(p); }
 
   // Provider_IndexedSubGraph_MetaDef
   std::unique_ptr<Provider_IndexedSubGraph_MetaDef> Provider_IndexedSubGraph_MetaDef__construct() override { return std::unique_ptr<Provider_IndexedSubGraph_MetaDef>(reinterpret_cast<Provider_IndexedSubGraph_MetaDef*>(new IndexedSubGraph::MetaDef())); }
@@ -585,9 +597,6 @@ struct ProviderHostImpl : ProviderHost {
   const Provider_DataTransferManager& Provider_OpKernelInfo__GetDataTransferManager(const Provider_OpKernelInfo* p) noexcept override { return *reinterpret_cast<const Provider_DataTransferManager*>(&reinterpret_cast<const OpKernelInfo*>(p)->GetDataTransferManager()); }
   int Provider_OpKernelInfo__GetKernelDef_ExecQueueId(const Provider_OpKernelInfo* p) noexcept override { return reinterpret_cast<const OpKernelInfo*>(p)->GetKernelDef().ExecQueueId(); }
 
-  // Provider_SessionState
-  const Provider_DataTransferManager& Provider_SessionState__GetDataTransferManager(const Provider_SessionState* p) override { return *reinterpret_cast<const Provider_DataTransferManager*>(&reinterpret_cast<const SessionState*>(p)->GetDataTransferMgr()); }
-
   // Provider_Tensor
   float* Provider_Tensor__MutableData_float(Provider_Tensor* p) override { return reinterpret_cast<Tensor*>(p)->MutableData<float>(); }
   const float* Provider_Tensor__Data_float(const Provider_Tensor* p) override { return reinterpret_cast<const Tensor*>(p)->Data<float>(); }
diff --git a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc
index e2d487a940..b03c05ad65 100644
--- a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc
+++ b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc
@@ -29,16 +29,16 @@ constexpr const char* DNNL_CPU = "DnnlCpu";
 DNNLExecutionProvider::DNNLExecutionProvider(const DNNLExecutionProviderInfo& info)
     : Provider_IExecutionProvider{onnxruntime::kDnnlExecutionProvider} {
   Provider_AllocatorCreationInfo default_memory_info(
-      [](int) {
+      {[](int) {
         return onnxruntime::Provider_CreateCPUAllocator(OrtMemoryInfo(DNNL, OrtAllocatorType::OrtDeviceAllocator));
-      },
+      }},
       0, info.create_arena);
 
   Provider_AllocatorCreationInfo cpu_memory_info(
-      [](int) {
+      {[](int) {
         return onnxruntime::Provider_CreateCPUAllocator(OrtMemoryInfo(DNNL_CPU, OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), 0,
                                                                       OrtMemTypeCPUOutput));
-      },
+      }},
       0, info.create_arena);
 
   Provider_InsertAllocator(CreateAllocator(default_memory_info));
diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h
index bdde16976b..1f7d304386 100644
--- a/onnxruntime/core/providers/shared_library/provider_api.h
+++ b/onnxruntime/core/providers/shared_library/provider_api.h
@@ -88,6 +88,13 @@ constexpr const char* kMSDomain = "com.microsoft";
 constexpr const char* kDnnlExecutionProvider = "DnnlExecutionProvider";
 constexpr const char* kTensorrtExecutionProvider = "TensorrtExecutionProvider";
 
+enum CUDAStreamType : int {
+  kCudaStreamDefault = 0,
+  kCudaStreamCopyIn,
+  kCudaStreamCopyOut,
+  kTotalCudaStreams,
+};
+
 class DataTypeImpl {
  public:
   virtual ~DataTypeImpl() = default;
@@ -104,8 +111,12 @@ template <typename T>
 using IAllocatorUniquePtr = std::unique_ptr<T, std::function<void(T*)>>;
 
 std::unique_ptr<Provider_IAllocator> Provider_CreateCPUAllocator(const OrtMemoryInfo& memory_info);
+std::unique_ptr<Provider_IAllocator> Provider_CreateCUDAAllocator(int16_t device_id, const char* name);
+std::unique_ptr<Provider_IAllocator> Provider_CreateCUDAPinnedAllocator(int16_t device_id, const char* name);
 Provider_AllocatorPtr CreateAllocator(const Provider_AllocatorCreationInfo& info);
 
+std::unique_ptr<Provider_IDataTransfer> Provider_CreateGPUDataTransfer();
+
 std::string GetEnvironmentVar(const std::string& var_name);
 
 class CPUIDInfo {
diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
index 326ea49263..81405f2af0 100644
--- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
+++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
@@ -146,6 +146,20 @@ std::unique_ptr<Provider_IAllocator> Provider_CreateCPUAllocator(const OrtMemory
   return g_host->CreateCPUAllocator(info);
 }
 
+#ifdef USE_TENSORRT
+std::unique_ptr<Provider_IAllocator> Provider_CreateCUDAAllocator(int16_t device_id, const char* name) {
+  return g_host->CreateCUDAAllocator(device_id, name);
+}
+
+std::unique_ptr<Provider_IAllocator> Provider_CreateCUDAPinnedAllocator(int16_t device_id, const char* name) {
+  return g_host->CreateCUDAPinnedAllocator(device_id, name);
+}
+
+std::unique_ptr<Provider_IDataTransfer> Provider_CreateGPUDataTransfer() {
+  return g_host->CreateGPUDataTransfer();
+}
+#endif
+
 std::string GetEnvironmentVar(const std::string& var_name) {
   return g_host->GetEnvironmentVar(var_name);
 }
diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h
index 02a9fa82ef..62009f2cb9 100644
--- a/onnxruntime/core/providers/shared_library/provider_interfaces.h
+++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h
@@ -5,7 +5,6 @@
 // In the future the internal implementations could derive from these to remove the need for the wrapper implementations
 
 #include "core/framework/func_api.h"
-#include "core/framework/fence.h"  // For FencePtr
 
 #define PROVIDER_DISALLOW_ALL(TypeName)     \
   TypeName() = delete;                      \
@@ -63,7 +62,6 @@ struct Provider_NodeAttributes;
 struct Provider_OpKernel_Base;
 struct Provider_OpKernelContext;
 struct Provider_OpKernelInfo;
-struct Provider_SessionState;
 struct Provider_Tensor;
 class TensorShape;
 
@@ -119,8 +117,6 @@ struct Provider_IAllocator {
   virtual void Free(void* p) = 0;
   const OrtMemoryInfo& Info() const { return memory_info_; };
 
-  virtual FencePtr CreateFence(const Provider_SessionState* /*session_state*/) { return nullptr; }
-
   virtual bool IsProviderInterface() const { return true; }
 
   template <typename T>
@@ -259,17 +255,6 @@ struct Provider_IExecutionProvider {
   void operator=(const Provider_IExecutionProvider&) = delete;
 };
 
-struct Provider_IDataTransfer {
-  Provider_IDataTransfer() = default;
-  virtual ~Provider_IDataTransfer() {}
-
-  virtual bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const = 0;
-  virtual common::Status CopyTensor(const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) const = 0;
-
-  Provider_IDataTransfer(const Provider_IDataTransfer&) = delete;
-  void operator=(const Provider_IDataTransfer&) = delete;
-};
-
 struct Provider {
   virtual std::shared_ptr<Provider_IExecutionProviderFactory> CreateExecutionProviderFactory(int device_id) = 0;
 };
@@ -286,6 +271,18 @@ struct ProviderHost {
 
   virtual std::unique_ptr<Provider_IAllocator> CreateCPUAllocator(const OrtMemoryInfo& memory_info) = 0;
 
+#ifdef USE_TENSORRT
+  virtual std::unique_ptr<Provider_IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name) = 0;
+  virtual std::unique_ptr<Provider_IAllocator> CreateCUDAPinnedAllocator(int16_t device_id, const char* name) = 0;
+  virtual std::unique_ptr<Provider_IDataTransfer> CreateGPUDataTransfer() = 0;
+
+  virtual void cuda__Impl_Cast(const int64_t* input_data, int32_t* output_data, size_t count) = 0;
+  virtual void cuda__Impl_Cast(const int32_t* input_data, int64_t* output_data, size_t count) = 0;
+
+  virtual bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) = 0;
+  virtual bool CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) = 0;
+#endif
+
   virtual std::unique_ptr<Provider_IExecutionProvider_Router> Create_IExecutionProvider_Router(Provider_IExecutionProvider* outer, const std::string& type) = 0;
 
   virtual std::string GetEnvironmentVar(const std::string& var_name) = 0;
@@ -384,7 +381,9 @@ struct ProviderHost {
 
   // Provider_DataTransferManager
   virtual Status Provider_DataTransferManager__CopyTensor(const Provider_DataTransferManager* p, const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) = 0;
-  virtual const Provider_IDataTransfer* Provider_DataTransferManager__GetProviderDataTransfer(const Provider_DataTransferManager* p, const OrtDevice& src_device, const OrtDevice& dst_device) = 0;
+
+  // Provider_IDataTransfer
+  virtual void Provider_IDataTransfer__operator_delete(Provider_IDataTransfer* p) = 0;
 
   // Provider_IndexedSubGraph_MetaDef
   virtual std::unique_ptr<Provider_IndexedSubGraph_MetaDef> Provider_IndexedSubGraph_MetaDef__construct() = 0;
@@ -541,9 +540,6 @@ struct ProviderHost {
   virtual const Provider_DataTransferManager& Provider_OpKernelInfo__GetDataTransferManager(const Provider_OpKernelInfo* p) noexcept = 0;
   virtual int Provider_OpKernelInfo__GetKernelDef_ExecQueueId(const Provider_OpKernelInfo* p) noexcept = 0;
 
-  // Provider_SessionState
-  virtual const Provider_DataTransferManager& Provider_SessionState__GetDataTransferManager(const Provider_SessionState* p) = 0;
-
   // Provider_Tensor
   virtual float* Provider_Tensor__MutableData_float(Provider_Tensor* p) = 0;
   virtual const float* Provider_Tensor__Data_float(const Provider_Tensor* p) = 0;
@@ -686,11 +682,18 @@ struct Provider_ComputeCapability {
 
 struct Provider_DataTransferManager {
   Status CopyTensor(const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) const { return g_host->Provider_DataTransferManager__CopyTensor(this, src, dst, exec_queue_id); }
-  const Provider_IDataTransfer* GetProviderDataTransfer(const OrtDevice& src_device, const OrtDevice& dst_device) const { return g_host->Provider_DataTransferManager__GetProviderDataTransfer(this, src_device, dst_device); }
 
   PROVIDER_DISALLOW_ALL(Provider_DataTransferManager)
 };
 
+struct Provider_IDataTransfer {
+  static void operator delete(void* p) { g_host->Provider_IDataTransfer__operator_delete(reinterpret_cast<Provider_IDataTransfer*>(p)); }
+
+  Provider_IDataTransfer() = delete;
+  Provider_IDataTransfer(const Provider_IDataTransfer&) = delete;
+  void operator=(const Provider_IDataTransfer&) = delete;
+};
+
 struct Provider_IndexedSubGraph_MetaDef {
   static std::unique_ptr<Provider_IndexedSubGraph_MetaDef> Create() { return g_host->Provider_IndexedSubGraph_MetaDef__construct(); }
   static void operator delete(void* p) { g_host->Provider_IndexedSubGraph_MetaDef__operator_delete(reinterpret_cast<Provider_IndexedSubGraph_MetaDef*>(p)); }
@@ -1015,10 +1018,6 @@ inline Status Provider_OpKernelInfo::GetAttr<float>(const std::string& name, flo
   return GetAttr(name, value);
 }
 
-struct Provider_SessionState {
-  const Provider_DataTransferManager& GetDataTransferManager() const { return g_host->Provider_SessionState__GetDataTransferManager(this); }
-};
-
 struct Provider_Tensor {
   float* MutableData_float() { return g_host->Provider_Tensor__MutableData_float(this); }
   const float* Data_float() const { return g_host->Provider_Tensor__Data_float(this); }
diff --git a/onnxruntime/core/providers/tensorrt/cuda.cc b/onnxruntime/core/providers/tensorrt/cuda.cc
deleted file mode 100644
index 7854f8383c..0000000000
--- a/onnxruntime/core/providers/tensorrt/cuda.cc
+++ /dev/null
@@ -1,146 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "core/providers/shared_library/provider_api.h"
-#include "core/providers/cuda/shared_inc/cuda_call.h"
-
-#ifdef _WIN32
-#else  // POSIX
-#include <unistd.h>
-#include <string.h>
-#endif
-
-namespace onnxruntime {
-
-using namespace common;
-
-template <typename ERRTYPE>
-const char* CudaErrString(ERRTYPE x) {
-  ORT_NOT_IMPLEMENTED();
-}
-
-#define CASE_ENUM_TO_STR(x) \
-  case x:                   \
-    return #x
-
-template <>
-const char* CudaErrString<cudaError_t>(cudaError_t x) {
-  cudaDeviceSynchronize();
-  return cudaGetErrorString(x);
-}
-
-template <>
-const char* CudaErrString<cublasStatus_t>(cublasStatus_t e) {
-  cudaDeviceSynchronize();
-
-  switch (e) {
-    CASE_ENUM_TO_STR(CUBLAS_STATUS_SUCCESS);
-    CASE_ENUM_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED);
-    CASE_ENUM_TO_STR(CUBLAS_STATUS_ALLOC_FAILED);
-    CASE_ENUM_TO_STR(CUBLAS_STATUS_INVALID_VALUE);
-    CASE_ENUM_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH);
-    CASE_ENUM_TO_STR(CUBLAS_STATUS_MAPPING_ERROR);
-    CASE_ENUM_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED);
-    CASE_ENUM_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR);
-    CASE_ENUM_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED);
-    CASE_ENUM_TO_STR(CUBLAS_STATUS_LICENSE_ERROR);
-    default:
-      return "(look for CUBLAS_STATUS_xxx in cublas_api.h)";
-  }
-}
-
-template <>
-const char* CudaErrString<curandStatus>(curandStatus) {
-  cudaDeviceSynchronize();
-  return "(see curand.h & look for curandStatus or CURAND_STATUS_xxx)";
-}
-
-template <>
-const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t e) {
-  cudaDeviceSynchronize();
-  return cudnnGetErrorString(e);
-}
-
-template <>
-const char* CudaErrString<cufftResult>(cufftResult e) {
-  cudaDeviceSynchronize();
-  switch (e) {
-    CASE_ENUM_TO_STR(CUFFT_SUCCESS);
-    CASE_ENUM_TO_STR(CUFFT_ALLOC_FAILED);
-    CASE_ENUM_TO_STR(CUFFT_INVALID_VALUE);
-    CASE_ENUM_TO_STR(CUFFT_INTERNAL_ERROR);
-    CASE_ENUM_TO_STR(CUFFT_SETUP_FAILED);
-    CASE_ENUM_TO_STR(CUFFT_INVALID_SIZE);
-    default:
-      return "Unknown cufft error status";
-  }
-}
-
-#ifdef USE_NCCL
-template <>
-const char* CudaErrString<ncclResult_t>(ncclResult_t e) {
-  cudaDeviceSynchronize();
-  return ncclGetErrorString(e);
-}
-#endif
-
-template <typename ERRTYPE, bool THRW>
-bool CudaCall(ERRTYPE retCode, const char* exprString, const char* libName, ERRTYPE successCode, const char* msg) {
-  if (retCode != successCode) {
-    try {
-#ifdef _WIN32
-      auto del = [](char* p) { free(p); };
-      std::unique_ptr<char, decltype(del)> hostname_ptr(nullptr, del);
-      size_t hostname_len = 0;
-      char* hostname = nullptr;
-      if (-1 == _dupenv_s(&hostname, &hostname_len, "COMPUTERNAME"))
-        hostname = "?";
-      else
-        hostname_ptr.reset(hostname);
-#else
-      char hostname[HOST_NAME_MAX];
-      if (gethostname(hostname, HOST_NAME_MAX) != 0)
-        strcpy(hostname, "?");
-#endif
-      int currentCudaDevice;
-      cudaGetDevice(&currentCudaDevice);
-      cudaGetLastError();  // clear last CUDA error
-      static char str[1024];
-      snprintf(str, 1024, "%s failure %d: %s ; GPU=%d ; hostname=%s ; expr=%s; %s",
-               libName, (int)retCode, CudaErrString(retCode), currentCudaDevice,
-               hostname,
-               exprString, msg);
-      if (THRW) {
-        // throw an exception with the error info
-        ORT_THROW(str);
-      } else {
-        LOGS_DEFAULT(ERROR) << str;
-      }
-    } catch (const std::exception& e) {  // catch, log, and rethrow since CUDA code sometimes hangs in destruction, so we'd never get to see the error
-      if (THRW) {
-        ORT_THROW(e.what());
-      } else {
-        LOGS_DEFAULT(ERROR) << e.what();
-      }
-    }
-    return false;
-  }
-  return true;
-}
-
-template bool CudaCall<cudaError, false>(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg);
-template bool CudaCall<cudaError, true>(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg);
-template bool CudaCall<cublasStatus_t, false>(cublasStatus_t retCode, const char* exprString, const char* libName, cublasStatus_t successCode, const char* msg);
-template bool CudaCall<cublasStatus_t, true>(cublasStatus_t retCode, const char* exprString, const char* libName, cublasStatus_t successCode, const char* msg);
-template bool CudaCall<cudnnStatus_t, false>(cudnnStatus_t retCode, const char* exprString, const char* libName, cudnnStatus_t successCode, const char* msg);
-template bool CudaCall<cudnnStatus_t, true>(cudnnStatus_t retCode, const char* exprString, const char* libName, cudnnStatus_t successCode, const char* msg);
-template bool CudaCall<curandStatus_t, false>(curandStatus_t retCode, const char* exprString, const char* libName, curandStatus_t successCode, const char* msg);
-template bool CudaCall<curandStatus_t, true>(curandStatus_t retCode, const char* exprString, const char* libName, curandStatus_t successCode, const char* msg);
-template bool CudaCall<cufftResult, false>(cufftResult retCode, const char* exprString, const char* libName, cufftResult successCode, const char* msg);
-template bool CudaCall<cufftResult, true>(cufftResult retCode, const char* exprString, const char* libName, cufftResult successCode, const char* msg);
-
-#ifdef USE_NCCL
-template bool CudaCall<ncclResult_t, false>(ncclResult_t retCode, const char* exprString, const char* libName, ncclResult_t successCode, const char* msg);
-#endif
-
-}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/tensorrt/cuda_allocator.cc b/onnxruntime/core/providers/tensorrt/cuda_allocator.cc
deleted file mode 100644
index c891a49ace..0000000000
--- a/onnxruntime/core/providers/tensorrt/cuda_allocator.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "core/providers/shared_library/provider_api.h"
-#include "core/providers/cuda/shared_inc/cuda_call.h"
-
-#include "cuda_allocator.h"
-#include "cuda_fence.h"
-#include "gpu_data_transfer.h"
-
-namespace onnxruntime {
-
-static const GPUDataTransfer* GetGPUDataTransfer(const Provider_SessionState* session_state) {
-  OrtDevice gpu_device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0);
-  OrtDevice cpu_device;
-  return static_cast<const GPUDataTransfer*>(session_state->GetDataTransferManager().GetProviderDataTransfer(gpu_device, cpu_device));
-}
-
-void CUDAAllocator::CheckDevice(bool throw_when_fail) const {
-#ifndef NDEBUG
-  // check device to match at debug build
-  // if it's expected to change, call cudaSetDevice instead of the check
-  int current_device;
-  auto cuda_err = cudaGetDevice(&current_device);
-  if (cuda_err == cudaSuccess) {
-    ORT_ENFORCE(current_device == Info().id);
-  } else if (throw_when_fail) {
-    CUDA_CALL_THROW(cuda_err);
-  }
-#else
-  ORT_UNUSED_PARAMETER(throw_when_fail);
-#endif
-}
-
-void* CUDAAllocator::Alloc(size_t size) {
-  CheckDevice(true);
-  void* p = nullptr;
-  if (size > 0) {
-    //BFCArena was updated recently to handle the exception and adjust the request size
-    CUDA_CALL_THROW(cudaMalloc((void**)&p, size));
-  }
-  return p;
-}
-
-void CUDAAllocator::Free(void* p) {
-  CheckDevice(false);  // ignore CUDA failure when free
-  cudaFree(p);         // do not throw error since it's OK for cudaFree to fail during shutdown
-}
-
-FencePtr CUDAAllocator::CreateFence(const Provider_SessionState* session_state) {
-  return std::make_shared<CUDAFence>(GetGPUDataTransfer(session_state));
-}
-
-void* CUDAPinnedAllocator::Alloc(size_t size) {
-  void* p = nullptr;
-  if (size > 0) {
-    CUDA_CALL_THROW(cudaMallocHost((void**)&p, size));
-  }
-  return p;
-}
-
-void CUDAPinnedAllocator::Free(void* p) {
-  CUDA_CALL_THROW(cudaFreeHost(p));
-}
-
-FencePtr CUDAPinnedAllocator::CreateFence(const Provider_SessionState* session_state) {
-  return std::make_shared<CUDAFence>(GetGPUDataTransfer(session_state));
-}
-
-}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/tensorrt/cuda_allocator.h b/onnxruntime/core/providers/tensorrt/cuda_allocator.h
deleted file mode 100644
index 0eea033135..0000000000
--- a/onnxruntime/core/providers/tensorrt/cuda_allocator.h
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#pragma once
-
-namespace onnxruntime {
-
-class CUDAAllocator : public Provider_IAllocator {
- public:
-  CUDAAllocator(OrtDevice::DeviceId device_id, const char* name)
-      : Provider_IAllocator(
-            OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator,
-                          OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, device_id),
-                          device_id, OrtMemTypeDefault)) {}
-  void* Alloc(size_t size) override;
-  void Free(void* p) override;
-  FencePtr CreateFence(const Provider_SessionState* session_state) override;
-
- private:
-  void CheckDevice(bool throw_when_fail) const;
-};
-
-//TODO: add a default constructor
-class CUDAPinnedAllocator : public Provider_IAllocator {
- public:
-  CUDAPinnedAllocator(OrtDevice::DeviceId device_id, const char* name)
-      : Provider_IAllocator(
-            OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator,
-                          OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, device_id),
-                          device_id, OrtMemTypeCPUOutput)) {}
-
-  void* Alloc(size_t size) override;
-  void Free(void* p) override;
-  FencePtr CreateFence(const Provider_SessionState* session_state) override;
-};
-}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/tensorrt/cuda_fence.cc b/onnxruntime/core/providers/tensorrt/cuda_fence.cc
deleted file mode 100644
index 87eaf19289..0000000000
--- a/onnxruntime/core/providers/tensorrt/cuda_fence.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "core/providers/shared_library/provider_api.h"
-#include "core/providers/cuda/shared_inc/cuda_call.h"
-
-#include "cuda_allocator.h"
-#include "cuda_fence.h"
-#include "gpu_data_transfer.h"
-
-namespace onnxruntime {
-
-constexpr const char* kCudaExecutionProvider = "CUDAExecutionProvider";
-
-CUDAFence::CUDAFence(const GPUDataTransfer* data_transfer) : data_transfer_(data_transfer) {
-  // NOTE: cudaEventBlockingSync may leads to longer wait time because of thread yield/switching in kernel
-  // if lower CPU usage is more important than latency, we should use this flag to avoid spin-loop in WaitOnCPU
-  int event_flags = /*cudaEventBlockingSync |*/ cudaEventDisableTiming;
-  CUDA_CALL_THROW(cudaEventCreate(&read_event_, event_flags));
-  CUDA_CALL_THROW(cudaEventCreate(&write_event_, event_flags));
-}
-
-CUDAFence::~CUDAFence() {
-  CUDA_CALL_THROW(cudaEventDestroy(read_event_));
-  CUDA_CALL_THROW(cudaEventDestroy(write_event_));
-}
-
-void CUDAFence::BeforeUsingAsInput(onnxruntime::ProviderType provider_type, int async_queue_id) {
-  if (provider_type == onnxruntime::kCudaExecutionProvider) {
-    // sync in GPU, the call is non-blocking on CPU
-    CUDA_CALL_THROW(cudaStreamWaitEvent(data_transfer_->GetStream(async_queue_id), write_event_, 0));
-  } else {
-    // sync on CPU for all other providers, this is blocking
-    CUDA_CALL_THROW(cudaEventSynchronize(write_event_));
-  }
-}
-
-void CUDAFence::BeforeUsingAsOutput(onnxruntime::ProviderType provider_type, int queue_id) {
-  if (provider_type == onnxruntime::kCudaExecutionProvider) {
-    // sync in GPU, the call is non-blocking on CPU
-    cudaStream_t stream = data_transfer_->GetStream(queue_id);
-    CUDA_CALL_THROW(cudaStreamWaitEvent(stream, read_event_, 0));
-    CUDA_CALL_THROW(cudaStreamWaitEvent(stream, write_event_, 0));
-  } else {
-    // sync on CPU for all other providers, this is blocking
-    CUDA_CALL_THROW(cudaEventSynchronize(read_event_));
-    CUDA_CALL_THROW(cudaEventSynchronize(write_event_));
-  }
-}
-
-bool CUDAFence::CanRelease() {
-  return cudaEventQuery(read_event_) == cudaSuccess &&
-         cudaEventQuery(write_event_) == cudaSuccess;
-}
-
-void CUDAFence::AfterUsedAsInput(int queue_id) {
-  // update read fence
-  cudaStream_t stream = data_transfer_->GetStream(queue_id);
-  CUDA_CALL_THROW(cudaEventRecord(read_event_, stream));
-}
-
-void CUDAFence::AfterUsedAsOutput(int queue_id) {
-  // update write fence
-  cudaStream_t stream = data_transfer_->GetStream(queue_id);
-  CUDA_CALL_THROW(cudaEventRecord(write_event_, stream));
-}
-
-}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/tensorrt/cuda_fence.h b/onnxruntime/core/providers/tensorrt/cuda_fence.h
deleted file mode 100644
index efe50902e0..0000000000
--- a/onnxruntime/core/providers/tensorrt/cuda_fence.h
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#pragma once
-
-namespace onnxruntime {
-class GPUDataTransfer;
-
-class CUDAFence : public IFence {
- public:
-  CUDAFence(const GPUDataTransfer* data_transfer);
-  virtual ~CUDAFence();
-  virtual void BeforeUsingAsInput(onnxruntime::ProviderType provider_type, int queue_id) override;
-  virtual void BeforeUsingAsOutput(onnxruntime::ProviderType provider_type, int queue_id) override;
-  virtual void AfterUsedAsInput(int queue_id) override;
-  virtual void AfterUsedAsOutput(int queue_id) override;
-  virtual bool CanRelease() override;
-
- private:
-  cudaEvent_t read_event_;
-  cudaEvent_t write_event_;
-  const GPUDataTransfer* data_transfer_;
-};
-
-}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/tensorrt/gpu_data_transfer.cc b/onnxruntime/core/providers/tensorrt/gpu_data_transfer.cc
deleted file mode 100644
index cdc810616d..0000000000
--- a/onnxruntime/core/providers/tensorrt/gpu_data_transfer.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "core/providers/shared_library/provider_api.h"
-#include "core/providers/cuda/shared_inc/cuda_call.h"
-
-#include "gpu_data_transfer.h"
-
-#define CUDA_RETURN_IF_ERROR(expr)               \
-  ORT_RETURN_IF_ERROR(CUDA_CALL(expr)            \
-                          ? common::Status::OK() \
-                          : ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "CUDA error executing ", #expr))
-
-namespace onnxruntime {
-GPUDataTransfer::GPUDataTransfer() {
-  // create streams, default is nullptr
-  streams_[kCudaStreamDefault] = nullptr;
-  CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyIn], cudaStreamNonBlocking));
-  CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyOut], cudaStreamNonBlocking));
-}
-
-GPUDataTransfer::~GPUDataTransfer() {
-  CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyIn]));
-  CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyOut]));
-}
-
-bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
-  return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::CUDA_PINNED || dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED;
-}
-
-common::Status GPUDataTransfer::CopyTensor(const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) const {
-  size_t bytes = src.SizeInBytes();
-  const void* src_data = src.DataRaw();
-  void* dst_data = dst.MutableDataRaw();
-
-  auto& src_device = src.Location().device;
-  auto& dst_device = dst.Location().device;
-
-  if (dst_device.Type() == OrtDevice::GPU) {
-    if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
-      // copy from pinned memory to GPU, this is non-blocking
-      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, streams_[exec_queue_id]));
-    } else if (src_device.Type() == OrtDevice::GPU) {
-      // copying between GPU, this is non-blocking
-      // Copy only if the two addresses are different.
-      if (dst_data != src_data) {
-        CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice, streams_[kCudaStreamDefault]));
-      }
-    } else {
-      // copy from other CPU memory to GPU, this is blocking
-      CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyHostToDevice));
-    }
-  } else if (src_device.Type() == OrtDevice::GPU) {
-    if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
-      // copying from GPU to pinned memory, this is non-blocking
-      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, streams_[exec_queue_id]));
-    } else {
-      // copying from GPU to CPU memory, this is blocking
-      CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToHost));
-    }
-  } else {
-    // copying between cpu memory
-    memcpy(dst_data, src_data, bytes);
-  }
-
-  return Status::OK();
-}
-}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/tensorrt/gpu_data_transfer.h b/onnxruntime/core/providers/tensorrt/gpu_data_transfer.h
deleted file mode 100644
index 9b206664f4..0000000000
--- a/onnxruntime/core/providers/tensorrt/gpu_data_transfer.h
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#pragma once
-
-namespace onnxruntime {
-
-enum CUDAStreamType : int {
-  kCudaStreamDefault = 0,
-  kCudaStreamCopyIn,
-  kCudaStreamCopyOut,
-  kTotalCudaStreams,
-};
-
-class GPUDataTransfer : public Provider_IDataTransfer {
- public:
-  GPUDataTransfer();
-  ~GPUDataTransfer();
-
-  bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;
-
-  // Suppress MSVC warning about not fully overriding
-  using Provider_IDataTransfer::CopyTensor;
-  common::Status CopyTensor(const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) const override;
-
-  cudaStream_t GetStream(int queue_id) const {
-    ORT_ENFORCE(queue_id >= 0 && queue_id < kTotalCudaStreams);
-    return streams_[queue_id];
-  }
-
- private:
-  cudaStream_t streams_[kTotalCudaStreams];
-};
-
-}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index 91582a7077..225a17703a 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -2,25 +2,25 @@
 // Licensed under the MIT License.
 
 #include <fstream>
-#include <limits>
 #include <list>
-#include <map>
-#include <memory>
 #include <unordered_set>
-#include <unordered_map>
-#include <utility>
 #include "core/providers/shared_library/provider_api.h"
 #define ORT_API_MANUAL_INIT
 #include "core/session/onnxruntime_cxx_api.h"
 #include "core/common/safeint.h"
 
 #include "tensorrt_execution_provider.h"
-#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
+#include "core/providers/cuda/cuda_allocator.h"
 #include "core/providers/cuda/shared_inc/cuda_call.h"
-#include "gpu_data_transfer.h"
+#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
+#include "cuda_runtime_api.h"
 #include "gsl/gsl"
 #include <experimental/filesystem>
-#include "cuda_allocator.h"
+#include <unordered_map>
+#include <utility>
+#include <limits>
+#include <map>
+#include <memory>
 
 #define CUDA_RETURN_IF_ERROR(expr)               \
   ORT_RETURN_IF_ERROR(CUDA_CALL(expr)            \
@@ -69,6 +69,33 @@ struct ShutdownProtobuf {
 
 namespace onnxruntime {
 
+namespace cuda {
+template <>
+void Impl_Cast(
+    const int64_t* input_data, int32_t* output_data,
+    size_t count) {
+  return g_host->cuda__Impl_Cast(input_data, output_data, count);
+}
+
+template <>
+void Impl_Cast(
+    const int32_t* input_data, int64_t* output_data,
+    size_t count) {
+  return g_host->cuda__Impl_Cast(input_data, output_data, count);
+}
+
+}  // namespace cuda
+
+template <>
+bool CudaCall<cudaError, false>(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg) {
+  return g_host->CudaCall_false(retCode, exprString, libName, successCode, msg);
+}
+
+template <>
+bool CudaCall<cudaError, true>(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg) {
+  return g_host->CudaCall_true(retCode, exprString, libName, successCode, msg);
+}
+
 constexpr const char* TRT = "Tensorrt";
 constexpr const char* TRT_PINNED = "TensorrtPinned";
 
@@ -131,7 +158,7 @@ KernelRegistryAndStatus GetTensorrtKernelRegistry() {
 }
 
 std::shared_ptr<Provider_KernelRegistry> TensorrtExecutionProvider::Provider_GetKernelRegistry() const {
-  static KernelRegistryAndStatus k = GetTensorrtKernelRegistry();
+  static KernelRegistryAndStatus k = onnxruntime::GetTensorrtKernelRegistry();
   // throw if the registry failed to initialize
   ORT_THROW_IF_ERROR(k.st);
   return k.kernel_registry;
@@ -148,12 +175,12 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
   CUDA_CALL_THROW(cudaSetDevice(device_id_));
 
   Provider_AllocatorCreationInfo default_memory_info(
-      [](int id) { return onnxruntime::make_unique<CUDAAllocator>(id, TRT); }, device_id_);
+      [](int id) { return Provider_CreateCUDAAllocator(id, TRT); }, device_id_);
   allocator_ = CreateAllocator(default_memory_info);
   Provider_InsertAllocator(allocator_);
 
   Provider_AllocatorCreationInfo pinned_allocator_info(
-      [](int) { return onnxruntime::make_unique<CUDAPinnedAllocator>(0, TRT_PINNED); }, device_id_);
+      [](int) { return Provider_CreateCUDAPinnedAllocator(0, TRT_PINNED); }, device_id_);
   Provider_InsertAllocator(CreateAllocator(pinned_allocator_info));
 
   // Get environment variables
@@ -209,7 +236,7 @@ Provider_AllocatorPtr TensorrtExecutionProvider::Provider_GetAllocator(int id, O
 }
 
 std::unique_ptr<onnxruntime::Provider_IDataTransfer> TensorrtExecutionProvider::Provider_GetDataTransfer() const {
-  return onnxruntime::make_unique<GPUDataTransfer>();
+  return onnxruntime::Provider_CreateGPUDataTransfer();
 }
 
 // Convert GraphViewer graph to GraphProto
@@ -1345,5 +1372,4 @@ common::Status TensorrtExecutionProvider::Provider_Compile(const std::vector<onn
 
   return Status::OK();
 }
-
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
index 159776d0e8..973e3a0d6b 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@@ -2,14 +2,6 @@
 // Licensed under the MIT License.
 
 #pragma once
-
-#ifdef _WIN32
-#pragma warning(disable : 4244)  // '=': conversion from '_Ty2' to '_Ty', possible loss of data
-#pragma warning(disable : 4267)  // 'initializing': conversion from '<type1>' to '<type2>', possible loss of data
-#pragma warning(disable : 4996)  // 'gmtime': This function or variable may be unsafe. Consider using gmtime_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
-#pragma warning(disable : 4456)  // declaration of 'i' hides previous local declaration
-#endif
-
 #include <ctime>
 #include "NvInfer.h"
 #include "NvOnnxParser.h"