From 3bf0e364e2efd865d2972dae7196b09e52e2f8cd Mon Sep 17 00:00:00 2001
From: Ke Zhang <kezhan@microsoft.com>
Date: Thu, 11 Jul 2019 14:49:20 -0700
Subject: [PATCH] Move CopyTensor out of IExecutionProvider interface. (#1268)

* add ortdevice class

* add data transfer manager for copying tensors.

* update

* add data trasnfer for gpu

* fix constexpr build break.

* update

* remove unnecessary header files.

* remove unnecessary header files.

* add dependency

* add dependency

* add dependency

* add dependency

* fix linux build break.

* update

* fix build break

* fix build break

* fix build break

* update

* update

* update c api.

* update to not use OrtCreateAllocatorInfo

* change to all eps .

* fix linux build break

* remove useless codes.

* update

* move datatransfermanager in session state

* update

* fix cuda build break.

* fix comments

* fix windows GPU build.

* fix comments

* fix build break

* fix comments

* fix test failure

* update

* fix comments

* fix onnx runtime server.

* update

* fix test failure.

* fix comments

* fix comment
---
 cmake/onnxruntime.cmake                       |  4 ++
 cmake/onnxruntime_session.cmake               |  3 +
 .../onnxruntime/core/framework/allocator.h    | 65 ++++++++++++++++++-
 .../core/framework/execution_provider.h       | 14 ----
 .../core/framework/kernel_registry.h          |  7 +-
 .../core/framework/op_kernel_info.h           | 11 +++-
 onnxruntime/core/framework/allocator.cc       | 12 +++-
 onnxruntime/core/framework/arena.h            |  2 +-
 onnxruntime/core/framework/bfc_arena.cc       |  2 +-
 onnxruntime/core/framework/data_transfer.cc   | 29 +++++++++
 onnxruntime/core/framework/data_transfer.h    | 28 ++++++++
 .../core/framework/data_transfer_manager.cc   | 54 +++++++++++++++
 .../core/framework/data_transfer_manager.h    | 32 +++++++++
 .../core/framework/execution_provider.cc      |  8 ---
 onnxruntime/core/framework/kernel_registry.cc | 16 +++--
 .../core/framework/kernel_registry_manager.cc |  4 +-
 onnxruntime/core/framework/memcpy.cc          |  3 +-
 onnxruntime/core/framework/op_kernel_info.cc  | 14 +++-
 onnxruntime/core/framework/session_state.h    |  6 +-
 .../framework/session_state_initializer.cc    | 17 +++--
 onnxruntime/core/framework/utils.cc           | 28 ++++----
 .../optimizer/optimizer_execution_frame.cc    |  5 +-
 .../optimizer/optimizer_execution_frame.h     |  4 +-
 .../providers/cpu/cpu_execution_provider.h    |  4 --
 .../providers/cpu/cpu_provider_factory.cc     |  3 +-
 .../core/providers/cuda/cuda_allocator.cc     | 16 +++--
 .../core/providers/cuda/cuda_allocator.h      |  4 +-
 onnxruntime/core/providers/cuda/cuda_common.h |  3 +-
 .../providers/cuda/cuda_execution_provider.cc | 46 -------------
 .../providers/cuda/cuda_execution_provider.h  | 12 +---
 onnxruntime/core/providers/cuda/cuda_fence.cc | 11 ++--
 onnxruntime/core/providers/cuda/cuda_fence.h  |  8 ++-
 .../core/providers/cuda/gpu_data_transfer.cc  | 55 ++++++++++++++++
 .../core/providers/cuda/gpu_data_transfer.h   | 35 ++++++++++
 .../mkldnn/mkldnn_execution_provider.cc       | 21 +-----
 .../mkldnn/mkldnn_execution_provider.h        |  2 -
 .../ngraph/ngraph_execution_provider.cc       | 22 +------
 .../ngraph/ngraph_execution_provider.h        |  2 -
 .../openvino/openvino_execution_provider.cc   |  2 +-
 .../openvino/openvino_execution_provider.h    | 10 ---
 .../providers/tensorrt/tensorrt_allocator.h   |  5 +-
 .../tensorrt/tensorrt_execution_provider.cc   |  6 --
 .../tensorrt/tensorrt_execution_provider.h    |  2 -
 onnxruntime/core/session/IOBinding.h          |  2 +-
 .../session/default_cpu_allocator_c_api.cc    |  2 +-
 onnxruntime/core/session/inference_session.cc | 13 +++-
 onnxruntime/core/session/inference_session.h  |  2 +
 onnxruntime/core/session/onnxruntime_c_api.cc |  2 +-
 onnxruntime/server/executor.cc                |  4 +-
 .../test/framework/allocation_planner_test.cc |  2 +-
 onnxruntime/test/framework/allocator_test.cc  |  2 +-
 onnxruntime/test/framework/dummy_provider.h   | 17 -----
 .../test/framework/inference_session_test.cc  | 12 ++--
 onnxruntime/test/framework/op_kernel_test.cc  |  7 --
 .../test/framework/session_state_test.cc      |  2 +-
 onnxruntime/test/onnx/TestCase.cc             |  4 +-
 onnxruntime/test/shared_lib/test_allocator.cc |  2 +-
 onnxruntime/test/util/test_allocator.cc       |  2 +-
 58 files changed, 459 insertions(+), 253 deletions(-)
 create mode 100644 onnxruntime/core/framework/data_transfer.cc
 create mode 100644 onnxruntime/core/framework/data_transfer.h
 create mode 100644 onnxruntime/core/framework/data_transfer_manager.cc
 create mode 100644 onnxruntime/core/framework/data_transfer_manager.h
 create mode 100644 onnxruntime/core/providers/cuda/gpu_data_transfer.cc
 create mode 100644 onnxruntime/core/providers/cuda/gpu_data_transfer.h

diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index 7122d05f92..91508a8aa8 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -31,6 +31,10 @@ add_dependencies(onnxruntime onnxruntime_generate_def ${onnxruntime_EXTERNAL_DEP
 target_include_directories(onnxruntime PRIVATE ${ONNXRUNTIME_ROOT})
 onnxruntime_add_include_to_target(onnxruntime gsl)
 
+if (onnxruntime_USE_CUDA)
+  target_include_directories(onnxruntime PRIVATE ${onnxruntime_CUDNN_HOME}/include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+endif()
+
 if(UNIX)
   if (APPLE)
     set(BEGIN_WHOLE_ARCHIVE -Xlinker -all_load)
diff --git a/cmake/onnxruntime_session.cmake b/cmake/onnxruntime_session.cmake
index ba0d8129ac..96e9fa55bc 100644
--- a/cmake/onnxruntime_session.cmake
+++ b/cmake/onnxruntime_session.cmake
@@ -15,6 +15,9 @@ onnxruntime_add_include_to_target(onnxruntime_session onnxruntime_common onnxrun
 target_include_directories(onnxruntime_session PRIVATE ${ONNXRUNTIME_ROOT} ${eigen_INCLUDE_DIRS})
 add_dependencies(onnxruntime_session ${onnxruntime_EXTERNAL_DEPENDENCIES})
 set_target_properties(onnxruntime_session PROPERTIES FOLDER "ONNXRuntime")
+if (onnxruntime_USE_CUDA)
+  target_include_directories(onnxruntime_session PRIVATE ${onnxruntime_CUDNN_HOME}/include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+endif()
 
 if (onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS)
   add_definitions(-DENABLE_LANGUAGE_INTEROP_OPS)
diff --git a/include/onnxruntime/core/framework/allocator.h b/include/onnxruntime/core/framework/allocator.h
index 462aed63f1..8a37553ea9 100644
--- a/include/onnxruntime/core/framework/allocator.h
+++ b/include/onnxruntime/core/framework/allocator.h
@@ -15,21 +15,80 @@
 #include "core/framework/fence.h"
 #include "core/session/onnxruntime_c_api.h"
 
+// Struct to represent a physical device.
+struct OrtDevice {
+  using DeviceType = int8_t;
+  using MemoryType = int8_t;
+  using DeviceId = int16_t;
+
+  // Pre-defined device types.
+  static const DeviceType CPU = 0;
+  static const DeviceType GPU = 1;  //CUDA
+  static const DeviceType FPGA = 2;
+
+  struct MemType {
+    // Pre-defined memory types.
+    static const MemoryType DEFAULT = 0;
+    static const MemoryType CUDA_PINNED = 1;
+  };
+
+  constexpr OrtDevice(DeviceType device_type_, MemoryType memory_type_, DeviceId device_id_)
+      : device_type(device_type_),
+        memory_type(memory_type_),
+        device_id(device_id_) {}
+
+  constexpr OrtDevice() : OrtDevice(CPU, MemType::DEFAULT, 0) {}
+
+  DeviceType Type() const {
+    return device_type;
+  }
+
+  MemoryType MemType() const {
+    return memory_type;
+  }
+
+  DeviceId Id() const {
+    return device_id;
+  }
+
+  std::string ToString() const {
+    std::ostringstream ostr;
+    ostr << "Device: ["
+         << " type:" << static_cast<int>(device_type)
+         << " memory_type:" << static_cast<int>(memory_type)
+         << " device_id:" << device_id
+         << "]";
+    return ostr.str();
+  }
+
+ private:
+  // Device type.
+  DeviceType device_type;
+
+  // Memory type.
+  MemoryType memory_type;
+
+  // Device index.
+  DeviceId device_id;
+};
+
 struct OrtAllocatorInfo {
   // use string for name, so we could have customized allocator in execution provider.
   const char* name;
   int id;
   OrtMemType mem_type;
   OrtAllocatorType type;
+  OrtDevice device;
 
-  constexpr OrtAllocatorInfo(const char* name_, OrtAllocatorType type_, int id_ = 0, OrtMemType mem_type_ = OrtMemTypeDefault)
+  constexpr OrtAllocatorInfo(const char* name_, OrtAllocatorType type_, OrtDevice device_ = OrtDevice(), int id_ = 0, OrtMemType mem_type_ = OrtMemTypeDefault)
 #if (defined(__GNUC__) || defined(__clang__))
       __attribute__((nonnull))
 #endif
       : name(name_),
         id(id_),
         mem_type(mem_type_),
-        type(type_) {
+        type(type_),
+        device(device_) {
   }
 
   // To make OrtAllocatorInfo become a valid key in std map
@@ -67,6 +126,8 @@ std::ostream& operator<<(std::ostream& out, const OrtAllocatorInfo& info);
 
 namespace onnxruntime {
 constexpr const char* CPU = "Cpu";
+constexpr const char* CUDA = "Cuda";
+constexpr const char* CUDA_PINNED = "CudaPinned";
 
 // forward declaration
 class SessionState;
diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h
index d3b1aa5d75..6e7c919601 100644
--- a/include/onnxruntime/core/framework/execution_provider.h
+++ b/include/onnxruntime/core/framework/execution_provider.h
@@ -84,20 +84,6 @@ class IExecutionProvider {
   */
   virtual std::shared_ptr<KernelRegistry> GetKernelRegistry() const;
 
-  /**
-   * Copy tensor between execution providers.  It's always a deep copy
-   * Either src.location is CPU, or dst.location is CPU. They can't be both on CPU.
-   */
-  virtual common::Status CopyTensor(const Tensor& src, Tensor& dst) const = 0;
-
-  /**
-   * Copy tensor between execution providers on specified exec queue
-   * It's always a deep copy
-   * Either src.location is CPU, or dst.location is CPU. They can't be both on CPU.
-   */
-  virtual common::Status CopyTensor(const Tensor& src, Tensor& dst,
-                                    int exec_queue_id) const;
-
   /**
      Returns an opaque handle whose exact type varies based on the provider
      and is interpreted accordingly by the corresponding kernel implementation.
diff --git a/include/onnxruntime/core/framework/kernel_registry.h b/include/onnxruntime/core/framework/kernel_registry.h
index 0b956f2cd2..3a0d35e298 100644
--- a/include/onnxruntime/core/framework/kernel_registry.h
+++ b/include/onnxruntime/core/framework/kernel_registry.h
@@ -24,9 +24,12 @@ class KernelRegistry {
   // for its clients unless the factory is managing the lifecycle of the pointer
   // itself.
   // TODO(Task:132) Make usage of unique_ptr/shared_ptr as out param consistent
-  Status TryCreateKernel(const onnxruntime::Node& node, const IExecutionProvider& execution_provider,
+  Status TryCreateKernel(const onnxruntime::Node& node,
+                         const IExecutionProvider& execution_provider,
                          const std::unordered_map<int, OrtValue>& constant_initialized_tensors,
-                         const OrtValueNameIdxMap& mlvalue_name_idx_map, const FuncManager& funcs_mgr,
+                         const OrtValueNameIdxMap& mlvalue_name_idx_map,
+                         const FuncManager& funcs_mgr,
+                         const DataTransferManager& data_transfer_mgr,
                          std::unique_ptr<OpKernel>& op_kernel) const;
 
   // Check if an execution provider can create kernel for a node and return
diff --git a/include/onnxruntime/core/framework/op_kernel_info.h b/include/onnxruntime/core/framework/op_kernel_info.h
index 22941df4f4..e377f0d4e4 100644
--- a/include/onnxruntime/core/framework/op_kernel_info.h
+++ b/include/onnxruntime/core/framework/op_kernel_info.h
@@ -15,16 +15,20 @@ namespace onnxruntime {
 
 class OrtValueNameIdxMap;
 class FuncManager;
+class DataTransferManager;
 
 // A very light-weight class, which works as an aggregated
 // view of all data needed for constructing a Kernel instance.
 // NOTE: it does not own/hold any objects.
 class OpKernelInfo : public OpNodeProtoHelper<ProtoHelperNodeContext> {
  public:
-  explicit OpKernelInfo(const onnxruntime::Node& node, const KernelDef& kernel_def,
+  explicit OpKernelInfo(const onnxruntime::Node& node,
+                        const KernelDef& kernel_def,
                         const IExecutionProvider& execution_provider,
                         const std::unordered_map<int, OrtValue>& constant_initialized_tensors,
-                        const OrtValueNameIdxMap& mlvalue_name_idx_map, const FuncManager& funcs_mgr);
+                        const OrtValueNameIdxMap& mlvalue_name_idx_map,
+                        const FuncManager& funcs_mgr,
+                        const DataTransferManager& data_transfer_mgr);
 
   OpKernelInfo(const OpKernelInfo& other);
 
@@ -36,6 +40,8 @@ class OpKernelInfo : public OpNodeProtoHelper<ProtoHelperNodeContext> {
 
   const IExecutionProvider* GetExecutionProvider() const noexcept;
 
+  const DataTransferManager& GetDataTransferManager() const noexcept;
+
   const onnxruntime::Node& node() const noexcept;
 
   bool TryGetConstantInput(int input_index, const Tensor** constant_input_value) const;
@@ -56,6 +62,7 @@ class OpKernelInfo : public OpNodeProtoHelper<ProtoHelperNodeContext> {
   const std::unordered_map<int, OrtValue>& constant_initialized_tensors_;
   const OrtValueNameIdxMap& ort_value_name_idx_map_;
   const FuncManager& funcs_mgr_;
+  const DataTransferManager& data_transfer_mgr_;
   ProtoHelperNodeContext proto_helper_context_;
 };
 
diff --git a/onnxruntime/core/framework/allocator.cc b/onnxruntime/core/framework/allocator.cc
index c136465b71..aebda21ae2 100644
--- a/onnxruntime/core/framework/allocator.cc
+++ b/onnxruntime/core/framework/allocator.cc
@@ -18,7 +18,7 @@ void* CPUAllocator::Alloc(size_t size) {
 #elif defined(__AVX__)
   size_t alignment = 32;
 #else
-  size_t alignment = 32; //Indeed, the default one(8 or 16) should be enough
+  size_t alignment = 32;  //Indeed, the default one(8 or 16) should be enough
 #endif
 #if _MSC_VER
   p = _aligned_malloc(size, alignment);
@@ -52,7 +52,15 @@ std::ostream& operator<<(std::ostream& out, const OrtAllocatorInfo& info) {
 
 ORT_API_STATUS_IMPL(OrtCreateAllocatorInfo, _In_ const char* name1, OrtAllocatorType type, int id1,
                     OrtMemType mem_type1, _Out_ OrtAllocatorInfo** out) {
-  *out = new OrtAllocatorInfo(name1, type, id1, mem_type1);
+  if (strcmp(name1, onnxruntime::CPU) == 0) {
+    *out = new OrtAllocatorInfo(name1, type, OrtDevice(), id1, mem_type1);
+  } else if (strcmp(name1, onnxruntime::CUDA) == 0) {
+    *out = new OrtAllocatorInfo(name1, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast<OrtDevice::DeviceId>(id1)), id1, mem_type1);
+  } else if (strcmp(name1, onnxruntime::CUDA_PINNED) == 0) {
+    *out = new OrtAllocatorInfo(name1, type, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, static_cast<OrtDevice::DeviceId>(id1)), id1, mem_type1);
+  } else {
+    return OrtCreateStatus(ORT_INVALID_ARGUMENT, "Specified device is not supported.");
+  }
   return nullptr;
 }
 
diff --git a/onnxruntime/core/framework/arena.h b/onnxruntime/core/framework/arena.h
index a4df32fe95..24872e5026 100644
--- a/onnxruntime/core/framework/arena.h
+++ b/onnxruntime/core/framework/arena.h
@@ -37,7 +37,7 @@ class DummyArena : public IArenaAllocator {
  public:
   explicit DummyArena(std::unique_ptr<IDeviceAllocator> resource_allocator)
       : allocator_(std::move(resource_allocator)),
-        info_(allocator_->Info().name, OrtAllocatorType::OrtArenaAllocator, allocator_->Info().id) {
+        info_(allocator_->Info().name, OrtAllocatorType::OrtArenaAllocator, allocator_->Info().device, allocator_->Info().id) {
   }
 
   ~DummyArena() override = default;
diff --git a/onnxruntime/core/framework/bfc_arena.cc b/onnxruntime/core/framework/bfc_arena.cc
index eecdac5a6e..a795d28b4c 100644
--- a/onnxruntime/core/framework/bfc_arena.cc
+++ b/onnxruntime/core/framework/bfc_arena.cc
@@ -9,7 +9,7 @@ BFCArena::BFCArena(std::unique_ptr<IDeviceAllocator> resource_allocator,
     : device_allocator_(std::move(resource_allocator)),
       free_chunks_list_(kInvalidChunkHandle),
       next_allocation_id_(1),
-      info_(device_allocator_->Info().name, OrtAllocatorType::OrtArenaAllocator, device_allocator_->Info().id, device_allocator_->Info().mem_type) {
+      info_(device_allocator_->Info().name, OrtAllocatorType::OrtArenaAllocator, device_allocator_->Info().device, device_allocator_->Info().id, device_allocator_->Info().mem_type) {
   curr_region_allocation_bytes_ = RoundedBytes(std::min(total_memory, size_t{1048576}));
 
   // Allocate the requested amount of memory.
diff --git a/onnxruntime/core/framework/data_transfer.cc b/onnxruntime/core/framework/data_transfer.cc
new file mode 100644
index 0000000000..c028a12202
--- /dev/null
+++ b/onnxruntime/core/framework/data_transfer.cc
@@ -0,0 +1,29 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/framework/data_transfer.h"
+
+namespace onnxruntime {
+
+common::Status IDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const {
+  return CopyTensor(src, dst, 0);
+}
+
+bool CPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
+  return src_device.Type() == OrtDevice::CPU && dst_device.Type() == OrtDevice::CPU;
+}
+
+common::Status CPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int /*exec_queue_id*/) const {
+  const void* src_data = src.DataRaw();
+  void* dst_data = dst.MutableDataRaw();
+  if (src_data == dst_data) {
+    // no need copying as both pointers are referring to same piece of memory.
+    return Status::OK();
+  }
+  // Copying only happens between two same size tensors.
+  ORT_ENFORCE(src.Size() == dst.Size());
+  memcpy(dst_data, src_data, src.Size());
+  return Status::OK();
+}
+
+};  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/data_transfer.h b/onnxruntime/core/framework/data_transfer.h
new file mode 100644
index 0000000000..1e707b196b
--- /dev/null
+++ b/onnxruntime/core/framework/data_transfer.h
@@ -0,0 +1,28 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/common/status.h"
+#include "core/framework/tensor.h"
+
+namespace onnxruntime {
+
+// Data transfer interface.
+class IDataTransfer {
+ public:
+  virtual ~IDataTransfer() = default;
+
+  virtual bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const = 0;
+
+  virtual common::Status CopyTensor(const Tensor& src, Tensor& dst) const;
+  virtual common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const = 0;
+};
+
+class CPUDataTransfer : public IDataTransfer {
+ public:
+  CPUDataTransfer() = default;
+  bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;
+  common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const override;
+};
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/data_transfer_manager.cc b/onnxruntime/core/framework/data_transfer_manager.cc
new file mode 100644
index 0000000000..3f07ac7559
--- /dev/null
+++ b/onnxruntime/core/framework/data_transfer_manager.cc
@@ -0,0 +1,54 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/framework/data_transfer_manager.h"
+
+namespace onnxruntime {
+using namespace common;
+
+Status DataTransferManager::RegisterDataTransfer(std::unique_ptr<IDataTransfer> data_transfer) {
+  if (nullptr == data_transfer) {
+    return Status(ONNXRUNTIME, INVALID_ARGUMENT, "data_transfer registered is nullptr.");
+  }
+  datatransfers_.push_back(std::move(data_transfer));
+  return Status::OK();
+}
+
+const IDataTransfer* DataTransferManager::GetDataTransfer(const OrtDevice& src_device, const OrtDevice& dst_device) const {
+  for (auto& data_transfer : datatransfers_) {
+    if (!data_transfer->CanCopy(src_device, dst_device)) {
+      continue;
+    }
+
+    return data_transfer.get();
+  }
+  return nullptr;
+}
+
+
+Status DataTransferManager::CopyTensor(const Tensor& src, Tensor& dst) const {
+  return CopyTensor(src, dst, 0);
+}
+
+Status DataTransferManager::CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const {
+  if (src.Shape().Size() != dst.Shape().Size()) {
+    return Status(ONNXRUNTIME, FAIL, "Tensor size mismatch");
+  }
+
+  for (auto& data_transfer : datatransfers_) {
+    if (!data_transfer->CanCopy(src.Location().device, dst.Location().device)) {
+      continue;
+    }
+
+    return data_transfer->CopyTensor(src, dst, exec_queue_id);
+  }
+
+  return ORT_MAKE_STATUS(ONNXRUNTIME,
+                         FAIL,
+                         "There's no data transfer registered for copying tensors from ",
+                         src.Location().device.ToString(),
+                         " to ",
+                         dst.Location().device.ToString());
+}
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/data_transfer_manager.h b/onnxruntime/core/framework/data_transfer_manager.h
new file mode 100644
index 0000000000..35c85a0ba1
--- /dev/null
+++ b/onnxruntime/core/framework/data_transfer_manager.h
@@ -0,0 +1,32 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/common/status.h"
+#include "core/framework/data_transfer.h"
+#include "core/framework/tensor.h"
+
+namespace onnxruntime {
+
+// Data transfer manager, which has all functions registered to copy tensors with different location.
+// It's not thread-safe.
+class DataTransferManager {
+ public:
+  DataTransferManager() = default;
+  //static DataTransferManager& Instance();
+
+  common::Status RegisterDataTransfer(std::unique_ptr<IDataTransfer> data_transfer);
+
+  const IDataTransfer* GetDataTransfer(const OrtDevice& src_device, const OrtDevice& dst_device) const;
+
+  common::Status CopyTensor(const Tensor& src, Tensor& dst) const;
+  common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const;
+
+ private:
+  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(DataTransferManager);
+
+  // It's assumed that data transfers in this array have no overlap in terms of copying functionality.
+  std::vector<std::unique_ptr<IDataTransfer>> datatransfers_;
+};
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/execution_provider.cc b/onnxruntime/core/framework/execution_provider.cc
index 02d8949675..c86f3e225e 100644
--- a/onnxruntime/core/framework/execution_provider.cc
+++ b/onnxruntime/core/framework/execution_provider.cc
@@ -43,14 +43,6 @@ IExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
   return result;
 }
 
-common::Status IExecutionProvider::CopyTensor(const Tensor& src,
-                                              Tensor& dst,
-                                              int exec_queue_id) const {
-  // execution provider may override this to support different exec queues
-  ORT_ENFORCE(exec_queue_id == 0);
-  return CopyTensor(src, dst);
-}
-
 common::Status IExecutionProvider::Sync() const { return Status::OK(); };
 
 common::Status IExecutionProvider::OnRunStart() { return Status::OK(); }
diff --git a/onnxruntime/core/framework/kernel_registry.cc b/onnxruntime/core/framework/kernel_registry.cc
index c73cfb999d..cfc545d367 100644
--- a/onnxruntime/core/framework/kernel_registry.cc
+++ b/onnxruntime/core/framework/kernel_registry.cc
@@ -240,9 +240,12 @@ Status KernelRegistry::Register(KernelCreateInfo&& create_info) {
   return Status::OK();
 }
 
-Status KernelRegistry::TryCreateKernel(const onnxruntime::Node& node, const IExecutionProvider& execution_provider,
+Status KernelRegistry::TryCreateKernel(const onnxruntime::Node& node,
+                                       const IExecutionProvider& execution_provider,
                                        const std::unordered_map<int, OrtValue>& constant_initialized_tensors,
-                                       const OrtValueNameIdxMap& ort_value_name_idx_map, const FuncManager& funcs_mgr,
+                                       const OrtValueNameIdxMap& ort_value_name_idx_map,
+                                       const FuncManager& funcs_mgr,
+                                       const DataTransferManager& data_transfer_mgr,
                                        /*out*/ std::unique_ptr<OpKernel>& op_kernel) const {
   const KernelCreateInfo* kernel_create_info = TryFindKernel(node, execution_provider.Type());
 
@@ -250,8 +253,13 @@ Status KernelRegistry::TryCreateKernel(const onnxruntime::Node& node, const IExe
     return Status(ONNXRUNTIME, FAIL, "Failed to find kernel for " + node.OpType());
   }
 
-  OpKernelInfo kernel_info(node, *kernel_create_info->kernel_def, execution_provider, constant_initialized_tensors,
-                           ort_value_name_idx_map, funcs_mgr);
+  OpKernelInfo kernel_info(node,
+                           *kernel_create_info->kernel_def,
+                           execution_provider,
+                           constant_initialized_tensors,
+                           ort_value_name_idx_map,
+                           funcs_mgr,
+                           data_transfer_mgr);
   op_kernel.reset(kernel_create_info->kernel_create_func(kernel_info));
   return Status::OK();
 }
diff --git a/onnxruntime/core/framework/kernel_registry_manager.cc b/onnxruntime/core/framework/kernel_registry_manager.cc
index 8722f975ab..5fe803b368 100644
--- a/onnxruntime/core/framework/kernel_registry_manager.cc
+++ b/onnxruntime/core/framework/kernel_registry_manager.cc
@@ -23,7 +23,7 @@ Status KernelRegistryManager::CreateKernel(const onnxruntime::Node& node,
   {
     for (auto& registry : custom_kernel_registries_) {
       status = registry->TryCreateKernel(node, execution_provider, session_state.GetConstantInitializedTensors(),
-                                         session_state.GetOrtValueNameIdxMap(), session_state.GetFuncMgr(), op_kernel);
+                                         session_state.GetOrtValueNameIdxMap(), session_state.GetFuncMgr(), session_state.GetDataTransferMgr(), op_kernel);
       if (status.IsOK()) {
         return status;
       }
@@ -35,7 +35,7 @@ Status KernelRegistryManager::CreateKernel(const onnxruntime::Node& node,
   if (iter != provider_type_to_registry_.end()) p = iter->second.get();
   if (p != nullptr) {
     status = p->TryCreateKernel(node, execution_provider, session_state.GetConstantInitializedTensors(),
-                                session_state.GetOrtValueNameIdxMap(), session_state.GetFuncMgr(), op_kernel);
+                                session_state.GetOrtValueNameIdxMap(), session_state.GetFuncMgr(), session_state.GetDataTransferMgr(), op_kernel);
     if (status.IsOK()) {
       return status;
     }
diff --git a/onnxruntime/core/framework/memcpy.cc b/onnxruntime/core/framework/memcpy.cc
index a7847c2f07..3c63cf233d 100644
--- a/onnxruntime/core/framework/memcpy.cc
+++ b/onnxruntime/core/framework/memcpy.cc
@@ -1,6 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include "core/framework/data_transfer_manager.h"
 #include "memcpy.h"
 using namespace ONNX_NAMESPACE;
 namespace onnxruntime {
@@ -13,7 +14,7 @@ Memcpy::Memcpy(const OpKernelInfo& info)
 Status Memcpy::Compute(OpKernelContext* ctx) const {
   const auto* X = ctx->Input<Tensor>(0);
   Tensor* Y = ctx->Output(0, X->Shape());
-  Status retval = provider_->CopyTensor(*X, *Y, Info().GetKernelDef().ExecQueueId());
+  Status retval = Info().GetDataTransferManager().CopyTensor(*X, *Y, Info().GetKernelDef().ExecQueueId());
   return retval;
 }
 
diff --git a/onnxruntime/core/framework/op_kernel_info.cc b/onnxruntime/core/framework/op_kernel_info.cc
index 41fe6e1ca5..bb3c101e36 100644
--- a/onnxruntime/core/framework/op_kernel_info.cc
+++ b/onnxruntime/core/framework/op_kernel_info.cc
@@ -8,10 +8,13 @@
 
 namespace onnxruntime {
 
-OpKernelInfo::OpKernelInfo(const onnxruntime::Node& node, const KernelDef& kernel_def,
+OpKernelInfo::OpKernelInfo(const onnxruntime::Node& node,
+                           const KernelDef& kernel_def,
                            const IExecutionProvider& execution_provider,
                            const std::unordered_map<int, OrtValue>& constant_initialized_tensors,
-                           const OrtValueNameIdxMap& ort_value_name_idx_map, const FuncManager& funcs_mgr)
+                           const OrtValueNameIdxMap& ort_value_name_idx_map,
+                           const FuncManager& funcs_mgr,
+                           const DataTransferManager& data_transfer_mgr)
     : OpNodeProtoHelper(&proto_helper_context_),
       node_(node),
       kernel_def_(kernel_def),
@@ -19,11 +22,12 @@ OpKernelInfo::OpKernelInfo(const onnxruntime::Node& node, const KernelDef& kerne
       constant_initialized_tensors_(constant_initialized_tensors),
       ort_value_name_idx_map_(ort_value_name_idx_map),
       funcs_mgr_(funcs_mgr),
+      data_transfer_mgr_(data_transfer_mgr),
       proto_helper_context_(node) {}
 
 OpKernelInfo::OpKernelInfo(const OpKernelInfo& other)
     : OpKernelInfo(other.node_, other.kernel_def_, *other.execution_provider_, other.constant_initialized_tensors_,
-                   other.ort_value_name_idx_map_, other.funcs_mgr_) {}
+                   other.ort_value_name_idx_map_, other.funcs_mgr_, other.data_transfer_mgr_) {}
 
 const OrtAllocatorInfo& OpKernelInfo::GetAllocatorInfo(int device_id, OrtMemType mem_type) const {
   AllocatorPtr alloc = GetAllocator(device_id, mem_type);
@@ -43,6 +47,10 @@ const IExecutionProvider* OpKernelInfo::GetExecutionProvider() const noexcept {
   return execution_provider_;
 }
 
+const DataTransferManager& OpKernelInfo::GetDataTransferManager() const noexcept {
+  return data_transfer_mgr_;
+}
+
 const onnxruntime::Node& OpKernelInfo::node() const noexcept {
   return node_;
 }
diff --git a/onnxruntime/core/framework/session_state.h b/onnxruntime/core/framework/session_state.h
index 723304b997..838fab27bb 100644
--- a/onnxruntime/core/framework/session_state.h
+++ b/onnxruntime/core/framework/session_state.h
@@ -14,6 +14,7 @@
 #include "core/common/logging/logging.h"
 #include "core/common/profiler.h"
 #include "core/framework/allocation_planner.h"
+#include "core/framework/data_transfer_manager.h"
 #include "core/framework/execution_providers.h"
 #include "core/framework/feeds_fetches_manager.h"
 #include "core/framework/kernel_registry_manager.h"
@@ -182,8 +183,10 @@ class SessionState {
   const FuncManager& GetFuncMgr() const { return fused_funcs_mgr_; }
   FuncManager& GetMutableFuncMgr() { return fused_funcs_mgr_; }
 
-  std::vector<BufferUniquePtr>& GetMutableWeightsBuffers() { return weights_buffers_; }
+  const DataTransferManager& GetDataTransferMgr() const { return *data_transfer_mgr_; }
+  void SetDataTransferMgr(const DataTransferManager* data_transfer_mgr) { data_transfer_mgr_ = data_transfer_mgr; }
 
+  std::vector<BufferUniquePtr>& GetMutableWeightsBuffers() { return weights_buffers_; }
   void CalculateNodeIndexInfo();
   const NodeIndexInfo& GetNodeIndexInfo() const;
 
@@ -232,6 +235,7 @@ class SessionState {
 
   bool export_fused_dll_ = false;
   FuncManager fused_funcs_mgr_;
+  const DataTransferManager* data_transfer_mgr_;
 
   std::unique_ptr<NodeIndexInfo> node_index_info_;
   std::multimap<int, std::unique_ptr<FeedsFetchesManager>> cached_feeds_fetches_managers_;
diff --git a/onnxruntime/core/framework/session_state_initializer.cc b/onnxruntime/core/framework/session_state_initializer.cc
index 7c8d7f2443..3911af9bda 100644
--- a/onnxruntime/core/framework/session_state_initializer.cc
+++ b/onnxruntime/core/framework/session_state_initializer.cc
@@ -12,6 +12,7 @@
 #include "core/common/logging/logging.h"
 
 #include "core/graph/graph_viewer.h"
+#include "core/framework/data_transfer_manager.h"
 #include "core/graph/graph_utils.h"
 #include "core/framework/graph_partitioner.h"
 #include "core/framework/ml_value.h"
@@ -36,7 +37,8 @@ static common::Status SaveInitializedTensors(const Env& env, const std::basic_st
                                              const onnxruntime::Graph& graph, const ExecutionProviders& exec_providers,
                                              const OrtValueNameIdxMap& ort_value_name_idx_map,
                                              ITensorAllocator* planner, const T& save_tensor_func,
-                                             const logging::Logger& logger);
+                                             const logging::Logger& logger,
+                                             const DataTransferManager& data_transfer_mgr);
 
 static common::Status SaveKernels(const ExecutionProviders& execution_providers,
                                   SessionState& session_state,
@@ -111,8 +113,7 @@ common::Status SessionStateInitializer::InitializeAndSave(
       [this](int idx, const OrtValue& value, const OrtCallback& d, bool constant) -> Status {
         return session_state_.AddInitializedTensor(idx, value, &d, constant);
       },
-      logger_));
-
+      logger_, session_state_.GetDataTransferMgr()));
   // remove weights from the graph now to save memory but in many cases it won't save memory, if the tensor was
   // preallocated with the some other tensors in a single 'allocate' call, which is very common.
   // TODO: make it better
@@ -180,7 +181,8 @@ common::Status SaveMLValueNameIndexMapping(const GraphViewer& graph_viewer, OrtV
 static common::Status DeserializeTensorProto(const Env& env, const std::basic_string<PATH_CHAR_TYPE>& proto_path,
                                              const ONNX_NAMESPACE::TensorProto& tensor_proto, const MemBuffer& m,
                                              const ExecutionProviders& exec_providers, OrtValue& ort_value,
-                                             OrtCallback& deleter) {
+                                             OrtCallback& deleter,
+                                             const DataTransferManager& data_transfer_mgr) {
   const OrtAllocatorInfo& alloc_info = m.GetAllocInfo();
   if (strcmp(alloc_info.name, CPU) == 0 || alloc_info.mem_type == OrtMemTypeCPUOutput) {
     // deserialize directly to CPU tensor
@@ -219,7 +221,7 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st
   p_tensor = std::make_unique<Tensor>(p_deserialize_tensor.DataType(), p_deserialize_tensor.Shape(), m.GetBuffer(),
                                       m.GetAllocInfo());
   // TODO: does this function work for string tensor?
-  Status copy_status = provider->CopyTensor(p_deserialize_tensor, *p_tensor);
+  Status copy_status = data_transfer_mgr.CopyTensor(p_deserialize_tensor, *p_tensor);
   if (d.f) d.f(d.param);
   if (!copy_status.IsOK()) {
     if (copy_status.ErrorMessage().empty()) {
@@ -239,7 +241,8 @@ template <typename T>
 common::Status SaveInitializedTensors(const Env& env, const std::basic_string<PATH_CHAR_TYPE>& graph_loc,
                                       const Graph& graph, const ExecutionProviders& exec_providers,
                                       const OrtValueNameIdxMap& ort_value_name_idx_map, ITensorAllocator* planner,
-                                      const T& save_tensor_func, const logging::Logger& logger) {
+                                      const T& save_tensor_func, const logging::Logger& logger,
+                                      const DataTransferManager& data_transfer_mgr) {
   LOGS(logger, INFO) << "Saving initialized tensors.";
   ORT_ENFORCE(ort_value_name_idx_map.MaxIdx() > 0, "OrtValue indexes should have been populated.");
 
@@ -272,7 +275,7 @@ common::Status SaveInitializedTensors(const Env& env, const std::basic_string<PA
     ORT_ENFORCE(m->GetBuffer() != nullptr || m->GetLen() == 0);
 #endif
     OrtValue ort_value;
-    Status st = DeserializeTensorProto(env, graph_loc, tensor_proto, *m, exec_providers, ort_value, deleter);
+    Status st = DeserializeTensorProto(env, graph_loc, tensor_proto, *m, exec_providers, ort_value, deleter, data_transfer_mgr);
     if (!st.IsOK()) {
       std::ostringstream oss;
       oss << "Deserialize tensor " << name << " failed." << st.ErrorMessage();
diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc
index 65a64a3c78..82b160a704 100644
--- a/onnxruntime/core/framework/utils.cc
+++ b/onnxruntime/core/framework/utils.cc
@@ -6,7 +6,7 @@
 #include <iomanip>
 
 #include "core/graph/graph_viewer.h"
-
+#include "core/framework/data_transfer_manager.h"
 #include "core/framework/execution_frame.h"
 #include "core/framework/execution_providers.h"
 #include "core/framework/feeds_fetches_manager.h"
@@ -58,7 +58,9 @@ const std::string& GetNodeInputProviderType(const SessionState::NodeInfo& info)
   return required_provider_type;
 }
 
-static Status CopyMLValue(const FeedsFetchesManager::MLValueCopyInfo& copy_info, const OrtValue& source_mlvalue,
+static Status CopyMLValue(const DataTransferManager& data_transfer_mgr,
+                          const FeedsFetchesManager::MLValueCopyInfo& copy_info,
+                          const OrtValue& source_mlvalue,
                           OrtValue& target_mlvalue) {
   if (copy_info.copy_provider == nullptr) {
     target_mlvalue = source_mlvalue;
@@ -72,7 +74,7 @@ static Status CopyMLValue(const FeedsFetchesManager::MLValueCopyInfo& copy_info,
 
     Tensor* p_output_tensor = target_mlvalue.GetMutable<Tensor>();
 
-    ORT_RETURN_IF_ERROR(copy_info.copy_provider->CopyTensor(source_tensor, *p_output_tensor));
+    ORT_RETURN_IF_ERROR(data_transfer_mgr.CopyTensor(source_tensor, *p_output_tensor));
   }
 
   return Status::OK();
@@ -150,7 +152,7 @@ common::Status CopyOneInputAcrossDevices(const SessionState& session_state, cons
     copy_info.allocation_provider = required_provider;
     copy_info.copy_provider = p_copy_provider;
 
-    ORT_RETURN_IF_ERROR(CopyMLValue(copy_info, orig_mlvalue, new_mlvalue));
+    ORT_RETURN_IF_ERROR(CopyMLValue(session_state.GetDataTransferMgr(), copy_info, orig_mlvalue, new_mlvalue));
 
     needed_copy = true;
 
@@ -205,14 +207,15 @@ static common::Status CopyInputsAcrossDevices(const SessionState& session_state,
 // copies inputs across devices only if required using cached copy_info
 static common::Status CachedCopyInputsAcrossDevices(
     const std::vector<OrtValue>& orig_feeds, std::vector<OrtValue>& new_feeds,
-    const std::vector<FeedsFetchesManager::MLValueCopyInfo>& copy_info) {
+    const std::vector<FeedsFetchesManager::MLValueCopyInfo>& copy_info,
+    const DataTransferManager& data_transfer_mgr) {
   size_t num_feeds = orig_feeds.size();
   ORT_ENFORCE(copy_info.size() == num_feeds);
 
   new_feeds.resize(num_feeds);
 
   for (size_t idx = 0; idx < num_feeds; ++idx) {
-    ORT_RETURN_IF_ERROR(CopyMLValue(copy_info[idx], orig_feeds[idx], new_feeds[idx]));
+    ORT_RETURN_IF_ERROR(CopyMLValue(data_transfer_mgr, copy_info[idx], orig_feeds[idx], new_feeds[idx]));
   }
 
   return Status::OK();
@@ -379,7 +382,7 @@ static common::Status CopyOutputsAcrossDevices(const SessionState& session_state
 
     const int device_id = 0;  // TODO: As per comment in the copy input code, make this configurable.
     FeedsFetchesManager::MLValueCopyInfo copy_info{device_id, p_output_provider, p_copy_provider};
-    ORT_RETURN_IF_ERROR(CopyMLValue(copy_info, fetched_mlvalue, output_mlvalue));
+    ORT_RETURN_IF_ERROR(CopyMLValue(session_state.GetDataTransferMgr(), copy_info, fetched_mlvalue, output_mlvalue));
 
     if (copiers) {
       (*copiers)[idx] = copy_info;
@@ -391,7 +394,8 @@ static common::Status CopyOutputsAcrossDevices(const SessionState& session_state
 
 static common::Status CachedCopyOutputsAcrossDevices(
     const std::vector<OrtValue>& fetches, std::vector<OrtValue>& user_fetches,
-    const std::vector<FeedsFetchesManager::MLValueCopyInfo>& copy_info) {
+    const std::vector<FeedsFetchesManager::MLValueCopyInfo>& copy_info,
+    const DataTransferManager& data_transfer_mgr) {
   auto num_outputs = fetches.size();
 
   // internal logic error if these are mismatched
@@ -399,7 +403,7 @@ static common::Status CachedCopyOutputsAcrossDevices(
 
   // used the cached copy logic if available
   for (size_t idx = 0; idx < num_outputs; ++idx) {
-    ORT_RETURN_IF_ERROR(CopyMLValue(copy_info[idx], fetches[idx], user_fetches[idx]));
+    ORT_RETURN_IF_ERROR(CopyMLValue(data_transfer_mgr, copy_info[idx], fetches[idx], user_fetches[idx]));
   }
 
   return Status::OK();
@@ -456,7 +460,8 @@ common::Status ExecuteGraphWithCachedInfo(
     // Copy inputs
     if (device_copy_checks.input_copy_needed == DeviceCopyCheck::Copy) {
       ORT_RETURN_IF_ERROR(CachedCopyInputsAcrossDevices(feeds, device_feeds,
-                                                        feeds_fetches_manager.GetFeedsDeviceCopiers()));
+                                                        feeds_fetches_manager.GetFeedsDeviceCopiers(),
+                                                        session_state.GetDataTransferMgr()));
       p_feeds = &device_feeds;
     }
 
@@ -480,7 +485,8 @@ common::Status ExecuteGraphWithCachedInfo(
 
     if (device_copy_checks.output_copy_needed == DeviceCopyCheck::Copy) {
       ORT_RETURN_IF_ERROR(CachedCopyOutputsAcrossDevices(*p_fetches, fetches,
-                                                         feeds_fetches_manager.GetFetchesDeviceCopiers()));
+                                                         feeds_fetches_manager.GetFetchesDeviceCopiers(),
+                                                         session_state.GetDataTransferMgr()));
     }
   }
 
diff --git a/onnxruntime/core/optimizer/optimizer_execution_frame.cc b/onnxruntime/core/optimizer/optimizer_execution_frame.cc
index 57d5bcbb7b..cd64d23982 100644
--- a/onnxruntime/core/optimizer/optimizer_execution_frame.cc
+++ b/onnxruntime/core/optimizer/optimizer_execution_frame.cc
@@ -3,6 +3,7 @@
 #include "core/common/status.h"
 #include "core/common/logging/logging.h"
 #include "core/common/logging/macros.h"
+#include "core/framework/data_transfer_manager.h"
 #include "core/framework/tensorprotoutils.h"
 #include "core/framework/data_types.h"
 #include "core/framework/mldata_type_utils.h"
@@ -22,6 +23,8 @@ OptimizerExecutionFrame::Info::Info(const std::vector<const Node*>& nodes,
   allocator_ptr_ = cpu_execution_provider_->GetAllocator(device_id_, mem_type_);
   ORT_ENFORCE(allocator_ptr_ != nullptr, "Failed to get allocator for optimizer");
 
+  data_transfer_mgr_.RegisterDataTransfer(std::make_unique<CPUDataTransfer>());
+
   // Create MLValues related maps
   auto initialize_maps = [this, &initialized_tensor_set](const NodeArg& arg, size_t /*index*/) -> Status {
     int idx = ort_value_name_idx_map_.Add(arg.Name());
@@ -63,7 +66,7 @@ OptimizerExecutionFrame::Info::Info(const std::vector<const Node*>& nodes,
     std::unique_ptr<OpKernel> op_kernel;
     std::shared_ptr<KernelRegistry> kernel_registry = cpu_execution_provider_->GetKernelRegistry();
     auto status = kernel_registry->TryCreateKernel(*node, *cpu_execution_provider_, initializers_,
-                                                   ort_value_name_idx_map_, FuncManager(), op_kernel);
+                                                   ort_value_name_idx_map_, FuncManager(), data_transfer_mgr_, op_kernel);
     kernels_[node->Index()] = std::move(op_kernel);
   }
 }
diff --git a/onnxruntime/core/optimizer/optimizer_execution_frame.h b/onnxruntime/core/optimizer/optimizer_execution_frame.h
index 7d47ff50d9..41f5e85215 100644
--- a/onnxruntime/core/optimizer/optimizer_execution_frame.h
+++ b/onnxruntime/core/optimizer/optimizer_execution_frame.h
@@ -7,12 +7,14 @@
 
 #include "core/graph/graph.h"
 #include "core/providers/cpu/cpu_execution_provider.h"
+#include "core/framework/data_transfer_manager.h"
 #include "core/framework/execution_frame.h"
 #include "core/framework/ort_value_name_idx_map.h"
 #include "core/framework/ml_value.h"
 #include "core/common/callback.h"
 
 namespace onnxruntime {
+class DataTransferManager;
 
 class OptimizerExecutionFrame final : public IExecutionFrame {
  public:
@@ -54,7 +56,7 @@ class OptimizerExecutionFrame final : public IExecutionFrame {
     const int device_id_{0};
     const OrtMemType mem_type_{OrtMemTypeDefault};
     AllocatorPtr allocator_ptr_;
-
+    DataTransferManager data_transfer_mgr_;
     // MLValues for optimizer
     OrtValueNameIdxMap ort_value_name_idx_map_;
     std::unordered_map<int, const NodeArg*> ort_value_idx_nodearg_map_;
diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.h b/onnxruntime/core/providers/cpu/cpu_execution_provider.h
index b72244510e..5dfdeb0c2c 100644
--- a/onnxruntime/core/providers/cpu/cpu_execution_provider.h
+++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.h
@@ -46,10 +46,6 @@ class CPUExecutionProvider : public IExecutionProvider {
 #endif
   }
 
-  Status CopyTensor(const Tensor&, Tensor&) const override {
-    return Status(common::ONNXRUNTIME, common::FAIL, "Shouldn't reach here. CPUExecutionProvider doesn't support CopyTensor");
-  }
-
   std::shared_ptr<KernelRegistry> GetKernelRegistry() const override;
 
 
diff --git a/onnxruntime/core/providers/cpu/cpu_provider_factory.cc b/onnxruntime/core/providers/cpu/cpu_provider_factory.cc
index 297f188ae9..ee1f437f26 100644
--- a/onnxruntime/core/providers/cpu/cpu_provider_factory.cc
+++ b/onnxruntime/core/providers/cpu/cpu_provider_factory.cc
@@ -35,5 +35,6 @@ ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_CPU, _In_ OrtSessio
 }
 
 ORT_API_STATUS_IMPL(OrtCreateCpuAllocatorInfo, enum OrtAllocatorType type, enum OrtMemType mem_type, _Out_ OrtAllocatorInfo** out) {
-  return OrtCreateAllocatorInfo(onnxruntime::CPU, type, 0, mem_type, out);
+  *out = new OrtAllocatorInfo(onnxruntime::CPU, type, OrtDevice(), 0, mem_type);
+  return nullptr;
 }
diff --git a/onnxruntime/core/providers/cuda/cuda_allocator.cc b/onnxruntime/core/providers/cuda/cuda_allocator.cc
index b94d9cba9f..44cbbd75d0 100644
--- a/onnxruntime/core/providers/cuda/cuda_allocator.cc
+++ b/onnxruntime/core/providers/cuda/cuda_allocator.cc
@@ -1,17 +1,19 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "cuda_common.h"
 #include "cuda_allocator.h"
+#include "cuda_common.h"
 #include "core/framework/allocatormgr.h"
 #include "core/framework/session_state.h"
 #include "cuda_fence.h"
+#include "gpu_data_transfer.h"
 
 namespace onnxruntime {
 
-static const CUDAExecutionProvider* GetCUDAExecutionProvider(const SessionState* session_state) {
-  return dynamic_cast<const CUDAExecutionProvider*>(
-      session_state->GetExecutionProviders().Get(onnxruntime::kCudaExecutionProvider));
+static const GPUDataTransfer* GetGPUDataTransfer(const SessionState* session_state) {
+  OrtDevice gpu_device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0);
+  OrtDevice cpu_device;
+  return dynamic_cast<const GPUDataTransfer*>(session_state->GetDataTransferMgr().GetDataTransfer(gpu_device, cpu_device));
 }
 
 void CUDAAllocator::CheckDevice() const {
@@ -43,7 +45,7 @@ const OrtAllocatorInfo& CUDAAllocator::Info() const {
 }
 
 FencePtr CUDAAllocator::CreateFence(const SessionState* session_state) {
-  return std::make_shared<CUDAFence>(GetCUDAExecutionProvider(session_state));
+  return std::make_shared<CUDAFence>(GetGPUDataTransfer(session_state));
 }
 
 void* CUDAPinnedAllocator::Alloc(size_t size) {
@@ -59,12 +61,12 @@ void CUDAPinnedAllocator::Free(void* p) {
 }
 
 const OrtAllocatorInfo& CUDAPinnedAllocator::Info() const {
-  static constexpr OrtAllocatorInfo cuda_allocator_info(CUDA_PINNED, OrtDeviceAllocator, 0, OrtMemTypeCPUOutput);
+  static constexpr OrtAllocatorInfo cuda_allocator_info(CUDA_PINNED, OrtDeviceAllocator, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, 0), 0, OrtMemTypeCPUOutput);
   return cuda_allocator_info;
 }
 
 FencePtr CUDAPinnedAllocator::CreateFence(const SessionState* session_state) {
-  return std::make_shared<CUDAFence>(GetCUDAExecutionProvider(session_state));
+  return std::make_shared<CUDAFence>(GetGPUDataTransfer(session_state));
 }
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/cuda_allocator.h b/onnxruntime/core/providers/cuda/cuda_allocator.h
index 951b27bccb..06f6caa784 100644
--- a/onnxruntime/core/providers/cuda/cuda_allocator.h
+++ b/onnxruntime/core/providers/cuda/cuda_allocator.h
@@ -6,12 +6,10 @@
 #include "core/framework/allocator.h"
 
 namespace onnxruntime {
-constexpr const char* CUDA = "Cuda";
-constexpr const char* CUDA_PINNED = "CudaPinned";
 
 class CUDAAllocator : public IDeviceAllocator {
  public:
-  CUDAAllocator(int device_id) : info_(CUDA, OrtAllocatorType::OrtDeviceAllocator, device_id, OrtMemTypeDefault) {}
+  CUDAAllocator(int device_id) : info_(CUDA, OrtAllocatorType::OrtDeviceAllocator, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, device_id), device_id, OrtMemTypeDefault) {}
   virtual void* Alloc(size_t size) override;
   virtual void Free(void* p) override;
   virtual const OrtAllocatorInfo& Info() const override;
diff --git a/onnxruntime/core/providers/cuda/cuda_common.h b/onnxruntime/core/providers/cuda/cuda_common.h
index 9ce0ffbfb9..ce271297e4 100644
--- a/onnxruntime/core/providers/cuda/cuda_common.h
+++ b/onnxruntime/core/providers/cuda/cuda_common.h
@@ -4,6 +4,7 @@
 #pragma once
 #include "cuda_pch.h"
 #include "core/common/status.h"
+#include "core/framework/data_transfer_manager.h"
 #include "core/framework/op_kernel.h"
 #include "core/graph/graph_viewer.h"
 #include "shared_inc/cuda_call.h"
@@ -137,7 +138,7 @@ class CudaKernel : public OpKernel {
   }
 
   inline Status CopyTensor(const Tensor& src, Tensor& dst) const {
-    return provider_->CopyTensor(src, dst);
+    return Info().GetDataTransferManager().CopyTensor(src, dst);
   }
 
   inline int GetDeviceId() const { return provider_->GetDeviceId(); }
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index cd0d1c5b7d..bd7544396c 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -199,52 +199,6 @@ Status CUDAExecutionProvider::OnRunEnd() {
   return Status::OK();
 }
 
-Status CUDAExecutionProvider::CopyTensor(const Tensor& src, Tensor& dst) const {
-  return CopyTensor(src, dst, kCudaStreamDefault);
-}
-
-Status CUDAExecutionProvider::CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const {
-  if (src.Shape().Size() != dst.Shape().Size()) {
-    return Status(ONNXRUNTIME, FAIL, "Tensor size mismatch");
-  }
-
-  if (strcmp(src.Location().name, CUDA) != 0 && strcmp(src.Location().name, CUDA_PINNED) != 0 &&
-      strcmp(dst.Location().name, CUDA) != 0 && strcmp(dst.Location().name, CUDA_PINNED) != 0) {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported tensor location: src_location is: ", src.Location().name, " and dst_location is: ", dst.Location().name);
-  }
-
-  size_t bytes = src.Size();
-
-  const void* src_data = src.DataRaw();
-  void* dst_data = dst.MutableDataRaw();
-
-  if (strcmp(dst.Location().name, CUDA) == 0) {
-    if (strcmp(src.Location().name, CUDA_PINNED) == 0) {
-      // copy from pinned memory to GPU, this is non-blocking
-      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, streams_[exec_queue_id]));
-    } else if (strcmp(src.Location().name, CUDA) == 0) {
-      // copying between GPU, this is non-blocking
-      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice, streams_[kCudaStreamDefault]));
-    } else {
-      // copy from other CPU memory to GPU, this is blocking
-      CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyHostToDevice));
-    }
-  } else if (strcmp(src.Location().name, CUDA) == 0) {
-    if (strcmp(dst.Location().name, CUDA_PINNED) == 0) {
-      // copying from GPU to pinned memory, this is non-blocking
-      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, streams_[exec_queue_id]));
-    } else {
-      // copying from GPU to CPU memory, this is blocking
-      CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToHost));
-    }
-  } else {
-    // copying between cpu memory
-    memcpy(dst_data, src_data, bytes);
-  }
-
-  return Status::OK();
-}
-
 namespace cuda {
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MemcpyFromHost);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MemcpyToHost);
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h
index b42ffc8854..ed3a509f85 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h
@@ -7,6 +7,7 @@
 #include "core/graph/constants.h"
 #include "core/framework/allocatormgr.h"
 #include "core/framework/execution_provider.h"
+#include "core/providers/cuda/gpu_data_transfer.h"
 #include "shared_inc/cuda_utils.h"
 #include <deque>
 
@@ -17,13 +18,6 @@ struct CUDAExecutionProviderInfo {
   int device_id{0};
 };
 
-enum CUDAStreamType : int {
-  kCudaStreamDefault = 0,
-  kCudaStreamCopyIn,
-  kCudaStreamCopyOut,
-  kTotalCudaStreams,
-};
-
 // Logical device representation.
 class CUDAExecutionProvider : public IExecutionProvider {
  public:
@@ -38,10 +32,6 @@ class CUDAExecutionProvider : public IExecutionProvider {
 
   Status OnRunEnd() override;
 
-  Status CopyTensor(const Tensor& src, Tensor& dst) const override;
-
-  Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const override;
-
   cublasHandle_t PerThreadCublasHandle() {
     return GetPerThreadContext().CublasHandle();
   }
diff --git a/onnxruntime/core/providers/cuda/cuda_fence.cc b/onnxruntime/core/providers/cuda/cuda_fence.cc
index ce368a7daa..463a560ee8 100644
--- a/onnxruntime/core/providers/cuda/cuda_fence.cc
+++ b/onnxruntime/core/providers/cuda/cuda_fence.cc
@@ -3,10 +3,11 @@
 
 #include "cuda_common.h"
 #include "cuda_fence.h"
+#include "gpu_data_transfer.h"
 
 namespace onnxruntime {
 
-CUDAFence::CUDAFence(const CUDAExecutionProvider* provider) : provider_(provider) {
+CUDAFence::CUDAFence(const GPUDataTransfer* data_transfer) : data_transfer_(data_transfer) {
   // NOTE: cudaEventBlockingSync may leads to longer wait time because of thread yield/switching in kernel
   // if lower CPU usage is more important than latency, we should use this flag to avoid spin-loop in WaitOnCPU
   int event_flags = /*cudaEventBlockingSync |*/ cudaEventDisableTiming;
@@ -22,7 +23,7 @@ CUDAFence::~CUDAFence() {
 void CUDAFence::BeforeUsingAsInput(onnxruntime::ProviderType provider_type, int async_queue_id) {
   if (provider_type == onnxruntime::kCudaExecutionProvider) {
     // sync in GPU, the call is non-blocking on CPU
-    CUDA_CALL_THROW(cudaStreamWaitEvent(provider_->GetStream(async_queue_id), write_event_, 0));
+    CUDA_CALL_THROW(cudaStreamWaitEvent(data_transfer_->GetStream(async_queue_id), write_event_, 0));
   } else {
     // sync on CPU for all other providers, this is blocking
     CUDA_CALL_THROW(cudaEventSynchronize(write_event_));
@@ -32,7 +33,7 @@ void CUDAFence::BeforeUsingAsInput(onnxruntime::ProviderType provider_type, int
 void CUDAFence::BeforeUsingAsOutput(onnxruntime::ProviderType provider_type, int queue_id) {
   if (provider_type == onnxruntime::kCudaExecutionProvider) {
     // sync in GPU, the call is non-blocking on CPU
-    cudaStream_t stream = provider_->GetStream(queue_id);
+    cudaStream_t stream = data_transfer_->GetStream(queue_id);
     CUDA_CALL_THROW(cudaStreamWaitEvent(stream, read_event_, 0));
     CUDA_CALL_THROW(cudaStreamWaitEvent(stream, write_event_, 0));
   } else {
@@ -49,13 +50,13 @@ bool CUDAFence::CanRelease() {
 
 void CUDAFence::AfterUsedAsInput(int queue_id) {
   // update read fence
-  cudaStream_t stream = provider_->GetStream(queue_id);
+  cudaStream_t stream = data_transfer_->GetStream(queue_id);
   CUDA_CALL_THROW(cudaEventRecord(read_event_, stream));
 }
 
 void CUDAFence::AfterUsedAsOutput(int queue_id) {
   // update write fence
-  cudaStream_t stream = provider_->GetStream(queue_id);
+  cudaStream_t stream = data_transfer_->GetStream(queue_id);
   CUDA_CALL_THROW(cudaEventRecord(write_event_, stream));
 }
 
diff --git a/onnxruntime/core/providers/cuda/cuda_fence.h b/onnxruntime/core/providers/cuda/cuda_fence.h
index c52c55041f..e14883d278 100644
--- a/onnxruntime/core/providers/cuda/cuda_fence.h
+++ b/onnxruntime/core/providers/cuda/cuda_fence.h
@@ -3,12 +3,14 @@
 
 #pragma once
 #include "core/framework/tensor.h"
-#include "cuda_execution_provider.h"
+#include "core/graph/basic_types.h"
+
 namespace onnxruntime {
+class GPUDataTransfer;
 
 class CUDAFence : public IFence {
  public:
-  CUDAFence(const CUDAExecutionProvider* provider);
+  CUDAFence(const GPUDataTransfer* data_transfer);
   virtual ~CUDAFence();
   virtual void BeforeUsingAsInput(onnxruntime::ProviderType provider_type, int queue_id) override;
   virtual void BeforeUsingAsOutput(onnxruntime::ProviderType provider_type, int queue_id) override;
@@ -19,7 +21,7 @@ class CUDAFence : public IFence {
  private:
   cudaEvent_t read_event_;
   cudaEvent_t write_event_;
-  const CUDAExecutionProvider* provider_;
+  const GPUDataTransfer* data_transfer_;
 };
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/gpu_data_transfer.cc b/onnxruntime/core/providers/cuda/gpu_data_transfer.cc
new file mode 100644
index 0000000000..b80a1f8cab
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/gpu_data_transfer.cc
@@ -0,0 +1,55 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/cuda/gpu_data_transfer.h"
+#include "cuda_common.h"
+
+namespace onnxruntime {
+GPUDataTransfer::GPUDataTransfer() {
+  // create streams, default is nullptr
+  streams_[kCudaStreamDefault] = nullptr;
+  CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyIn], cudaStreamNonBlocking));
+  CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyOut], cudaStreamNonBlocking));
+}
+
+bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
+  return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::CUDA_PINNED
+         || dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED;
+}
+
+common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const {
+  size_t bytes = src.Size();
+  const void* src_data = src.DataRaw();
+  void* dst_data = dst.MutableDataRaw();
+
+  auto& src_device = src.Location().device;
+  auto& dst_device = dst.Location().device;
+
+  if (dst_device.Type() == OrtDevice::GPU) {
+    if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
+      // copy from pinned memory to GPU, this is non-blocking
+      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, streams_[exec_queue_id]));
+    } else if (src_device.Type() == OrtDevice::GPU) {
+      // copying between GPU, this is non-blocking
+      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice, streams_[kCudaStreamDefault]));
+    } else {
+      // copy from other CPU memory to GPU, this is blocking
+      CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyHostToDevice));
+    }
+  } else if (src_device.Type() == OrtDevice::GPU) {
+    if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
+      // copying from GPU to pinned memory, this is non-blocking
+      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, streams_[exec_queue_id]));
+    } else {
+      // copying from GPU to CPU memory, this is blocking
+      CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToHost));
+    }
+  } else {
+    // copying between cpu memory
+    memcpy(dst_data, src_data, bytes);
+  }
+
+  return Status::OK();
+}
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/gpu_data_transfer.h b/onnxruntime/core/providers/cuda/gpu_data_transfer.h
new file mode 100644
index 0000000000..f0acae0f8d
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/gpu_data_transfer.h
@@ -0,0 +1,35 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "cuda_pch.h"
+#include "core/framework/data_transfer.h"
+
+namespace onnxruntime {
+
+enum CUDAStreamType : int {
+  kCudaStreamDefault = 0,
+  kCudaStreamCopyIn,
+  kCudaStreamCopyOut,
+  kTotalCudaStreams,
+};
+
+class GPUDataTransfer : public IDataTransfer {
+ public:
+  GPUDataTransfer();
+
+  bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;
+
+  common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const override;
+
+  cudaStream_t GetStream(int queue_id) const {
+    ORT_ENFORCE(queue_id >= 0 && queue_id < kTotalCudaStreams);
+    return streams_[queue_id];
+  }
+
+ private:
+  cudaStream_t streams_[kTotalCudaStreams];
+};
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.cc b/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.cc
index 307cdb7454..a9ea7ca581 100644
--- a/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.cc
+++ b/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.cc
@@ -41,10 +41,10 @@ ONNX_OPERATOR_KERNEL_EX(
 MKLDNNExecutionProvider::MKLDNNExecutionProvider(const MKLDNNExecutionProviderInfo& info)
     : IExecutionProvider{onnxruntime::kMklDnnExecutionProvider} {
   DeviceAllocatorRegistrationInfo default_allocator_info({OrtMemTypeDefault,
-                                                          [](int) { return std::make_unique<CPUAllocator>(std::make_unique<OrtAllocatorInfo>(MKLDNN, OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemTypeDefault)); }, std::numeric_limits<size_t>::max()});
+                                                          [](int) { return std::make_unique<CPUAllocator>(std::make_unique<OrtAllocatorInfo>(MKLDNN, OrtAllocatorType::OrtDeviceAllocator)); }, std::numeric_limits<size_t>::max()});
 
   DeviceAllocatorRegistrationInfo cpu_allocator_info({OrtMemTypeCPUOutput,
-                                                      [](int) { return std::make_unique<CPUAllocator>(std::make_unique<OrtAllocatorInfo>(MKLDNN_CPU, OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemTypeCPUOutput)); }, std::numeric_limits<size_t>::max()});
+                                                      [](int) { return std::make_unique<CPUAllocator>(std::make_unique<OrtAllocatorInfo>(MKLDNN_CPU, OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), 0, OrtMemTypeCPUOutput)); }, std::numeric_limits<size_t>::max()});
 
   if (info.create_arena) {
     InsertAllocator(CreateAllocator(default_allocator_info));
@@ -62,23 +62,6 @@ MKLDNNExecutionProvider::MKLDNNExecutionProvider(const MKLDNNExecutionProviderIn
 MKLDNNExecutionProvider::~MKLDNNExecutionProvider() {
 }
 
-Status MKLDNNExecutionProvider::CopyTensor(const Tensor& src, Tensor& dst) const {
-  // Support CPU <-> MKLDNN for now
-  if (!(strcmp(src.Location().name, MKLDNN) == 0 && strcmp(dst.Location().name, CPU) == 0) &&
-      !(strcmp(src.Location().name, CPU) == 0 && strcmp(dst.Location().name, MKLDNN) == 0) &&
-      !(strcmp(src.Location().name, MKLDNN) == 0 && strcmp(dst.Location().name, MKLDNN_CPU) == 0)) {
-    ORT_NOT_IMPLEMENTED(src.Location().name, " copy to ", dst.Location().name, " is not implemented");
-  }
-
-  // Todo: Copy for now. May optimize later to avoid copy.
-  size_t bytes = src.DataType()->Size() * src.Shape().Size();
-  const void* src_data = src.DataRaw();
-  void* dst_data = dst.MutableDataRaw();
-  memcpy(dst_data, src_data, bytes);
-
-  return Status::OK();
-}
-
 namespace mkl_dnn {
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kMklDnnExecutionProvider, kOnnxDomain, 1, Conv);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kMklDnnExecutionProvider, kOnnxDomain, 7, Gemm);
diff --git a/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.h b/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.h
index 7e844adce5..3e173e7a79 100644
--- a/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.h
+++ b/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.h
@@ -35,8 +35,6 @@ class MKLDNNExecutionProvider : public IExecutionProvider {
   explicit MKLDNNExecutionProvider(const MKLDNNExecutionProviderInfo& info);
   virtual ~MKLDNNExecutionProvider();
 
-  Status CopyTensor(const Tensor& src, Tensor& dst) const override;
-
   virtual std::shared_ptr<KernelRegistry> GetKernelRegistry() const override;
 
   std::shared_ptr<mkldnn::memory> GetWeightsMemoryBuffer(const std::string& weight_key) {
diff --git a/onnxruntime/core/providers/ngraph/ngraph_execution_provider.cc b/onnxruntime/core/providers/ngraph/ngraph_execution_provider.cc
index 7749a21532..459deae2c8 100644
--- a/onnxruntime/core/providers/ngraph/ngraph_execution_provider.cc
+++ b/onnxruntime/core/providers/ngraph/ngraph_execution_provider.cc
@@ -34,13 +34,13 @@ constexpr const char* NGRAPH = "nGraph";
 NGRAPHExecutionProvider::NGRAPHExecutionProvider(const NGRAPHExecutionProviderInfo& info)
     : IExecutionProvider{onnxruntime::kNGraphExecutionProvider} {
   DeviceAllocatorRegistrationInfo default_allocator_info({OrtMemTypeDefault,
-                                                          [](int) { return std::make_unique<CPUAllocator>(std::make_unique<OrtAllocatorInfo>(NGRAPH, OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemTypeDefault)); },
+                                                          [](int) { return std::make_unique<CPUAllocator>(std::make_unique<OrtAllocatorInfo>(NGRAPH, OrtAllocatorType::OrtDeviceAllocator)); },
                                                           std::numeric_limits<size_t>::max()});
 
   InsertAllocator(CreateAllocator(default_allocator_info));
 
   DeviceAllocatorRegistrationInfo cpu_allocator_info({OrtMemTypeCPUOutput,
-                                                      [](int) { return std::make_unique<CPUAllocator>(std::make_unique<OrtAllocatorInfo>(NGRAPH, OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemTypeCPUOutput)); },
+                                                      [](int) { return std::make_unique<CPUAllocator>(std::make_unique<OrtAllocatorInfo>(NGRAPH, OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), 0, OrtMemTypeCPUOutput)); },
                                                       std::numeric_limits<size_t>::max()});
 
   InsertAllocator(CreateAllocator(cpu_allocator_info));
@@ -76,24 +76,6 @@ bool TensorCopyPossible(const std::string& src_location, const std::string& dst_
                      });
 }
 
-Status NGRAPHExecutionProvider::CopyTensor(const Tensor& src, Tensor& dst) const {
-  const size_t src_bytes = src.DataType()->Size() * src.Shape().Size();
-  const size_t dst_bytes = dst.DataType()->Size() * dst.Shape().Size();
-  if (src_bytes != dst_bytes) {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
-                           "nGraph: Source and Destination data sizes are not equal - cannot copy tensors");
-  }
-
-  if (!TensorCopyPossible(src.Location().name, dst.Location().name)) {
-    ORT_NOT_IMPLEMENTED("Copying tensors between '", src.Location().name, "' and '", dst.Location().name,
-                        "' is not implemented in NGRAPHExecutionProvider");
-  }
-
-  MEMCPY_S(dst.MutableDataRaw(), src.DataRaw(), dst_bytes, src_bytes);
-
-  return Status::OK();
-}
-
 // Returns true only if op is in a mode that is not currently supported
 static bool IsUnsupportedOpMode(const Node* node, const onnxruntime::GraphViewer& graph_viewer) {
   const auto& optype = node->OpType();
diff --git a/onnxruntime/core/providers/ngraph/ngraph_execution_provider.h b/onnxruntime/core/providers/ngraph/ngraph_execution_provider.h
index a2af812e31..f4081a43a5 100644
--- a/onnxruntime/core/providers/ngraph/ngraph_execution_provider.h
+++ b/onnxruntime/core/providers/ngraph/ngraph_execution_provider.h
@@ -24,8 +24,6 @@ class NGRAPHExecutionProvider : public IExecutionProvider {
   explicit NGRAPHExecutionProvider(const NGRAPHExecutionProviderInfo& info);
   ~NGRAPHExecutionProvider() = default;
 
-  Status CopyTensor(const Tensor& src, Tensor& dst) const override;
-
   std::vector<std::unique_ptr<ComputeCapability>>
   GetCapability(const onnxruntime::GraphViewer& graph_viewer,
                 const std::vector<const KernelRegistry*>& kernel_registries) const override;
diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
index f86e5bdaff..5deda941ff 100644
--- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
+++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
@@ -29,7 +29,7 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(OpenVINOExecutionProviderIn
     : IExecutionProvider{onnxruntime::kOpenVINOExecutionProvider} {
   ORT_UNUSED_PARAMETER(info);
 
-  DeviceAllocatorRegistrationInfo device_info({OrtMemTypeDefault, [](int) { return std::make_unique<CPUAllocator>(std::make_unique<OrtAllocatorInfo>(OPENVINO, OrtDeviceAllocator, 0, OrtMemTypeDefault)); }, std::numeric_limits<size_t>::max()});
+  DeviceAllocatorRegistrationInfo device_info({OrtMemTypeDefault, [](int) { return std::make_unique<CPUAllocator>(std::make_unique<OrtAllocatorInfo>(OPENVINO, OrtDeviceAllocator)); }, std::numeric_limits<size_t>::max()});
   InsertAllocator(CreateAllocator(device_info));
 }
 
diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h
index 6b6c27df72..5ec06a1fee 100644
--- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h
+++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h
@@ -47,16 +47,6 @@ class OpenVINOExecutionProvider : public IExecutionProvider {
     return std::make_shared<KernelRegistry>();
   }
 
-  common::Status CopyTensor(const Tensor& src, Tensor& dst) const override {
-    // TODO: Copy for now. May optimize later to avoid copy.
-    size_t bytes = src.DataType()->Size() * src.Shape().Size();
-    const void* src_data = src.DataRaw();
-    void* dst_data = dst.MutableDataRaw();
-    memcpy(dst_data, src_data, bytes);
-
-    return Status::OK();
-  }
-
   const void* GetExecutionHandle() const noexcept override {
     return nullptr;
   }
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_allocator.h b/onnxruntime/core/providers/tensorrt/tensorrt_allocator.h
index f6d70bdde0..ff2e747146 100755
--- a/onnxruntime/core/providers/tensorrt/tensorrt_allocator.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_allocator.h
@@ -12,7 +12,7 @@ class TensorrtPinnedAllocator : public CPUAllocator {
  public:
   virtual const OrtAllocatorInfo& Info() const override {
     static OrtAllocatorInfo tensorrt_cpu_allocator_info(TRT,
-                                                   OrtAllocatorType::OrtDeviceAllocator, 0,
+                                                   OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), 0,
                                                    OrtMemType::OrtMemTypeCPU);
     return tensorrt_cpu_allocator_info;
   }
@@ -25,8 +25,7 @@ class TensorrtAllocator : public CPUAllocator {
  public:
   virtual const OrtAllocatorInfo& Info() const override {
     static OrtAllocatorInfo tensorrt_default_allocator_info(TRT,
-                                                       OrtAllocatorType::OrtDeviceAllocator, 0,
-                                                       OrtMemType::OrtMemTypeDefault);
+                                                       OrtAllocatorType::OrtDeviceAllocator);
     return tensorrt_default_allocator_info;
   }
 };
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index af0bbdfe66..1ff6839af4 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -280,12 +280,6 @@ TensorrtExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
   return result;
 }
 
-common::Status TensorrtExecutionProvider::CopyTensor(const Tensor& src, Tensor& dst) const {
-  ORT_UNUSED_PARAMETER(src);
-  ORT_UNUSED_PARAMETER(dst);
-  return Status::OK();
-}
-
 common::Status TensorrtExecutionProvider::Compile(const std::vector<onnxruntime::Node*>& fused_nodes,
                                                   std::vector<NodeComputeInfo>& node_compute_funcs) {
   for (const auto* fused_node : fused_nodes) {
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
index 723375ce40..7f7b27b60b 100755
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@@ -62,8 +62,6 @@ class TensorrtExecutionProvider : public IExecutionProvider {
   common::Status Compile(const std::vector<onnxruntime::Node*>& fused_nodes,
                          std::vector<NodeComputeInfo>& node_compute_funcs) override;
 
-  Status CopyTensor(const Tensor& src, Tensor& dst) const override;
-
   void SetMaxBatchSize(const int batch_size) {
     max_batch_size_ = batch_size;
   }
diff --git a/onnxruntime/core/session/IOBinding.h b/onnxruntime/core/session/IOBinding.h
index 1896bd778b..703a1a66e4 100644
--- a/onnxruntime/core/session/IOBinding.h
+++ b/onnxruntime/core/session/IOBinding.h
@@ -44,7 +44,7 @@ class IOBinding {
    * copy it to the desired location. This copy may or may not be async. It depends on the exec provider.
    * If the input ort_value is not at the desired location, it should be preallocated
    * If the input ort_value isn't preallocated, it should have memtype of OrtMemTypeDefault
-   * For copying it leverages IExecutionProvider::CopyTensor().
+   * For copying it leverages DataTransferManager::CopyTensor().
    */
   common::Status BindInput(const std::string& name, const OrtValue& ml_value);
 
diff --git a/onnxruntime/core/session/default_cpu_allocator_c_api.cc b/onnxruntime/core/session/default_cpu_allocator_c_api.cc
index e699f605c5..a4a7e7ea51 100644
--- a/onnxruntime/core/session/default_cpu_allocator_c_api.cc
+++ b/onnxruntime/core/session/default_cpu_allocator_c_api.cc
@@ -17,7 +17,7 @@ struct OrtDefaultAllocator : OrtAllocatorImpl {
     OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast<OrtDefaultAllocator*>(this_)->Alloc(size); };
     OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<OrtDefaultAllocator*>(this_)->Free(p); };
     OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast<const OrtDefaultAllocator*>(this_)->Info(); };
-    ORT_THROW_ON_ERROR(OrtCreateAllocatorInfo("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault, &cpuAllocatorInfo));
+    ORT_THROW_ON_ERROR(OrtCreateCpuAllocatorInfo(OrtDeviceAllocator, OrtMemTypeDefault, &cpuAllocatorInfo));
   }
 
   ~OrtDefaultAllocator() override { OrtReleaseAllocatorInfo(cpuAllocatorInfo); }
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index e245f5f020..26a630efaa 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -44,6 +44,9 @@
 #include "core/optimizer/insert_cast_transformer.h"
 #include "core/optimizer/transformer_memcpy.h"
 #include "core/providers/cpu/cpu_execution_provider.h"
+#ifdef USE_CUDA
+#include "core/providers/cuda/gpu_data_transfer.h"
+#endif
 #include "core/session/IOBinding.h"
 #include "core/session/custom_ops.h"
 #include "core/util/protobuf_parsing_utils.h"
@@ -100,6 +103,13 @@ InferenceSession::InferenceSession(const SessionOptions& session_options, loggin
 
   InitLogger(logging_manager);
 
+  // Register data transfer methods.
+  data_transfer_mgr_.RegisterDataTransfer(std::make_unique<CPUDataTransfer>());
+#ifdef USE_CUDA
+  data_transfer_mgr_.RegisterDataTransfer(std::make_unique<GPUDataTransfer>());
+#endif
+  session_state_.SetDataTransferMgr(&data_transfer_mgr_);
+
   // The threadpool is currently evolving.  We will always create a per session threadpool.
   // Beyond this, we will create a global thread pool to share across sessions.
   {
@@ -396,7 +406,8 @@ common::Status InferenceSession::CreateSubgraphSessionState(Graph& graph, Sessio
       subgraph_session_state->SetLogger(*session_logger_);
       // Pass threadpool to subgraph
       subgraph_session_state->SetThreadPool(session_state.GetThreadPool());
-
+      // Pass data transfer manager to subgraph.
+      subgraph_session_state->SetDataTransferMgr(&session_state.GetDataTransferMgr());
       // Pass fused function manager to subgraph
       subgraph_session_state->GetMutableFuncMgr().SetFusedFuncs(session_state.GetFuncMgr());
 
diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h
index d1a7a57f04..38f7c7eae7 100644
--- a/onnxruntime/core/session/inference_session.h
+++ b/onnxruntime/core/session/inference_session.h
@@ -420,6 +420,8 @@ class InferenceSession {
 
   // Threadpool for this session
   std::unique_ptr<onnxruntime::concurrency::ThreadPool> thread_pool_;
+  // Data transfer manager.
+  DataTransferManager data_transfer_mgr_;
 
   // Number of concurrently running executors
   std::atomic<int> current_num_runs_;
diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc
index fa37c8784a..2a4cf1c948 100644
--- a/onnxruntime/core/session/onnxruntime_c_api.cc
+++ b/onnxruntime/core/session/onnxruntime_c_api.cc
@@ -525,7 +525,7 @@ ORT_API_STATUS_IMPL(OrtTensorProtoToOrtValue, _In_ const void* input, int input_
                     _Out_ OrtValue** out, _Out_ OrtCallback** deleter) {
   API_IMPL_BEGIN
   OrtAllocatorInfo* cpuAllocatorInfo;
-  auto st = OrtCreateAllocatorInfo("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault, &cpuAllocatorInfo);
+  auto st = OrtCreateCpuAllocatorInfo(OrtDeviceAllocator, OrtMemTypeDefault, &cpuAllocatorInfo);
   if (st != nullptr) return st;
   ::ONNX_NAMESPACE::TensorProto proto;
   if (!proto.ParseFromArray(input, input_len)) {
diff --git a/onnxruntime/server/executor.cc b/onnxruntime/server/executor.cc
index 74c7c8aaa0..df37799ce0 100644
--- a/onnxruntime/server/executor.cc
+++ b/onnxruntime/server/executor.cc
@@ -62,8 +62,8 @@ protobufutil::Status Executor::SetNameMLValueMap(std::vector<std::string>& input
   auto ort_status = OrtCreateCpuAllocatorInfo(OrtArenaAllocator, OrtMemTypeDefault, &allocator_info);
 
   if (ort_status != nullptr || allocator_info == nullptr) {
-    logger->error("OrtCreateAllocatorInfo failed");
-    return protobufutil::Status(protobufutil::error::Code::RESOURCE_EXHAUSTED, "OrtCreateAllocatorInfo() failed");
+    logger->error("OrtCreateCpuAllocatorInfo failed");
+    return protobufutil::Status(protobufutil::error::Code::RESOURCE_EXHAUSTED, "OrtCreateCpuAllocatorInfo() failed");
   }
 
   // Prepare the Value object
diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc
index 65843d7a7b..c555c9ba21 100644
--- a/onnxruntime/test/framework/allocation_planner_test.cc
+++ b/onnxruntime/test/framework/allocation_planner_test.cc
@@ -196,7 +196,7 @@ class PlannerTest : public ::testing::Test {
   void BindKernel(onnxruntime::Node* p_node, ::onnxruntime::KernelDef& kernel_def) {
     auto info = std::make_unique<OpKernelInfo>(*p_node, kernel_def, *execution_providers_.Get(*p_node),
                                                state_.GetInitializedTensors(), state_.GetOrtValueNameIdxMap(),
-                                               state_.GetFuncMgr());
+                                               state_.GetFuncMgr(), state_.GetDataTransferMgr());
     auto dummy = std::make_unique<DummyOpKernel>(*info);
     op_kernel_infos_.push_back(std::move(info));
     state_.AddKernel(p_node->Index(), std::move(dummy));
diff --git a/onnxruntime/test/framework/allocator_test.cc b/onnxruntime/test/framework/allocator_test.cc
index c0594514f2..694b826267 100644
--- a/onnxruntime/test/framework/allocator_test.cc
+++ b/onnxruntime/test/framework/allocator_test.cc
@@ -49,7 +49,7 @@ class TestAllocator : public IAllocator {
   }
 
   virtual const OrtAllocatorInfo& Info() const override {
-    static OrtAllocatorInfo info("test", OrtDeviceAllocator, 0);
+    static OrtAllocatorInfo info("test", OrtDeviceAllocator);
     return info;
   }
 
diff --git a/onnxruntime/test/framework/dummy_provider.h b/onnxruntime/test/framework/dummy_provider.h
index f7ff420a8d..8ee907a2ca 100644
--- a/onnxruntime/test/framework/dummy_provider.h
+++ b/onnxruntime/test/framework/dummy_provider.h
@@ -18,23 +18,6 @@ class DummyExecutionProvider : public IExecutionProvider {
     InsertAllocator(std::make_unique<DummyAllocator>());
   }
 
-  Status CopyTensor(const Tensor& src, Tensor& dst) const override {
-    // we can 'copy' from anything we allocated to/from CPU
-    ORT_ENFORCE(strcmp(dst.Location().name, DummyAllocator::kDummyAllocator) == 0 ||
-                strcmp(dst.Location().name, CPU) == 0);
-    ORT_ENFORCE(strcmp(src.Location().name, DummyAllocator::kDummyAllocator) == 0 ||
-                strcmp(src.Location().name, CPU) == 0);
-
-    // no really copy needed.
-    const void* src_data = src.DataRaw();
-    void* dst_data = dst.MutableDataRaw();
-
-    // copying between cpu memory
-    memcpy(dst_data, src_data, src.Size());
-
-    return Status::OK();
-  }
-
   std::shared_ptr<KernelRegistry> GetKernelRegistry() const override;
 };
 
diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc
index 77cefc9e0c..cfbd405a86 100644
--- a/onnxruntime/test/framework/inference_session_test.cc
+++ b/onnxruntime/test/framework/inference_session_test.cc
@@ -14,6 +14,7 @@
 #include "core/common/logging/logging.h"
 #include "core/common/profiler.h"
 #include "core/framework/compute_capability.h"
+#include "core/framework/data_transfer_manager.h"
 #include "core/framework/execution_provider.h"
 #include "core/framework/kernel_registry.h"
 #include "core/framework/op_kernel.h"
@@ -25,6 +26,9 @@
 #include "core/platform/env.h"
 #include "core/providers/cpu/cpu_execution_provider.h"
 #include "core/providers/cpu/math/element_wise_ops.h"
+#ifdef USE_CUDA
+#include "core/providers/cuda/gpu_data_transfer.h"
+#endif
 #include "core/session/IOBinding.h"
 #include "dummy_provider.h"
 #include "test_utils.h"
@@ -112,12 +116,6 @@ class FuseExecutionProvider : public IExecutionProvider {
     static std::shared_ptr<KernelRegistry> kernel_registry = GetFusedKernelRegistry();
     return kernel_registry;
   }
-
-  common::Status CopyTensor(const Tensor& src, Tensor& dst) const override {
-    ORT_UNUSED_PARAMETER(src);
-    ORT_UNUSED_PARAMETER(dst);
-    return Status::OK();
-  }
 };
 
 namespace test {
@@ -284,7 +282,7 @@ void RunModelWithBindingMatMul(InferenceSession& session_object,
     std::unique_ptr<Tensor> cpu_tensor = std::make_unique<Tensor>(element_type,
                                                                   shape,
                                                                   cpu_allocator);
-    st = TestCudaExecutionProvider()->CopyTensor(rtensor, *cpu_tensor.get());
+    st = GPUDataTransfer().CopyTensor(rtensor, *cpu_tensor.get(), 0);
     ASSERT_TRUE(st.IsOK());
     OrtValue ml_value;
     ml_value.Init(cpu_tensor.release(),
diff --git a/onnxruntime/test/framework/op_kernel_test.cc b/onnxruntime/test/framework/op_kernel_test.cc
index 3a6fb5efa9..707495b388 100644
--- a/onnxruntime/test/framework/op_kernel_test.cc
+++ b/onnxruntime/test/framework/op_kernel_test.cc
@@ -21,13 +21,6 @@ namespace test {
 class XPUExecutionProvider : public IExecutionProvider {
  public:
   XPUExecutionProvider() : IExecutionProvider{onnxruntime::kCpuExecutionProvider} {}
-
-  Status CopyTensor(const Tensor& src, Tensor& dst) const override {
-    ORT_UNUSED_PARAMETER(src);
-    ORT_UNUSED_PARAMETER(dst);
-    return Status::OK();
-  }
-
 };
 
 }  // namespace test
diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc
index fe5aa5e5db..b2e422c69e 100644
--- a/onnxruntime/test/framework/session_state_test.cc
+++ b/onnxruntime/test/framework/session_state_test.cc
@@ -57,7 +57,7 @@ TEST(SessionStateTest, AddGetKernelTest) {
   CPUExecutionProvider execution_provider{CPUExecutionProviderInfo{"CPUExecutionProvider"}};
 
   OpKernelInfo p_info(node, kernel_def, execution_provider, s.GetConstantInitializedTensors(),
-                      s.GetOrtValueNameIdxMap(), s.GetFuncMgr());
+                      s.GetOrtValueNameIdxMap(), s.GetFuncMgr(), s.GetDataTransferMgr());
   unique_ptr<TestOpKernel> p_kernel;
   p_kernel.reset(new TestOpKernel(p_info));
   size_t orig_num_outputs = p_kernel->Node().OutputDefs().size();
diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc
index df59746e20..16b2e409f6 100644
--- a/onnxruntime/test/onnx/TestCase.cc
+++ b/onnxruntime/test/onnx/TestCase.cc
@@ -92,7 +92,7 @@ OrtValue* CreateTensorWithDataAsOrtValue(OrtAllocatorInfo* info, std::vector<T>&
 template <typename key_type, typename value_type>
 OrtValue* PbMapToOrtValue(const google::protobuf::Map<key_type, value_type>& map) {
   OrtAllocatorInfo* info;
-  ORT_THROW_ON_ERROR(OrtCreateAllocatorInfo("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault, &info));
+  ORT_THROW_ON_ERROR(OrtCreateCpuAllocatorInfo(OrtDeviceAllocator, OrtMemTypeDefault, &info));
   std::unique_ptr<OrtAllocatorInfo, decltype(&OrtReleaseAllocatorInfo)> rel_info(info, OrtReleaseAllocatorInfo);
   const size_t ele_count = map.size();
   std::vector<int64_t> dims(1, ele_count);
@@ -122,7 +122,7 @@ OrtValue* PbMapToOrtValue(const google::protobuf::Map<key_type, value_type>& map
 template <typename T>
 void VectorProtoToOrtValue(const RepeatedPtrField<T>& input, ORT_VALUE_HOLDER& output) {
   OrtAllocatorInfo* info;
-  ORT_THROW_ON_ERROR(OrtCreateAllocatorInfo("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault, &info));
+  ORT_THROW_ON_ERROR(OrtCreateCpuAllocatorInfo(OrtDeviceAllocator, OrtMemTypeDefault, &info));
   std::unique_ptr<OrtAllocatorInfo, decltype(&OrtReleaseAllocatorInfo)> rel_info(info, OrtReleaseAllocatorInfo);
   OrtValueArray in(input.size());
   size_t j = 0;
diff --git a/onnxruntime/test/shared_lib/test_allocator.cc b/onnxruntime/test/shared_lib/test_allocator.cc
index 3ba04d1065..372594fa8f 100644
--- a/onnxruntime/test/shared_lib/test_allocator.cc
+++ b/onnxruntime/test/shared_lib/test_allocator.cc
@@ -10,7 +10,7 @@ using namespace onnxruntime;
 
 TEST_F(CApiTest, allocation_info) {
   OrtAllocatorInfo *info1, *info2;
-  ORT_THROW_ON_ERROR(OrtCreateAllocatorInfo("Cpu", OrtArenaAllocator, 0, OrtMemTypeDefault, &info1));
+  ORT_THROW_ON_ERROR(OrtCreateCpuAllocatorInfo(OrtArenaAllocator, OrtMemTypeDefault, &info1));
   ORT_THROW_ON_ERROR(OrtCreateCpuAllocatorInfo(OrtArenaAllocator, OrtMemTypeDefault, &info2));
   int result;
   ORT_THROW_ON_ERROR(OrtCompareAllocatorInfo(info1, info2, &result));
diff --git a/onnxruntime/test/util/test_allocator.cc b/onnxruntime/test/util/test_allocator.cc
index 97f8fd5c4d..3a91d9d52f 100644
--- a/onnxruntime/test/util/test_allocator.cc
+++ b/onnxruntime/test/util/test_allocator.cc
@@ -8,7 +8,7 @@ MockedOrtAllocator::MockedOrtAllocator() {
   OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast<MockedOrtAllocator*>(this_)->Alloc(size); };
   OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<MockedOrtAllocator*>(this_)->Free(p); };
   OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast<const MockedOrtAllocator*>(this_)->Info(); };
-  ORT_THROW_ON_ERROR(OrtCreateAllocatorInfo("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault, &cpuAllocatorInfo));
+  ORT_THROW_ON_ERROR(OrtCreateCpuAllocatorInfo(OrtDeviceAllocator, OrtMemTypeDefault, &cpuAllocatorInfo));
 }
 
 MockedOrtAllocator::~MockedOrtAllocator() {