diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc
index 85d7dc8aa1..545559d5bd 100644
--- a/onnxruntime/core/framework/utils.cc
+++ b/onnxruntime/core/framework/utils.cc
@@ -18,7 +18,6 @@
 #include "core/framework/sequential_executor.h"
 #include "core/framework/tensorprotoutils.h"
 #include "core/mlas/inc/mlas.h"
-
 #include "core/graph/onnx_protobuf.h"
 
 namespace ONNX_NAMESPACE {
@@ -610,15 +609,11 @@ void DumpNodeOutputs(OpKernelContext& context, const Node& node, const SessionSt
   std::cout << "-----------\n";
   const auto& output_defs = node.OutputDefs();
 
-  const auto& execution_providers = session_state.GetExecutionProviders();
-  const auto* cpu_execution_provider = execution_providers.Get(onnxruntime::kCpuExecutionProvider);
-
   for (auto i = 0, end = context.OutputCount(); i < end; ++i) {
     if (output_defs[i]->Exists()) {
       std::cout << "Output " << i << " Name: " << output_defs[i]->Name();
 
       const auto* type = context.OutputType(i);
-
       if (type) {
         if (type->IsTensorType()) {
           const auto& tensor = *context.Output<Tensor>(i);
@@ -629,16 +624,28 @@ void DumpNodeOutputs(OpKernelContext& context, const Node& node, const SessionSt
           if (DEBUG_NODE_INPUTS_OUTPUTS > 1) {
             // check tensor is on CPU before dumping it
             auto& tensor_location = tensor.Location();
-            auto* provider = execution_providers.Get(tensor_location);
-            if (!provider) {
-              provider = cpu_execution_provider;
-            }
-
-            if (provider == cpu_execution_provider || tensor_location.mem_type == OrtMemTypeCPUOutput) {
-              const auto data_type = tensor.DataType();
+            const auto data_type = tensor.DataType();
+            if (tensor_location.device.Type() == OrtDevice::CPU || tensor_location.mem_type == OrtMemTypeCPUOutput) {
               DispatchOnTensorType(data_type, DumpTensor, tensor, shape);
             } else {
-              std::cout << " is not on CPU. Provider=" << provider->Type() << "\n";
+              std::cout << tensor_location << "\n";
+
+#ifdef USE_CUDA
+              // Dumping GPU only when cuda is enabled. Most op has only one output, so put GPU related code here to get best performance.
+              if (tensor_location.device.Type() == OrtDevice::GPU) {
+                const auto& execution_providers = session_state.GetExecutionProviders();
+                const auto* cpu_execution_provider = execution_providers.Get(onnxruntime::kCpuExecutionProvider);
+                auto cpu_allocator = cpu_execution_provider->GetAllocator(0, OrtMemTypeDefault);
+                std::unique_ptr<Tensor> cpu_tensor = onnxruntime::make_unique<Tensor>(data_type, shape, cpu_allocator);
+                const auto& data_transfer_mgr = session_state.GetDataTransferMgr();
+                auto status = data_transfer_mgr.CopyTensor(tensor, *cpu_tensor.get(), 0);
+                if (status == common::Status::OK()) {
+                  DispatchOnTensorType(data_type, DumpTensor, *cpu_tensor.get(), shape);
+                } else {
+                  std::cout << " failed to transfer data to cpu.\n";
+                }
+              }
+#endif
             }
           }
         } else {