Dump cuda tensor data (#2243)

* dump cuda tensor * move data_type definition * Dump cuda tensors for cuda build only. Output tensor location (if it is not in CPU or pinned) * update for cuda build * Update for code review feedback * update for CR feedback * use data transfer manager for tensor copy
2026-06-27 03:11:28 +00:00 · 2019-10-31 21:09:10 -07:00 · 2019-10-31 21:09:10 -07:00 · bc85d43809
commit bc85d43809
parent 7a5de9c958
1 changed files with 20 additions and 13 deletions
--- a/onnxruntime/core/framework/utils.cc
+++ b/onnxruntime/core/framework/utils.cc
@ -18,7 +18,6 @@
 #include "core/framework/sequential_executor.h"
 #include "core/framework/tensorprotoutils.h"
 #include "core/mlas/inc/mlas.h"
-
 #include "core/graph/onnx_protobuf.h"

 namespace ONNX_NAMESPACE {
@ -610,15 +609,11 @@ void DumpNodeOutputs(OpKernelContext& context, const Node& node, const SessionSt
  std::cout << "-----------\n";
  const auto& output_defs = node.OutputDefs();

-  const auto& execution_providers = session_state.GetExecutionProviders();
-  const auto* cpu_execution_provider = execution_providers.Get(onnxruntime::kCpuExecutionProvider);
-
  for (auto i = 0, end = context.OutputCount(); i < end; ++i) {
    if (output_defs[i]->Exists()) {
      std::cout << "Output " << i << " Name: " << output_defs[i]->Name();

      const auto* type = context.OutputType(i);
-
      if (type) {
        if (type->IsTensorType()) {
          const auto& tensor = *context.Output<Tensor>(i);
@ -629,16 +624,28 @@ void DumpNodeOutputs(OpKernelContext& context, const Node& node, const SessionSt
          if (DEBUG_NODE_INPUTS_OUTPUTS > 1) {
            // check tensor is on CPU before dumping it
            auto& tensor_location = tensor.Location();
-            auto* provider = execution_providers.Get(tensor_location);
-            if (!provider) {
-              provider = cpu_execution_provider;
-            }
-
-            if (provider == cpu_execution_provider || tensor_location.mem_type == OrtMemTypeCPUOutput) {
-              const auto data_type = tensor.DataType();
+            const auto data_type = tensor.DataType();
+            if (tensor_location.device.Type() == OrtDevice::CPU || tensor_location.mem_type == OrtMemTypeCPUOutput) {
              DispatchOnTensorType(data_type, DumpTensor, tensor, shape);
            } else {
-              std::cout << " is not on CPU. Provider=" << provider->Type() << "\n";
+              std::cout << tensor_location << "\n";
+
+#ifdef USE_CUDA
+              // Dump GPU tensors only when CUDA is enabled. Most ops have only one output, so the GPU-related code is kept here for best performance.
+              if (tensor_location.device.Type() == OrtDevice::GPU) {
+                const auto& execution_providers = session_state.GetExecutionProviders();
+                const auto* cpu_execution_provider = execution_providers.Get(onnxruntime::kCpuExecutionProvider);
+                auto cpu_allocator = cpu_execution_provider->GetAllocator(0, OrtMemTypeDefault);
+                std::unique_ptr<Tensor> cpu_tensor = onnxruntime::make_unique<Tensor>(data_type, shape, cpu_allocator);
+                const auto& data_transfer_mgr = session_state.GetDataTransferMgr();
+                auto status = data_transfer_mgr.CopyTensor(tensor, *cpu_tensor.get(), 0);
+                if (status == common::Status::OK()) {
+                  DispatchOnTensorType(data_type, DumpTensor, *cpu_tensor.get(), shape);
+                } else {
+                  std::cout << " failed to transfer data to cpu.\n";
+                }
+              }
+#endif
            }
          }
        } else {