diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc index 85d7dc8aa1..545559d5bd 100644 --- a/onnxruntime/core/framework/utils.cc +++ b/onnxruntime/core/framework/utils.cc @@ -18,7 +18,6 @@ #include "core/framework/sequential_executor.h" #include "core/framework/tensorprotoutils.h" #include "core/mlas/inc/mlas.h" - #include "core/graph/onnx_protobuf.h" namespace ONNX_NAMESPACE { @@ -610,15 +609,11 @@ void DumpNodeOutputs(OpKernelContext& context, const Node& node, const SessionSt std::cout << "-----------\n"; const auto& output_defs = node.OutputDefs(); - const auto& execution_providers = session_state.GetExecutionProviders(); - const auto* cpu_execution_provider = execution_providers.Get(onnxruntime::kCpuExecutionProvider); - for (auto i = 0, end = context.OutputCount(); i < end; ++i) { if (output_defs[i]->Exists()) { std::cout << "Output " << i << " Name: " << output_defs[i]->Name(); const auto* type = context.OutputType(i); - if (type) { if (type->IsTensorType()) { const auto& tensor = *context.Output(i); @@ -629,16 +624,28 @@ void DumpNodeOutputs(OpKernelContext& context, const Node& node, const SessionSt if (DEBUG_NODE_INPUTS_OUTPUTS > 1) { // check tensor is on CPU before dumping it auto& tensor_location = tensor.Location(); - auto* provider = execution_providers.Get(tensor_location); - if (!provider) { - provider = cpu_execution_provider; - } - - if (provider == cpu_execution_provider || tensor_location.mem_type == OrtMemTypeCPUOutput) { - const auto data_type = tensor.DataType(); + const auto data_type = tensor.DataType(); + if (tensor_location.device.Type() == OrtDevice::CPU || tensor_location.mem_type == OrtMemTypeCPUOutput) { DispatchOnTensorType(data_type, DumpTensor, tensor, shape); } else { - std::cout << " is not on CPU. Provider=" << provider->Type() << "\n"; + std::cout << tensor_location << "\n"; + +#ifdef USE_CUDA + // Dumping GPU only when cuda is enabled. 
Most ops have only one output, so the GPU-related code is kept here for best performance. + if (tensor_location.device.Type() == OrtDevice::GPU) { + const auto& execution_providers = session_state.GetExecutionProviders(); + const auto* cpu_execution_provider = execution_providers.Get(onnxruntime::kCpuExecutionProvider); + auto cpu_allocator = cpu_execution_provider->GetAllocator(0, OrtMemTypeDefault); + std::unique_ptr cpu_tensor = onnxruntime::make_unique(data_type, shape, cpu_allocator); + const auto& data_transfer_mgr = session_state.GetDataTransferMgr(); + auto status = data_transfer_mgr.CopyTensor(tensor, *cpu_tensor.get(), 0); + if (status == common::Status::OK()) { + DispatchOnTensorType(data_type, DumpTensor, *cpu_tensor.get(), shape); + } else { + std::cout << " failed to transfer data to cpu.\n"; + } + } +#endif } } } else {