Dump cuda tensor data (#2243)

* dump cuda tensor

* move data_type definition

* Dump cuda tensors for cuda build only.
Output tensor location (if it is not in CPU or pinned)

* update for cuda build

* Update for code review feedback

* update for CR feedback

* use data transfer manager for tensor copy
This commit is contained in:
Tianlei Wu 2019-10-31 21:09:10 -07:00 committed by GitHub
parent 7a5de9c958
commit bc85d43809
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -18,7 +18,6 @@
#include "core/framework/sequential_executor.h"
#include "core/framework/tensorprotoutils.h"
#include "core/mlas/inc/mlas.h"
#include "core/graph/onnx_protobuf.h"
namespace ONNX_NAMESPACE {
@ -610,15 +609,11 @@ void DumpNodeOutputs(OpKernelContext& context, const Node& node, const SessionSt
std::cout << "-----------\n";
const auto& output_defs = node.OutputDefs();
const auto& execution_providers = session_state.GetExecutionProviders();
const auto* cpu_execution_provider = execution_providers.Get(onnxruntime::kCpuExecutionProvider);
for (auto i = 0, end = context.OutputCount(); i < end; ++i) {
if (output_defs[i]->Exists()) {
std::cout << "Output " << i << " Name: " << output_defs[i]->Name();
const auto* type = context.OutputType(i);
if (type) {
if (type->IsTensorType()) {
const auto& tensor = *context.Output<Tensor>(i);
@ -629,16 +624,28 @@ void DumpNodeOutputs(OpKernelContext& context, const Node& node, const SessionSt
if (DEBUG_NODE_INPUTS_OUTPUTS > 1) {
// check tensor is on CPU before dumping it
auto& tensor_location = tensor.Location();
auto* provider = execution_providers.Get(tensor_location);
if (!provider) {
provider = cpu_execution_provider;
}
if (provider == cpu_execution_provider || tensor_location.mem_type == OrtMemTypeCPUOutput) {
const auto data_type = tensor.DataType();
const auto data_type = tensor.DataType();
if (tensor_location.device.Type() == OrtDevice::CPU || tensor_location.mem_type == OrtMemTypeCPUOutput) {
DispatchOnTensorType(data_type, DumpTensor, tensor, shape);
} else {
std::cout << " is not on CPU. Provider=" << provider->Type() << "\n";
std::cout << tensor_location << "\n";
#ifdef USE_CUDA
// Dumping GPU only when cuda is enabled. Most op has only one output, so put GPU related code here to get best performance.
if (tensor_location.device.Type() == OrtDevice::GPU) {
const auto& execution_providers = session_state.GetExecutionProviders();
const auto* cpu_execution_provider = execution_providers.Get(onnxruntime::kCpuExecutionProvider);
auto cpu_allocator = cpu_execution_provider->GetAllocator(0, OrtMemTypeDefault);
std::unique_ptr<Tensor> cpu_tensor = onnxruntime::make_unique<Tensor>(data_type, shape, cpu_allocator);
const auto& data_transfer_mgr = session_state.GetDataTransferMgr();
auto status = data_transfer_mgr.CopyTensor(tensor, *cpu_tensor.get(), 0);
if (status == common::Status::OK()) {
DispatchOnTensorType(data_type, DumpTensor, *cpu_tensor.get(), shape);
} else {
std::cout << " failed to transfer data to cpu.\n";
}
}
#endif
}
}
} else {