mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-27 03:11:28 +00:00
Dump cuda tensor data (#2243)
* dump cuda tensor * move data_type definition * Dump cuda tensors for cuda build only. Output tensor location (if it is not in CPU or pinned) * update for cuda build * Update for code review feedback * update for CR feedback * use data transfer manager for tensor copy
This commit is contained in:
parent
7a5de9c958
commit
bc85d43809
1 changed files with 20 additions and 13 deletions
|
|
@ -18,7 +18,6 @@
|
|||
#include "core/framework/sequential_executor.h"
|
||||
#include "core/framework/tensorprotoutils.h"
|
||||
#include "core/mlas/inc/mlas.h"
|
||||
|
||||
#include "core/graph/onnx_protobuf.h"
|
||||
|
||||
namespace ONNX_NAMESPACE {
|
||||
|
|
@ -610,15 +609,11 @@ void DumpNodeOutputs(OpKernelContext& context, const Node& node, const SessionSt
|
|||
std::cout << "-----------\n";
|
||||
const auto& output_defs = node.OutputDefs();
|
||||
|
||||
const auto& execution_providers = session_state.GetExecutionProviders();
|
||||
const auto* cpu_execution_provider = execution_providers.Get(onnxruntime::kCpuExecutionProvider);
|
||||
|
||||
for (auto i = 0, end = context.OutputCount(); i < end; ++i) {
|
||||
if (output_defs[i]->Exists()) {
|
||||
std::cout << "Output " << i << " Name: " << output_defs[i]->Name();
|
||||
|
||||
const auto* type = context.OutputType(i);
|
||||
|
||||
if (type) {
|
||||
if (type->IsTensorType()) {
|
||||
const auto& tensor = *context.Output<Tensor>(i);
|
||||
|
|
@ -629,16 +624,28 @@ void DumpNodeOutputs(OpKernelContext& context, const Node& node, const SessionSt
|
|||
if (DEBUG_NODE_INPUTS_OUTPUTS > 1) {
|
||||
// check tensor is on CPU before dumping it
|
||||
auto& tensor_location = tensor.Location();
|
||||
auto* provider = execution_providers.Get(tensor_location);
|
||||
if (!provider) {
|
||||
provider = cpu_execution_provider;
|
||||
}
|
||||
|
||||
if (provider == cpu_execution_provider || tensor_location.mem_type == OrtMemTypeCPUOutput) {
|
||||
const auto data_type = tensor.DataType();
|
||||
const auto data_type = tensor.DataType();
|
||||
if (tensor_location.device.Type() == OrtDevice::CPU || tensor_location.mem_type == OrtMemTypeCPUOutput) {
|
||||
DispatchOnTensorType(data_type, DumpTensor, tensor, shape);
|
||||
} else {
|
||||
std::cout << " is not on CPU. Provider=" << provider->Type() << "\n";
|
||||
std::cout << tensor_location << "\n";
|
||||
|
||||
#ifdef USE_CUDA
|
||||
// Dumping GPU only when cuda is enabled. Most op has only one output, so put GPU related code here to get best performance.
|
||||
if (tensor_location.device.Type() == OrtDevice::GPU) {
|
||||
const auto& execution_providers = session_state.GetExecutionProviders();
|
||||
const auto* cpu_execution_provider = execution_providers.Get(onnxruntime::kCpuExecutionProvider);
|
||||
auto cpu_allocator = cpu_execution_provider->GetAllocator(0, OrtMemTypeDefault);
|
||||
std::unique_ptr<Tensor> cpu_tensor = onnxruntime::make_unique<Tensor>(data_type, shape, cpu_allocator);
|
||||
const auto& data_transfer_mgr = session_state.GetDataTransferMgr();
|
||||
auto status = data_transfer_mgr.CopyTensor(tensor, *cpu_tensor.get(), 0);
|
||||
if (status == common::Status::OK()) {
|
||||
DispatchOnTensorType(data_type, DumpTensor, *cpu_tensor.get(), shape);
|
||||
} else {
|
||||
std::cout << " failed to transfer data to cpu.\n";
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
|
|
|||
Loading…
Reference in a new issue