Extend node debugging utilities to push tensors and node placement to SQL database (#8672)

* adding support for tracing to sqldb instead of files

* use compiled statements

* script to pull tensors from db

* link sqlite3

* remove node info redundant with onnx graph

* addressing PR comments

* address PR comments and include program counter

* third party notice

* use find_package

* add to cgmanifests.json

* address thread safety and add pid suffix

* build fix

* python script to select on devicetype

* remove unpopulated and redundant Shape and Type fields

* comment

* comment

* PR comments

* add graph execution counter to session state

* move increment to inference session

* std::endl to \n

* ifdef on graph execution counter

* add ifdef to inference session

* move DEBUG_NODE_INPUTS_OUTPUTS to CMakeLists.txt
This commit is contained in:
Suffian Khan 2021-08-21 00:40:12 -07:00 committed by GitHub
parent 4666a49106
commit 9fa0d8392a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 463 additions and 37 deletions

View file

@ -4747,3 +4747,31 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
_____
SQLite Is Public Domain
All of the code and documentation in SQLite has been dedicated to the public
domain by the authors. All code authors, and representatives of the companies
they work for, have signed affidavits dedicating their contributions to the
public domain and originals of those signed affidavits are stored in a firesafe
at the main offices of Hwaci. Anyone is free to copy, modify, publish, use,
compile, sell, or distribute the original SQLite code, either in source code
form or as a compiled binary, for any purpose, commercial or non-commercial,
and by any means.
The previous paragraph applies to the deliverable code and documentation in
SQLite - those parts of the SQLite library that you actually bundle and ship
with a larger application. Some scripts used as part of the build process (for
example the "configure" scripts generated by autoconf) might fall under other
open-source licenses. Nothing from these build scripts ever reaches the final
deliverable SQLite library, however, and so the licenses associated with those
scripts should not be a factor in assessing your rights to copy and use the
SQLite library.
All of the deliverable code in SQLite has been written from scratch. No code
has been taken from other projects or from the open internet. Every line of
code can be traced back to its original author, and all of those authors have
public domain dedications on file. So the SQLite code base is clean and is
uncontaminated with licensed code from other projects.

View file

@ -469,7 +469,17 @@
},
"comments": "dlpack"
}
},
{
"component": {
"Type": "other",
"Other": {
"Name": "SQLite3",
"Version": "3.22.0",
"DownloadUrl": "http://security.ubuntu.com/ubuntu/pool/main/s/sqlite3/libsqlite3-dev_3.22.0-1ubuntu0.4_amd64.deb"
}
}
}
],
"Version": 1
}
}

View file

@ -83,6 +83,7 @@ cmake_dependent_option(onnxruntime_USE_FULL_PROTOBUF "Link to libprotobuf instea
option(tensorflow_C_PACKAGE_PATH "Path to tensorflow C package installation dir")
option(onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS "Enable operator implemented in language other than cpp" OFF)
option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS "Dump debug information about node inputs and outputs when executing the model." OFF)
# Optional SQLite sink for the node input/output dump. The option is only
# selectable while onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS is ON; otherwise it is
# forced to OFF. Its own default is OFF.
cmake_dependent_option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB "Build dump debug information about node inputs and outputs with support for sql database." OFF "onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS" OFF)
option(onnxruntime_USE_DML "Build with DirectML support" OFF)
option(onnxruntime_USE_MIGRAPHX "Build with AMDMIGraphX support" OFF)
option(onnxruntime_USE_WINML "Build with WinML support" OFF)
@ -1835,3 +1836,8 @@ if (onnxruntime_BUILD_OPSCHEMA_LIB AND onnxruntime_ENABLE_TRAINING)
# opschema library requires training ops as well
include(onnxruntime_opschema_lib.cmake)
endif()
# add_compile_definitions() is directory-scoped on purpose here: the macro is
# defined for every target configured from this directory instead of for one
# library only.
# NOTE(review): presumably required because the macro guards declarations in
# headers shared by several libraries - confirm before narrowing to a target.
if (onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS)
add_compile_definitions(DEBUG_NODE_INPUTS_OUTPUTS)
endif()

View file

@ -75,9 +75,15 @@ if (UNIX AND NOT APPLE AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_BUI
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath='$ORIGIN'")
endif()
if (onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS)
target_compile_definitions(onnxruntime_framework PRIVATE DEBUG_NODE_INPUTS_OUTPUTS)
if (onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB)
  # The SQLite sink of the node input/output dump needs the sqlite3 headers and
  # library. REQUIRED stops configuration with CMake's own diagnostic when the
  # package cannot be located, so no manual not-found branch is needed.
  find_package(SQLite3 REQUIRED)

  # Attach the include path to the consuming target only, instead of adding it
  # to every target in the directory.
  target_include_directories(onnxruntime_framework PRIVATE "${SQLite3_INCLUDE_DIR}")

  # Keyword-less on purpose: the plain and keyword signatures must not be mixed
  # on one target.
  # NOTE(review): switch to PRIVATE if the other target_link_libraries() calls
  # on onnxruntime_framework already use keywords - confirm.
  target_link_libraries(onnxruntime_framework ${SQLite3_LIBRARY})

  target_compile_definitions(onnxruntime_framework PRIVATE DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB)
endif()
install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/framework DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core)

View file

@ -40,6 +40,7 @@ endif()
if (onnxruntime_ENABLE_TRAINING OR onnxruntime_ENABLE_TRAINING_OPS)
target_include_directories(onnxruntime_session PRIVATE ${ORTTRAINING_ROOT})
endif()
if (onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
onnxruntime_add_include_to_target(onnxruntime_session Python::Module)
endif()

View file

@ -7,6 +7,11 @@
#include <iomanip>
#include <cctype>
#include <string>
#ifdef DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB
#include <sqlite3.h>
#endif
#include "core/common/path_utils.h"
#include "core/framework/tensorprotoutils.h"
@ -18,6 +23,15 @@ namespace utils {
namespace {
// Describes one dumped tensor: which tensor it is, which node output produced it,
// which node input consumed it, and during which graph execution it was observed.
struct TensorMetadata {
// name of the tensor (the NodeArg name of the node input/output being dumped)
std::string name;
// "<node name>:<output index>" of the node output that yielded the tensor; empty when dumping inputs
std::string producer;
// "<node name>:<input index>" of the node input that read the tensor; empty when dumping outputs
std::string consumer;
// "CPU" or "GPU"; filled in by DumpTensor depending on where the tensor data resided
std::string device_type;
// graph execution count (NodeDumpContext::iteration) at the time of the dump
size_t step;
};
bool FilterNode(const NodeDumpOptions& dump_options, const Node& node) {
auto match_pattern =
[](const std::string& value, const std::string& delimited_patterns) {
@ -117,17 +131,230 @@ void DumpTensorToFile(const Tensor& tensor, const std::string& tensor_name, cons
ORT_THROW_IF_ERROR(Env::Default().FileClose(output_fd));
}
#ifdef DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB
// Returns the calling thread's connection to the trace database, opening it on first use.
//
// The database file is "<sqlite_db_prefix>-pid<process id>.db", where the prefix comes from
// NodeDumpOptionsFromEnvironmentVariables(). The Tensors and Nodes tables are created when
// they do not exist yet. The connection is closed when the owning thread exits.
//
// Throws (via ORT_ENFORCE) when the database cannot be opened or a table cannot be created;
// in that case the partially opened handle is closed again instead of being leaked.
sqlite3* SqliteConnection() {
  static thread_local std::unique_ptr<sqlite3, decltype(&sqlite3_close)> sqlite_db(
      []() {
        std::stringstream ss;
        ss << "-pid" << Env::Default().GetSelfPid() << ".db";
        const auto& opt = NodeDumpOptionsFromEnvironmentVariables();
        auto sqlite_db_prefix = opt.sqlite_db_prefix;
        auto sqlite_db_path = sqlite_db_prefix.Concat(ss.str()).ToPathString();

        sqlite3* raw_db = nullptr;
        int rc = sqlite3_open(sqlite_db_path.c_str(), &raw_db);
        // sqlite3_open normally hands back a handle even on failure, and that handle must be
        // closed. Take ownership before any check can throw.
        std::unique_ptr<sqlite3, decltype(&sqlite3_close)> db(raw_db, &sqlite3_close);
        ORT_ENFORCE(rc == SQLITE_OK, "Failed to connect to sqlite3 db ", sqlite_db_path.c_str());

        // Runs a DDL statement and throws with the given message prefix on failure.
        // The error text allocated by sqlite3_exec is released with sqlite3_free.
        auto exec_or_throw = [&db, &sqlite_db_path](const char* sql, const char* failure_prefix) {
          char* raw_error = nullptr;
          const int exec_rc = sqlite3_exec(db.get(), sql, nullptr, nullptr, &raw_error);
          std::unique_ptr<char, decltype(&sqlite3_free)> error_guard(raw_error, &sqlite3_free);
          ORT_ENFORCE(exec_rc == SQLITE_OK,
                      failure_prefix, sqlite_db_path.c_str(),
                      " on ", raw_error != nullptr ? raw_error : sqlite3_errmsg(db.get()));
        };

        const char* sql_create_tensor_table =
            "Create table if not exists Tensors ( "
            "  Step int not null, "
            "  Name text not null, "
            "  Value TensorProto, "
            "  DeviceType text, "
            "  TracedProducer NodeArg, "
            "  TracedConsumers NodeArgList, "
            "  primary key (Step, Name) "
            ");";
        exec_or_throw(sql_create_tensor_table, "Failed to create Tensors table in sqlite3 db ");

        const char* sql_create_node_table =
            "Create table if not exists Nodes ( "
            "  ExecutionCounter int, "
            "  Name text primary key not null, "
            "  OpType text not null, "
            "  ExecutionProvider text "
            ");";
        exec_or_throw(sql_create_node_table, "Failed to create Nodes table in sqlite3 db ");

        // Setup succeeded: hand the handle over to the thread_local owner.
        return db.release();
      }(),
      &sqlite3_close);

  return sqlite_db.get();
}
// Enforces that a sqlite3 API call returned SQLITE_OK. On any other return code
// ORT_ENFORCE raises with the text sqlite3_errmsg reports for the calling
// thread's connection (obtained through SqliteConnection()).
#define SQL_OK(command) \
ORT_ENFORCE((command) == SQLITE_OK, "Failed sql operation on ", sqlite3_errmsg(SqliteConnection()))
// Steps a prepared statement until it yields `sql_expected` (e.g. SQLITE_ROW or SQLITE_DONE).
//
// SQLITE_BUSY and SQLITE_LOCKED are treated as transient: the call sleeps 100 microseconds
// and retries without an upper bound, printing a warning on the first and on every
// 10000th attempt. Any other result code throws via ORT_THROW.
void SqlStepWithRetry(sqlite3_stmt* stmt, int sql_expected) {
  int attempt = 0;
  while (true) {
    int rc = sqlite3_step(stmt);
    if (rc == sql_expected) {
      return;
    }

    if (rc == SQLITE_BUSY || rc == SQLITE_LOCKED) {
      if (attempt % 10000 == 0) {
        std::cerr << "Warning: Pid " << Env::Default().GetSelfPid()
                  << " gently spinning on sql db busy or locked\n";
      }
      Env::Default().SleepForMicroseconds(100);
      attempt++;
      continue;
    }

    // sqlite3_expanded_sql returns memory obtained from sqlite3_malloc (or NULL when it
    // cannot build the string), so it must be released with sqlite3_free and must not be
    // streamed unchecked. Fall back to the unexpanded statement text when it is unavailable.
    std::unique_ptr<char, decltype(&sqlite3_free)> expanded_sql(sqlite3_expanded_sql(stmt), &sqlite3_free);
    const char* sql_text = expanded_sql ? expanded_sql.get() : sqlite3_sql(stmt);
    ORT_THROW("Failed sql step for ", sql_text != nullptr ? sql_text : "<unknown statement>",
              " on ", sqlite3_errmsg(SqliteConnection()));
  }
}
bool TensorExistsInSqlDb(const TensorMetadata& tensor_metadata) {
static thread_local std::unique_ptr<sqlite3_stmt, decltype(&sqlite3_finalize)> stmt_uptr( [](){
sqlite3 *db = SqliteConnection();
const char *sql_tensor_exists =
"select count(name) from Tensors where Name == ? and Step == ?;";
sqlite3_stmt *stmt = nullptr;
SQL_OK(sqlite3_prepare_v2(db, sql_tensor_exists, -1, &stmt, nullptr));
return stmt;
}(), &sqlite3_finalize);
sqlite3_stmt* stmt = stmt_uptr.get();
SQL_OK(sqlite3_reset(stmt));
SQL_OK(sqlite3_bind_text(stmt, 1, tensor_metadata.name.c_str(), -1, SQLITE_TRANSIENT));
SQL_OK(sqlite3_bind_int(stmt, 2, (int)tensor_metadata.step));
SqlStepWithRetry(stmt, SQLITE_ROW);
bool exists = sqlite3_column_int(stmt, 0) > 0;
SqlStepWithRetry(stmt, SQLITE_DONE);
return exists;
}
void InsertTensorInSqlDb(const Tensor& tensor, const TensorMetadata& tensor_metadata) {
static thread_local std::unique_ptr<sqlite3_stmt, decltype(&sqlite3_finalize)> stmt_uptr( [](){
sqlite3 *db = SqliteConnection();
const char *sql_insert_tensor =
"Insert into Tensors (Step, Name, Value, DeviceType, TracedProducer, TracedConsumers) "
" values (?, ?, ?, ?, \"\", \"\"); ";
sqlite3_stmt *stmt = nullptr;
SQL_OK(sqlite3_prepare_v2(db, sql_insert_tensor, -1, &stmt, nullptr));
return stmt;
}(), &sqlite3_finalize);
sqlite3_stmt* stmt = stmt_uptr.get();
SQL_OK(sqlite3_reset(stmt));
SQL_OK(sqlite3_bind_int(stmt, 1, tensor_metadata.step));
SQL_OK(sqlite3_bind_text(stmt, 2, tensor_metadata.name.c_str(), -1, SQLITE_TRANSIENT));
auto tensor_proto = utils::TensorToTensorProto(tensor, tensor_metadata.name);
std::string bytes = tensor_proto.SerializeAsString();
const char* data = bytes.data();
int size = bytes.size();
SQL_OK(sqlite3_bind_blob(stmt, 3, data, size, SQLITE_TRANSIENT));
SQL_OK(sqlite3_bind_text(stmt, 4, tensor_metadata.device_type.c_str(), -1, SQLITE_TRANSIENT));
SqlStepWithRetry(stmt, SQLITE_DONE);
}
void UpdateTensorUsageInSqlDb(const TensorMetadata& tensor_metadata) {
static thread_local std::unique_ptr<sqlite3_stmt, decltype(&sqlite3_finalize)> stmt_uptr( [](){
sqlite3 *db = SqliteConnection();
const char *sql_update_tensor =
"Update Tensors set "
" TracedProducer = TracedProducer || ?, "
" TracedConsumers = TracedConsumers || ? "
"where Name = ? and Step = ?;";
sqlite3_stmt *stmt = nullptr;
SQL_OK(sqlite3_prepare_v2(db, sql_update_tensor, -1, &stmt, nullptr));
return stmt;
}(), &sqlite3_finalize);
sqlite3_stmt* stmt = stmt_uptr.get();
SQL_OK(sqlite3_reset(stmt));
SQL_OK(sqlite3_bind_text(stmt, 1, tensor_metadata.producer.c_str(), -1, SQLITE_TRANSIENT));
SQL_OK(sqlite3_bind_text(stmt, 2, tensor_metadata.consumer.c_str(), -1, SQLITE_TRANSIENT));
SQL_OK(sqlite3_bind_text(stmt, 3, tensor_metadata.name.c_str(), -1, SQLITE_TRANSIENT));
SQL_OK(sqlite3_bind_int(stmt, 4, tensor_metadata.step));
SqlStepWithRetry(stmt, SQLITE_DONE);
}
// Records a tensor in the trace database: the row is inserted the first time the
// (Name, Step) pair is seen, and the producer/consumer trace is extended on every call.
void DumpTensorToSqliteDb(const Tensor& tensor, const TensorMetadata& tensor_metadata) {
  const bool already_recorded = TensorExistsInSqlDb(tensor_metadata);
  if (already_recorded == false) {
    InsertTensorInSqlDb(tensor, tensor_metadata);
  }

  UpdateTensorUsageInSqlDb(tensor_metadata);
}
void InsertNodePlacementToSqliteDb(const NodeDumpContext& dump_context, const Node& node) {
static thread_local std::unique_ptr<sqlite3_stmt, decltype(&sqlite3_finalize)> stmt_uptr( [](){
sqlite3 *db = SqliteConnection();
const char *sql_insert_node =
"Insert or Ignore into Nodes (ExecutionCounter, Name, OpType, ExecutionProvider) "
" values (?, ?, ?, ?);";
sqlite3_stmt *stmt = nullptr;
SQL_OK(sqlite3_prepare_v2(db, sql_insert_node, -1, &stmt, nullptr));
return stmt;
}(), &sqlite3_finalize);
sqlite3_stmt* stmt = stmt_uptr.get();
SQL_OK(sqlite3_reset(stmt));
SQL_OK(sqlite3_bind_int(stmt, 1, dump_context.program_counter));
SQL_OK(sqlite3_bind_text(stmt, 2, node.Name().c_str(), -1, SQLITE_TRANSIENT));
SQL_OK(sqlite3_bind_text(stmt, 3, node.OpType().c_str(), -1, SQLITE_TRANSIENT));
SQL_OK(sqlite3_bind_text(stmt, 4, node.GetExecutionProviderType().c_str(), -1, SQLITE_TRANSIENT));
SqlStepWithRetry(stmt, SQLITE_DONE);
}
#endif // DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB
void DumpCpuTensor(
const NodeDumpOptions& dump_options,
const Tensor& tensor, const std::string& tensor_name) {
const Tensor& tensor, const TensorMetadata& tensor_metadata) {
switch (dump_options.data_destination) {
case NodeDumpOptions::DataDestination::StdOut: {
DispatchOnTensorType(tensor.DataType(), DumpTensorToStdOut, tensor);
break;
}
case NodeDumpOptions::DataDestination::TensorProtoFiles: {
const Path tensor_file = dump_options.output_dir / Path::Parse(MakeTensorFileName(tensor_name, dump_options));
DumpTensorToFile(tensor, tensor_name, tensor_file);
const Path tensor_file = dump_options.output_dir / Path::Parse(MakeTensorFileName(tensor_metadata.name, dump_options));
DumpTensorToFile(tensor, tensor_metadata.name, tensor_file);
break;
}
case NodeDumpOptions::DataDestination::SqliteDb: {
#ifdef DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB
DumpTensorToSqliteDb(tensor, tensor_metadata);
#else
ORT_THROW("Recompile with --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=1 onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB=1");
#endif
break;
}
default:
@ -137,14 +364,15 @@ void DumpCpuTensor(
void DumpTensor(
const NodeDumpOptions& dump_options,
const Tensor& tensor, const std::string& tensor_name,
const Tensor& tensor, TensorMetadata& tensor_metadata,
const SessionState& session_state) {
// check tensor is on CPU before dumping it
auto& tensor_location = tensor.Location();
if (tensor_location.device.Type() == OrtDevice::CPU ||
tensor_location.mem_type == OrtMemTypeCPUInput ||
tensor_location.mem_type == OrtMemTypeCPUOutput) {
DumpCpuTensor(dump_options, tensor, tensor_name);
tensor_metadata.device_type = "CPU";
DumpCpuTensor(dump_options, tensor, tensor_metadata);
} else {
std::cout << tensor_location << "\n";
@ -159,7 +387,8 @@ void DumpTensor(
const auto& data_transfer_mgr = session_state.GetDataTransferMgr();
auto status = data_transfer_mgr.CopyTensor(tensor, cpu_tensor);
if (status == common::Status::OK()) {
DumpCpuTensor(dump_options, cpu_tensor, tensor_name);
tensor_metadata.device_type = "GPU";
DumpCpuTensor(dump_options, cpu_tensor, tensor_metadata);
} else {
std::cout << " failed to transfer data to cpu.\n";
}
@ -191,13 +420,25 @@ const NodeDumpOptions& NodeDumpOptionsFromEnvironmentVariables() {
if (ParseEnvironmentVariableWithDefault<bool>(env_vars::kDumpOutputData, false)) {
opts.dump_flags |= NodeDumpOptions::DumpFlags::OutputData;
}
if (ParseEnvironmentVariableWithDefault<bool>(env_vars::kDumpNodePlacement, true)) {
opts.dump_flags |= NodeDumpOptions::DumpFlags::NodePlacement;
}
opts.filter.name_pattern = Env::Default().GetEnvironmentVar(env_vars::kNameFilter);
opts.filter.op_type_pattern = Env::Default().GetEnvironmentVar(env_vars::kOpTypeFilter);
if (ParseEnvironmentVariableWithDefault<bool>(env_vars::kDumpDataToFiles, false)) {
const std::string destination = ParseEnvironmentVariableWithDefault<std::string>(
env_vars::kDumpDataDestination, "stdout");
if (destination == "files") {
opts.data_destination = NodeDumpOptions::DataDestination::TensorProtoFiles;
}
else if (destination == "sqlite") {
opts.data_destination = NodeDumpOptions::DataDestination::SqliteDb;
}
else if (destination != "stdout") {
ORT_THROW("Unsupported data destination type: ", destination);
}
if (ParseEnvironmentVariableWithDefault<bool>(env_vars::kAppendRankToFileName, false)) {
std::string rank = Env::Default().GetEnvironmentVar("OMPI_COMM_WORLD_RANK");
@ -210,6 +451,10 @@ const NodeDumpOptions& NodeDumpOptionsFromEnvironmentVariables() {
opts.output_dir = Path::Parse(ToPathString(Env::Default().GetEnvironmentVar(env_vars::kOutputDir)));
std::string sqlite_db_prefix =
ParseEnvironmentVariableWithDefault<std::string>(env_vars::kSqliteDbPrefix, "execution-trace");
opts.sqlite_db_prefix = Path::Parse(ToPathString(sqlite_db_prefix));
// check for confirmation for dumping data to files for all nodes
const bool is_input_or_output_requested = ((opts.dump_flags & NodeDumpOptions::DumpFlags::InputData) != 0) ||
((opts.dump_flags & NodeDumpOptions::DumpFlags::OutputData) != 0);
@ -241,8 +486,11 @@ static void PrintIf(bool boolean_expression, const std::string& message) {
}
void DumpNodeInputs(
const NodeDumpOptions& dump_options,
const OpKernelContext& context, const Node& node, const SessionState& session_state) {
const NodeDumpOptions& dump_options,
const NodeDumpContext& dump_context,
const OpKernelContext& context,
const Node& node,
const SessionState& session_state) {
const bool is_any_output_dumped = IsAnyOutputDumped(dump_options);
if (!is_any_output_dumped) {
return;
@ -250,14 +498,23 @@ void DumpNodeInputs(
if (!FilterNode(dump_options, node)) return;
// Node placement is reported only while dump_context.iteration == 1.
bool should_dump_node_placement = (dump_options.dump_flags & NodeDumpOptions::DumpFlags::NodePlacement) != 0;
if (dump_context.iteration == 1 && should_dump_node_placement) {
  PrintIf(should_dump_node_placement, MakeString(" Placement: ", node.GetExecutionProviderType(), "\n"));
#ifdef DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB
  // Touch the database only when it is the selected destination; otherwise a build with
  // SQLite support would create and fill a db file even for stdout/file dumps.
  if (dump_options.data_destination == NodeDumpOptions::DataDestination::SqliteDb) {
    InsertNodePlacementToSqliteDb(dump_context, node);
  }
#endif
}
std::cout << "-----------\n";
std::cout << node.OpType() << " node: " << node.Name() << "\n";
const auto& input_defs = node.InputDefs();
TensorMetadata tensor_metadata;
for (auto i = 0, end = context.InputCount(); i < end; ++i) {
if (input_defs[i]->Exists()) {
std::cout << "Input " << i << " Name: " << input_defs[i]->Name();
std::cout << "Input " << i << " Name: " << input_defs[i]->Name() << "\n";
const auto* type = context.InputType(i);
@ -270,7 +527,10 @@ void DumpNodeInputs(
PrintIf(is_shape_set, MakeString(" Shape: ", shape, "\n"));
if ((dump_options.dump_flags & NodeDumpOptions::DumpFlags::InputData) != 0) {
DumpTensor(dump_options, tensor, input_defs[i]->Name(), session_state);
tensor_metadata.name = input_defs[i]->Name();
tensor_metadata.step = dump_context.iteration;
tensor_metadata.consumer = node.Name() + ":" + std::to_string(i);
DumpTensor(dump_options, tensor, tensor_metadata, session_state);
}
} else {
std::cout << " is non-tensor type.\n";
@ -286,11 +546,19 @@ void DumpNodeInputs(
}
void DumpNodeInputs(
const OpKernelContext& context, const Node& node, const SessionState& session_state) {
DumpNodeInputs(NodeDumpOptionsFromEnvironmentVariables(), context, node, session_state);
const NodeDumpContext& dump_context,
const OpKernelContext& context,
const Node& node,
const SessionState& session_state) {
DumpNodeInputs(NodeDumpOptionsFromEnvironmentVariables(), dump_context, context, node, session_state);
}
void DumpNodeOutputs(const NodeDumpOptions& dump_options, OpKernelContext& context, const Node& node, const SessionState& session_state) {
void DumpNodeOutputs(
const NodeDumpOptions& dump_options,
const NodeDumpContext& dump_context,
OpKernelContext& context,
const Node& node,
const SessionState& session_state) {
const bool is_any_output_dumped = IsAnyOutputDumped(dump_options);
if (!is_any_output_dumped) {
return;
@ -298,12 +566,21 @@ void DumpNodeOutputs(const NodeDumpOptions& dump_options, OpKernelContext& conte
if (!FilterNode(dump_options, node)) return;
// Node placement is reported only while dump_context.iteration == 1.
bool should_dump_node_placement = (dump_options.dump_flags & NodeDumpOptions::DumpFlags::NodePlacement) != 0;
if (dump_context.iteration == 1 && should_dump_node_placement) {
  PrintIf(should_dump_node_placement, MakeString(" Placement: ", node.GetExecutionProviderType(), "\n"));
#ifdef DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB
  // Touch the database only when it is the selected destination; otherwise a build with
  // SQLite support would create and fill a db file even for stdout/file dumps.
  if (dump_options.data_destination == NodeDumpOptions::DataDestination::SqliteDb) {
    InsertNodePlacementToSqliteDb(dump_context, node);
  }
#endif
}
std::cout << "-----------\n";
const auto& output_defs = node.OutputDefs();
TensorMetadata tensor_metadata;
for (auto i = 0, end = context.OutputCount(); i < end; ++i) {
if (output_defs[i]->Exists()) {
std::cout << "Output " << i << " Name: " << output_defs[i]->Name();
std::cout << "Output " << i << " Name: " << output_defs[i]->Name() << "\n";
const auto* type = context.OutputType(i);
if (type) {
@ -315,7 +592,10 @@ void DumpNodeOutputs(const NodeDumpOptions& dump_options, OpKernelContext& conte
PrintIf(is_shape_set, MakeString(" Shape: ", shape, "\n"));
if ((dump_options.dump_flags & NodeDumpOptions::DumpFlags::OutputData) != 0) {
DumpTensor(dump_options, tensor, output_defs[i]->Name(), session_state);
tensor_metadata.name = output_defs[i]->Name();
tensor_metadata.step = dump_context.iteration;
tensor_metadata.producer = node.Name() + ":" + std::to_string(i);
DumpTensor(dump_options, tensor, tensor_metadata, session_state);
}
} else {
std::cout << " is non-tensor type.\n";
@ -333,8 +613,11 @@ void DumpNodeOutputs(const NodeDumpOptions& dump_options, OpKernelContext& conte
}
void DumpNodeOutputs(
OpKernelContext& context, const Node& node, const SessionState& session_state) {
DumpNodeOutputs(NodeDumpOptionsFromEnvironmentVariables(), context, node, session_state);
const NodeDumpContext& dump_context,
OpKernelContext& context,
const Node& node,
const SessionState& session_state) {
DumpNodeOutputs(NodeDumpOptionsFromEnvironmentVariables(), dump_context, context, node, session_state);
}
} // namespace utils

View file

@ -4,6 +4,14 @@
// to create a build with these enabled run the build script with:
// --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=1
// to enable redirect to sqlite database run the build script with:
// --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=1 onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB=1
//
// see orttraining/tools/scripts/sqldb_to_tensors.py for retrieval
//
// select data dump destination using
// ORT_DEBUG_NODE_IO_DUMP_DATA_DESTINATION= one of {stdout, files, sqlite}
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
#pragma once
@ -18,10 +26,12 @@ namespace utils {
// environment variables that control debug node dumping behavior
namespace debug_node_inputs_outputs_env_vars {
// Shape is printed by default unless it's turned OFF by setting environment
// variable ORT_DEBUG_NODE_IO_DUMP_SHAPE_DATA to 0.
// set to non-zero to dump shape data
// Tensor shape and node placement are printed by default unless they are turned OFF
// by setting the respective environment variables to 0.
// set to non-zero to dump tensor shape data
constexpr const char* kDumpShapeData = "ORT_DEBUG_NODE_IO_DUMP_SHAPE_DATA";
// set to non-zero to dump node placement data
constexpr const char* kDumpNodePlacement = "ORT_DEBUG_NODE_IO_DUMP_NODE_PLACEMENT";
// set to non-zero to dump node input data
constexpr const char* kDumpInputData = "ORT_DEBUG_NODE_IO_DUMP_INPUT_DATA";
// set to non-zero to dump node output data
@ -32,12 +42,14 @@ constexpr const char* kNameFilter = "ORT_DEBUG_NODE_IO_NAME_FILTER";
// specify a node op type filter to limit the nodes that are dumped
// see NodeDumpOptions::FilterOptions
constexpr const char* kOpTypeFilter = "ORT_DEBUG_NODE_IO_OP_TYPE_FILTER";
// set to non-zero to dump data to files instead of stdout
constexpr const char* kDumpDataToFiles = "ORT_DEBUG_NODE_IO_DUMP_DATA_TO_FILES";
// set to "stdout" or "files" or "sqlite" to select dump destination
constexpr const char* kDumpDataDestination = "ORT_DEBUG_NODE_IO_DUMP_DATA_DESTINATION";
// set to non-zero to append OpenMPI world rank to filename
constexpr const char* kAppendRankToFileName = "ORT_DEBUG_NODE_IO_APPEND_RANK_TO_FILE_NAME";
// specify the output directory for any data files produced
constexpr const char* kOutputDir = "ORT_DEBUG_NODE_IO_OUTPUT_DIR";
// specify the file prefix for sqlite3 db (process id will be appended)
constexpr const char* kSqliteDbPrefix = "ORT_DEBUG_NODE_IO_SQLITE_DB_PREFIX";
// set to non-zero to confirm that dumping data files for all nodes is acceptable
constexpr const char* kDumpingDataToFilesForAllNodesIsOk =
"ORT_DEBUG_NODE_IO_DUMPING_DATA_TO_FILES_FOR_ALL_NODES_IS_OK";
@ -51,7 +63,8 @@ struct NodeDumpOptions {
Shape = 1 << 0,
InputData = 1 << 1,
OutputData = 1 << 2,
AllData = Shape | InputData | OutputData,
NodePlacement = 1 << 3,
AllData = Shape | InputData | OutputData | NodePlacement,
};
// specifies the information to dump per node
@ -80,11 +93,22 @@ struct NodeDumpOptions {
StdOut,
// write to one file per tensor input/output as a TensorProto
TensorProtoFiles,
// write to one row per tensor input/output in Sqlite table
SqliteDb
} data_destination{DataDestination::StdOut};
std::string file_suffix;
// the output directory for dumped data files
Path output_dir;
// the sqlite3 db to append dumped data
Path sqlite_db_prefix;
};
// Position of the node currently being dumped within the overall execution.
// Passed alongside NodeDumpOptions to DumpNodeInputs / DumpNodeOutputs.
struct NodeDumpContext {
// which execution pass are we on?
// (the dump code treats the value 1 as the first pass and only reports node placement then)
size_t iteration;
// which node are we on?
// NOTE(review): presumably the node's index in execution order as tracked by the executor - confirm
size_t program_counter;
};
// gets NodeDumpOptions instance configured from environment variable values
@ -93,18 +117,30 @@ const NodeDumpOptions& NodeDumpOptionsFromEnvironmentVariables();
// dumps inputs for a node
void DumpNodeInputs(
const NodeDumpOptions& dump_options,
const OpKernelContext& context, const Node& node, const SessionState& session_state);
const NodeDumpContext& dump_context,
const OpKernelContext& context,
const Node& node,
const SessionState& session_state);
void DumpNodeInputs(
const OpKernelContext& context, const Node& node, const SessionState& session_state);
const NodeDumpContext& dump_context,
const OpKernelContext& context,
const Node& node,
const SessionState& session_state);
// dumps outputs for a node
void DumpNodeOutputs(
const NodeDumpOptions& dump_options,
OpKernelContext& context, const Node& node, const SessionState& session_state);
const NodeDumpContext& dump_context,
OpKernelContext& context,
const Node& node,
const SessionState& session_state);
void DumpNodeOutputs(
OpKernelContext& context, const Node& node, const SessionState& session_state);
const NodeDumpContext& dump_context,
OpKernelContext& context,
const Node& node,
const SessionState& session_state);
} // namespace utils
} // namespace onnxruntime

View file

@ -184,6 +184,11 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
profile::Color::Black);
#endif
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
utils::NodeDumpContext dump_context { session_state.GetGraphExecutionCounter(), 0 };
#endif
for (size_t program_counter = state_.GetProgramCounterStart();
program_counter < state_.GetProgramCounterEnd();
program_counter += 1) {
@ -287,7 +292,8 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
}
}
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
utils::DumpNodeInputs(op_kernel_context, p_op_kernel->Node(), session_state);
dump_context.program_counter = program_counter;
utils::DumpNodeInputs(dump_context, op_kernel_context, p_op_kernel->Node(), session_state);
#endif
const std::string node_name_for_profiling = [&]() -> std::string {
@ -440,7 +446,7 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
}
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
utils::DumpNodeOutputs(op_kernel_context, p_op_kernel->Node(), session_state);
utils::DumpNodeOutputs(dump_context, op_kernel_context, p_op_kernel->Node(), session_state);
#endif
// free ml-values corresponding to this node

View file

@ -186,6 +186,12 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
profile::Color::Black);
#endif
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
size_t program_counter = 0;
utils::NodeDumpContext dump_context { session_state.GetGraphExecutionCounter(), program_counter };
#endif
for (const auto& node_exec_plan : exec_plan_vec) {
if (terminate_flag_) {
LOGS(logger, WARNING) << "Exiting due to terminate flag being set to true.";
@ -271,7 +277,8 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
}
}
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
utils::DumpNodeInputs(op_kernel_context, p_op_kernel->Node(), session_state);
dump_context.program_counter = program_counter++;
utils::DumpNodeInputs(dump_context, op_kernel_context, p_op_kernel->Node(), session_state);
#endif
const std::string node_name_for_profiling = [&]() -> std::string {
@ -418,7 +425,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
}
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
utils::DumpNodeOutputs(op_kernel_context, p_op_kernel->Node(), session_state);
utils::DumpNodeOutputs(dump_context, op_kernel_context, p_op_kernel->Node(), session_state);
#endif
// free ml-values corresponding to this node

View file

@ -101,6 +101,7 @@ class SessionState {
use_deterministic_compute_(use_deterministic_compute),
enable_mem_reuse_(enable_mem_reuse),
prepacked_weights_container_(prepacked_weights_container) {
SetupAllocators();
}
@ -317,6 +318,16 @@ class SessionState {
return used_shared_pre_packed_weights_counter_;
}
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
// Advances the count of graph executions by one.
// NOTE(review): this is a plain, unsynchronized increment of a size_t - confirm that
// callers never invoke it concurrently on the same SessionState.
void IncrementGraphExecutionCounter() {
++graph_executions_counter_;
}
// Returns the current value of the graph execution counter
// (0 until IncrementGraphExecutionCounter has been called).
size_t GetGraphExecutionCounter() const {
return graph_executions_counter_;
}
#endif
private:
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SessionState);
@ -502,6 +513,11 @@ class SessionState {
// Counter for number of times a shared version of the pre-packed weight corresponding to
// a constant initialized weight was used by the session state
size_t used_shared_pre_packed_weights_counter_ = 0;
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
// Counter for number of times the session graph has been executed
size_t graph_executions_counter_ = 0;
#endif
};
} // namespace onnxruntime

View file

@ -1691,6 +1691,11 @@ Status InferenceSession::PartialRun(onnxruntime::RunOptions& run_options,
ORT_ENFORCE(session_options_.execution_mode == ExecutionMode::ORT_SEQUENTIAL, "Only sequential mode is supported.");
// execute the graph
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
if (state.GetProgramCounterStart() == 0) {
session_state_->IncrementGraphExecutionCounter();
}
#endif
ORT_CHECK_AND_SET_RETVAL(utils::ExecutePartialGraph(*session_state_, feeds_fetches_manager, feeds, fetches,
run_logger, state, cache));
}
@ -1801,6 +1806,9 @@ Status InferenceSession::Run(const RunOptions& run_options,
#endif
// execute the graph
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
session_state_->IncrementGraphExecutionCounter();
#endif
ORT_CHECK_AND_SET_RETVAL(utils::ExecuteGraph(*session_state_, feeds_fetches_manager, feeds, *p_fetches,
session_options_.execution_mode, run_options.terminate, run_logger,
run_options.only_execute_path_to_fetches));

View file

@ -43,7 +43,7 @@ TEST(DebugNodeInputsOutputs, BasicFileOutput) {
{env_vars::kDumpOutputData, "1"},
{env_vars::kNameFilter, nullopt},
{env_vars::kOpTypeFilter, nullopt},
{env_vars::kDumpDataToFiles, "1"},
{env_vars::kDumpDataDestination, "files"},
{env_vars::kAppendRankToFileName, nullopt},
{env_vars::kOutputDir, ToMBString(temp_dir.Path())},
{env_vars::kDumpingDataToFilesForAllNodesIsOk, "1"},

View file

@ -0,0 +1,19 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import sqlite3
import onnx
from onnx import numpy_helper
import sys

# Path of the database written by the tracing build. Pass it as the first
# command-line argument; without one the original placeholder is used, which
# must then be edited by hand.
db_path = sys.argv[1] if len(sys.argv) > 1 else '<path-to-sqldb-from-tracing>'


def convert_tensor_proto_to_numpy_array(blob):
    """Parse a serialized TensorProto (the Value column) and return it as a numpy array."""
    tensor_proto = onnx.TensorProto()
    tensor_proto.ParseFromString(blob)
    return numpy_helper.to_array(tensor_proto)


# With PARSE_DECLTYPES, columns declared as "TensorProto" are passed through
# the converter automatically when rows are fetched.
sqlite3.register_converter("TensorProto", convert_tensor_proto_to_numpy_array)

connection = sqlite3.connect(db_path, detect_types=sqlite3.PARSE_DECLTYPES)
try:
    for step, name, value, device, producer, consumers in connection.execute(
            'Select Step, Name, Value, DeviceType, TracedProducer, TracedConsumers from Tensors'):
        print(step, name, value.shape, consumers)
finally:
    # release the database handle even when a row fails to convert
    connection.close()