mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-03 03:58:54 +00:00
Extend node debugging utilities to push tensors and node placement to SQL database (#8672)
* adding support for tracing to sqldb instead of files * use compiled statements * script to pull tensors from db * link sqlite3 * remove node info redundant with onnx graph * addressing PR comments * address PR comments and include program counter * third party notice * use find_package * add to cgmanifests.json * address thread safety and add pid suffix * build fi * python script to select on devicetype * remove unpopulated and redundant Shape and Type fields * comment * comment * PR comments * add graph execution counter to session state * move increment to inference session * std::endl to \n * ifdef on graph execution counter * add ifdef to inference session * move DEBUG_NODE_INPUTS_OUTPUTS to CMakeLists.txt
This commit is contained in:
parent
4666a49106
commit
9fa0d8392a
13 changed files with 463 additions and 37 deletions
|
|
@ -4747,3 +4747,31 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
_____
|
||||
|
||||
SQLite Is Public Domain
|
||||
|
||||
All of the code and documentation in SQLite has been dedicated to the public
|
||||
domain by the authors. All code authors, and representatives of the companies
|
||||
they work for, have signed affidavits dedicating their contributions to the
|
||||
public domain and originals of those signed affidavits are stored in a firesafe
|
||||
at the main offices of Hwaci. Anyone is free to copy, modify, publish, use,
|
||||
compile, sell, or distribute the original SQLite code, either in source code
|
||||
form or as a compiled binary, for any purpose, commercial or non-commercial,
|
||||
and by any means.
|
||||
|
||||
The previous paragraph applies to the deliverable code and documentation in
|
||||
SQLite - those parts of the SQLite library that you actually bundle and ship
|
||||
with a larger application. Some scripts used as part of the build process (for
|
||||
example the "configure" scripts generated by autoconf) might fall under other
|
||||
open-source licenses. Nothing from these build scripts ever reaches the final
|
||||
deliverable SQLite library, however, and so the licenses associated with those
|
||||
scripts should not be a factor in assessing your rights to copy and use the
|
||||
SQLite library.
|
||||
|
||||
All of the deliverable code in SQLite has been written from scratch. No code
|
||||
has been taken from other projects or from the open internet. Every line of
|
||||
code can be traced back to its original author, and all of those authors have
|
||||
public domain dedications on file. So the SQLite code base is clean and is
|
||||
uncontaminated with licensed code from other projects.
|
||||
|
|
|
|||
|
|
@ -469,7 +469,17 @@
|
|||
},
|
||||
"comments": "dlpack"
|
||||
}
|
||||
},
|
||||
{
|
||||
"component": {
|
||||
"Type": "other",
|
||||
"Other": {
|
||||
"Name": "SQLite3",
|
||||
"Version": "3.22.0",
|
||||
"DownloadUrl": "http://security.ubuntu.com/ubuntu/pool/main/s/sqlite3/libsqlite3-dev_3.22.0-1ubuntu0.4_amd64.deb"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"Version": 1
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -83,6 +83,7 @@ cmake_dependent_option(onnxruntime_USE_FULL_PROTOBUF "Link to libprotobuf instea
|
|||
option(tensorflow_C_PACKAGE_PATH "Path to tensorflow C package installation dir")
|
||||
option(onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS "Enable operator implemented in language other than cpp" OFF)
|
||||
option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS "Dump debug information about node inputs and outputs when executing the model." OFF)
|
||||
cmake_dependent_option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB "Build dump debug information about node inputs and outputs with support for sql database." OFF "onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS" OFF)
|
||||
option(onnxruntime_USE_DML "Build with DirectML support" OFF)
|
||||
option(onnxruntime_USE_MIGRAPHX "Build with AMDMIGraphX support" OFF)
|
||||
option(onnxruntime_USE_WINML "Build with WinML support" OFF)
|
||||
|
|
@ -1835,3 +1836,8 @@ if (onnxruntime_BUILD_OPSCHEMA_LIB AND onnxruntime_ENABLE_TRAINING)
|
|||
# opschema library requires training ops as well
|
||||
include(onnxruntime_opschema_lib.cmake)
|
||||
endif()
|
||||
|
||||
if (onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS)
|
||||
add_compile_definitions(DEBUG_NODE_INPUTS_OUTPUTS)
|
||||
endif()
|
||||
|
||||
|
|
|
|||
|
|
@ -75,9 +75,15 @@ if (UNIX AND NOT APPLE AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_BUI
|
|||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath='$ORIGIN'")
|
||||
endif()
|
||||
|
||||
if (onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS)
|
||||
target_compile_definitions(onnxruntime_framework PRIVATE DEBUG_NODE_INPUTS_OUTPUTS)
|
||||
# Optional SQLite sink for the node input/output debug dump.
# When enabled, onnxruntime_framework is built against the system SQLite3 and
# compiled with DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB.
if (onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB)
  find_package(SQLite3)
  # SQLite3_FOUND (exact case) is the result variable documented by CMake's
  # FindSQLite3 module; the all-caps spelling is only a compatibility alias.
  if (NOT SQLite3_FOUND)
    message(FATAL_ERROR "Could not locate SQLite3 package.")
  endif()

  # Scope the SQLite headers to the target that includes <sqlite3.h> instead of
  # adding them to every target in this directory.
  # NOTE(review): assumes only onnxruntime_framework sources include sqlite3.h - confirm.
  target_include_directories(onnxruntime_framework PRIVATE ${SQLite3_INCLUDE_DIRS})

  # The keyword-less signature is kept as in the original call: it links
  # transitively, and mixing it with a keyword signature on the same target
  # is a CMake error.
  target_link_libraries(onnxruntime_framework ${SQLite3_LIBRARIES})

  target_compile_definitions(onnxruntime_framework PRIVATE DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB)
endif()
|
||||
|
||||
|
||||
install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/framework DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core)
|
||||
|
|
|
|||
|
|
@ -40,6 +40,7 @@ endif()
|
|||
if (onnxruntime_ENABLE_TRAINING OR onnxruntime_ENABLE_TRAINING_OPS)
|
||||
target_include_directories(onnxruntime_session PRIVATE ${ORTTRAINING_ROOT})
|
||||
endif()
|
||||
|
||||
if (onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
|
||||
onnxruntime_add_include_to_target(onnxruntime_session Python::Module)
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -7,6 +7,11 @@
|
|||
|
||||
#include <iomanip>
|
||||
#include <cctype>
|
||||
#include <string>
|
||||
|
||||
#ifdef DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB
|
||||
#include <sqlite3.h>
|
||||
#endif
|
||||
|
||||
#include "core/common/path_utils.h"
|
||||
#include "core/framework/tensorprotoutils.h"
|
||||
|
|
@ -18,6 +23,15 @@ namespace utils {
|
|||
|
||||
namespace {
|
||||
|
||||
// Describes one dumped tensor: its name, where it was produced/consumed and
// on which device type it was found. Filled in incrementally by the dump
// routines before being written to the selected destination.
struct TensorMetadata {
  // tensor (NodeArg) name
  std::string name;
  // "<node name>:<output index>" of the node that produced the tensor, if known
  std::string producer;
  // "<node name>:<input index>" of the node that consumed the tensor, if known
  std::string consumer;
  // device type label recorded alongside the tensor
  std::string device_type;
  // graph execution pass the tensor belongs to; zero-initialized so a
  // default-constructed instance never carries an indeterminate value
  size_t step{0};
};
|
||||
|
||||
bool FilterNode(const NodeDumpOptions& dump_options, const Node& node) {
|
||||
auto match_pattern =
|
||||
[](const std::string& value, const std::string& delimited_patterns) {
|
||||
|
|
@ -117,17 +131,230 @@ void DumpTensorToFile(const Tensor& tensor, const std::string& tensor_name, cons
|
|||
ORT_THROW_IF_ERROR(Env::Default().FileClose(output_fd));
|
||||
}
|
||||
|
||||
#ifdef DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB
|
||||
// Returns the calling thread's connection to the node-dump SQLite database.
//
// The connection is created lazily, once per thread. The database file name is
// the configured sqlite_db_prefix followed by "-pid<process id>.db". On first
// use the Tensors and Nodes tables are created if they do not exist yet.
//
// Throws if the database cannot be opened or a table cannot be created; in
// both cases the SQLite handle and any SQLite-allocated error text are
// released before throwing.
sqlite3* SqliteConnection() {
  static thread_local std::unique_ptr<sqlite3, decltype(&sqlite3_close)> sqlite_db(
      []() {
        std::stringstream ss;
        ss << "-pid" << Env::Default().GetSelfPid() << ".db";
        const auto& opt = NodeDumpOptionsFromEnvironmentVariables();
        auto sqlite_db_prefix = opt.sqlite_db_prefix;
        auto sqlite_db_path = sqlite_db_prefix.Concat(ss.str()).ToPathString();

        sqlite3* db = nullptr;
        int rc = sqlite3_open(sqlite_db_path.c_str(), &db);
        if (rc != SQLITE_OK) {
          // sqlite3_open normally hands back a handle even when it fails;
          // it has to be closed or it leaks. sqlite3_close(nullptr) is a no-op.
          sqlite3_close(db);
          db = nullptr;
        }
        ORT_ENFORCE(rc == SQLITE_OK, "Failed to connect to sqlite3 db ", sqlite_db_path.c_str());

        // Runs one "create table" statement. On failure copies SQLite's error
        // text, frees the SQLite-owned buffer and the handle, then throws.
        auto create_table = [&](const char* sql, const char* table_name) {
          char* error_message = nullptr;
          const int exec_rc = sqlite3_exec(db, sql, nullptr, nullptr, &error_message);
          if (exec_rc != SQLITE_OK) {
            const std::string reason = error_message != nullptr ? error_message : "unknown error";
            sqlite3_free(error_message);
            sqlite3_close(db);
            ORT_THROW("Failed to create ", table_name, " table in sqlite3 db ", sqlite_db_path.c_str(),
                      " on ", reason);
          }
        };

        const char* sql_create_tensor_table =
            "Create table if not exists Tensors ( "
            "  Step int not null, "
            "  Name text not null, "
            "  Value TensorProto, "
            "  DeviceType text, "
            "  TracedProducer NodeArg, "
            "  TracedConsumers NodeArgList, "
            "  primary key (Step, Name) "
            ");";
        create_table(sql_create_tensor_table, "Tensors");

        const char* sql_create_node_table =
            "Create table if not exists Nodes ( "
            "  ExecutionCounter int, "
            "  Name text primary key not null, "
            "  OpType text not null, "
            "  ExecutionProvider text "
            ");";
        create_table(sql_create_node_table, "Nodes");

        return db;
      }(),
      &sqlite3_close);

  return sqlite_db.get();
}
|
||||
|
||||
// Evaluates `command` (an expression yielding a SQLite result code) and
// enforces that it equals SQLITE_OK; on failure the error text is taken from
// sqlite3_errmsg() on the calling thread's connection.
#define SQL_OK(command) \
  ORT_ENFORCE((command) == SQLITE_OK, "Failed sql operation on ", sqlite3_errmsg(SqliteConnection()))
|
||||
|
||||
// Steps `stmt` until sqlite3_step returns `sql_expected` (e.g. SQLITE_ROW or
// SQLITE_DONE).
//
// While the database reports SQLITE_BUSY or SQLITE_LOCKED the call sleeps
// 100 microseconds and retries, printing a warning on the first and every
// 10000th attempt. Any other result code throws, with the expanded SQL text
// and SQLite's error message in the exception text.
void SqlStepWithRetry(sqlite3_stmt* stmt, int sql_expected) {
  int attempt = 0;
  while (true) {
    int rc = sqlite3_step(stmt);
    if (rc == sql_expected) {
      return;
    }

    if (rc == SQLITE_BUSY || rc == SQLITE_LOCKED) {
      if (attempt % 10000 == 0) {
        std::cerr << "Warning: Pid " << Env::Default().GetSelfPid()
                  << " gently spinning on sql db busy or locked\n";
      }
      Env::Default().SleepForMicroseconds(100);
      attempt++;
      continue;
    }

    // sqlite3_expanded_sql returns a buffer obtained from sqlite3_malloc (or
    // NULL if it could not be built). Copy it and release it before throwing
    // so the buffer is not leaked and a null pointer is never streamed.
    char* expanded_sql = sqlite3_expanded_sql(stmt);
    const std::string sql_text = expanded_sql != nullptr ? expanded_sql : "<sql text unavailable>";
    sqlite3_free(expanded_sql);

    ORT_THROW("Failed sql step for ", sql_text, " on ", sqlite3_errmsg(SqliteConnection()));
  }
}
|
||||
|
||||
bool TensorExistsInSqlDb(const TensorMetadata& tensor_metadata) {
|
||||
|
||||
static thread_local std::unique_ptr<sqlite3_stmt, decltype(&sqlite3_finalize)> stmt_uptr( [](){
|
||||
|
||||
sqlite3 *db = SqliteConnection();
|
||||
|
||||
const char *sql_tensor_exists =
|
||||
"select count(name) from Tensors where Name == ? and Step == ?;";
|
||||
|
||||
sqlite3_stmt *stmt = nullptr;
|
||||
SQL_OK(sqlite3_prepare_v2(db, sql_tensor_exists, -1, &stmt, nullptr));
|
||||
|
||||
return stmt;
|
||||
}(), &sqlite3_finalize);
|
||||
|
||||
sqlite3_stmt* stmt = stmt_uptr.get();
|
||||
|
||||
SQL_OK(sqlite3_reset(stmt));
|
||||
SQL_OK(sqlite3_bind_text(stmt, 1, tensor_metadata.name.c_str(), -1, SQLITE_TRANSIENT));
|
||||
SQL_OK(sqlite3_bind_int(stmt, 2, (int)tensor_metadata.step));
|
||||
SqlStepWithRetry(stmt, SQLITE_ROW);
|
||||
bool exists = sqlite3_column_int(stmt, 0) > 0;
|
||||
SqlStepWithRetry(stmt, SQLITE_DONE);
|
||||
|
||||
return exists;
|
||||
}
|
||||
|
||||
void InsertTensorInSqlDb(const Tensor& tensor, const TensorMetadata& tensor_metadata) {
|
||||
|
||||
static thread_local std::unique_ptr<sqlite3_stmt, decltype(&sqlite3_finalize)> stmt_uptr( [](){
|
||||
|
||||
sqlite3 *db = SqliteConnection();
|
||||
|
||||
const char *sql_insert_tensor =
|
||||
"Insert into Tensors (Step, Name, Value, DeviceType, TracedProducer, TracedConsumers) "
|
||||
" values (?, ?, ?, ?, \"\", \"\"); ";
|
||||
|
||||
sqlite3_stmt *stmt = nullptr;
|
||||
SQL_OK(sqlite3_prepare_v2(db, sql_insert_tensor, -1, &stmt, nullptr));
|
||||
|
||||
return stmt;
|
||||
}(), &sqlite3_finalize);
|
||||
|
||||
sqlite3_stmt* stmt = stmt_uptr.get();
|
||||
|
||||
SQL_OK(sqlite3_reset(stmt));
|
||||
SQL_OK(sqlite3_bind_int(stmt, 1, tensor_metadata.step));
|
||||
SQL_OK(sqlite3_bind_text(stmt, 2, tensor_metadata.name.c_str(), -1, SQLITE_TRANSIENT));
|
||||
|
||||
auto tensor_proto = utils::TensorToTensorProto(tensor, tensor_metadata.name);
|
||||
std::string bytes = tensor_proto.SerializeAsString();
|
||||
const char* data = bytes.data();
|
||||
int size = bytes.size();
|
||||
|
||||
SQL_OK(sqlite3_bind_blob(stmt, 3, data, size, SQLITE_TRANSIENT));
|
||||
SQL_OK(sqlite3_bind_text(stmt, 4, tensor_metadata.device_type.c_str(), -1, SQLITE_TRANSIENT));
|
||||
|
||||
SqlStepWithRetry(stmt, SQLITE_DONE);
|
||||
}
|
||||
|
||||
void UpdateTensorUsageInSqlDb(const TensorMetadata& tensor_metadata) {
|
||||
|
||||
static thread_local std::unique_ptr<sqlite3_stmt, decltype(&sqlite3_finalize)> stmt_uptr( [](){
|
||||
|
||||
sqlite3 *db = SqliteConnection();
|
||||
|
||||
const char *sql_update_tensor =
|
||||
"Update Tensors set "
|
||||
" TracedProducer = TracedProducer || ?, "
|
||||
" TracedConsumers = TracedConsumers || ? "
|
||||
"where Name = ? and Step = ?;";
|
||||
|
||||
sqlite3_stmt *stmt = nullptr;
|
||||
SQL_OK(sqlite3_prepare_v2(db, sql_update_tensor, -1, &stmt, nullptr));
|
||||
|
||||
return stmt;
|
||||
}(), &sqlite3_finalize);
|
||||
|
||||
sqlite3_stmt* stmt = stmt_uptr.get();
|
||||
|
||||
SQL_OK(sqlite3_reset(stmt));
|
||||
SQL_OK(sqlite3_bind_text(stmt, 1, tensor_metadata.producer.c_str(), -1, SQLITE_TRANSIENT));
|
||||
SQL_OK(sqlite3_bind_text(stmt, 2, tensor_metadata.consumer.c_str(), -1, SQLITE_TRANSIENT));
|
||||
SQL_OK(sqlite3_bind_text(stmt, 3, tensor_metadata.name.c_str(), -1, SQLITE_TRANSIENT));
|
||||
SQL_OK(sqlite3_bind_int(stmt, 4, tensor_metadata.step));
|
||||
|
||||
SqlStepWithRetry(stmt, SQLITE_DONE);
|
||||
}
|
||||
|
||||
void DumpTensorToSqliteDb(const Tensor& tensor, const TensorMetadata& tensor_metadata) {
|
||||
|
||||
if (!TensorExistsInSqlDb(tensor_metadata)) {
|
||||
InsertTensorInSqlDb(tensor, tensor_metadata);
|
||||
}
|
||||
|
||||
UpdateTensorUsageInSqlDb(tensor_metadata);
|
||||
}
|
||||
|
||||
void InsertNodePlacementToSqliteDb(const NodeDumpContext& dump_context, const Node& node) {
|
||||
|
||||
static thread_local std::unique_ptr<sqlite3_stmt, decltype(&sqlite3_finalize)> stmt_uptr( [](){
|
||||
|
||||
sqlite3 *db = SqliteConnection();
|
||||
|
||||
const char *sql_insert_node =
|
||||
"Insert or Ignore into Nodes (ExecutionCounter, Name, OpType, ExecutionProvider) "
|
||||
" values (?, ?, ?, ?);";
|
||||
|
||||
sqlite3_stmt *stmt = nullptr;
|
||||
SQL_OK(sqlite3_prepare_v2(db, sql_insert_node, -1, &stmt, nullptr));
|
||||
|
||||
return stmt;
|
||||
}(), &sqlite3_finalize);
|
||||
|
||||
sqlite3_stmt* stmt = stmt_uptr.get();
|
||||
|
||||
SQL_OK(sqlite3_reset(stmt));
|
||||
SQL_OK(sqlite3_bind_int(stmt, 1, dump_context.program_counter));
|
||||
SQL_OK(sqlite3_bind_text(stmt, 2, node.Name().c_str(), -1, SQLITE_TRANSIENT));
|
||||
SQL_OK(sqlite3_bind_text(stmt, 3, node.OpType().c_str(), -1, SQLITE_TRANSIENT));
|
||||
SQL_OK(sqlite3_bind_text(stmt, 4, node.GetExecutionProviderType().c_str(), -1, SQLITE_TRANSIENT));
|
||||
|
||||
SqlStepWithRetry(stmt, SQLITE_DONE);
|
||||
}
|
||||
#endif // DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB
|
||||
|
||||
void DumpCpuTensor(
|
||||
const NodeDumpOptions& dump_options,
|
||||
const Tensor& tensor, const std::string& tensor_name) {
|
||||
const Tensor& tensor, const TensorMetadata& tensor_metadata) {
|
||||
switch (dump_options.data_destination) {
|
||||
case NodeDumpOptions::DataDestination::StdOut: {
|
||||
DispatchOnTensorType(tensor.DataType(), DumpTensorToStdOut, tensor);
|
||||
break;
|
||||
}
|
||||
case NodeDumpOptions::DataDestination::TensorProtoFiles: {
|
||||
const Path tensor_file = dump_options.output_dir / Path::Parse(MakeTensorFileName(tensor_name, dump_options));
|
||||
DumpTensorToFile(tensor, tensor_name, tensor_file);
|
||||
const Path tensor_file = dump_options.output_dir / Path::Parse(MakeTensorFileName(tensor_metadata.name, dump_options));
|
||||
DumpTensorToFile(tensor, tensor_metadata.name, tensor_file);
|
||||
break;
|
||||
}
|
||||
case NodeDumpOptions::DataDestination::SqliteDb: {
|
||||
#ifdef DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB
|
||||
DumpTensorToSqliteDb(tensor, tensor_metadata);
|
||||
#else
|
||||
ORT_THROW("Recompile with --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=1 onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB=1");
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
default:
|
||||
|
|
@ -137,14 +364,15 @@ void DumpCpuTensor(
|
|||
|
||||
void DumpTensor(
|
||||
const NodeDumpOptions& dump_options,
|
||||
const Tensor& tensor, const std::string& tensor_name,
|
||||
const Tensor& tensor, TensorMetadata& tensor_metadata,
|
||||
const SessionState& session_state) {
|
||||
// check tensor is on CPU before dumping it
|
||||
auto& tensor_location = tensor.Location();
|
||||
if (tensor_location.device.Type() == OrtDevice::CPU ||
|
||||
tensor_location.mem_type == OrtMemTypeCPUInput ||
|
||||
tensor_location.mem_type == OrtMemTypeCPUOutput) {
|
||||
DumpCpuTensor(dump_options, tensor, tensor_name);
|
||||
tensor_metadata.device_type = "CPU";
|
||||
DumpCpuTensor(dump_options, tensor, tensor_metadata);
|
||||
} else {
|
||||
std::cout << tensor_location << "\n";
|
||||
|
||||
|
|
@ -159,7 +387,8 @@ void DumpTensor(
|
|||
const auto& data_transfer_mgr = session_state.GetDataTransferMgr();
|
||||
auto status = data_transfer_mgr.CopyTensor(tensor, cpu_tensor);
|
||||
if (status == common::Status::OK()) {
|
||||
DumpCpuTensor(dump_options, cpu_tensor, tensor_name);
|
||||
tensor_metadata.device_type = "GPU";
|
||||
DumpCpuTensor(dump_options, cpu_tensor, tensor_metadata);
|
||||
} else {
|
||||
std::cout << " failed to transfer data to cpu.\n";
|
||||
}
|
||||
|
|
@ -191,13 +420,25 @@ const NodeDumpOptions& NodeDumpOptionsFromEnvironmentVariables() {
|
|||
if (ParseEnvironmentVariableWithDefault<bool>(env_vars::kDumpOutputData, false)) {
|
||||
opts.dump_flags |= NodeDumpOptions::DumpFlags::OutputData;
|
||||
}
|
||||
if (ParseEnvironmentVariableWithDefault<bool>(env_vars::kDumpNodePlacement, true)) {
|
||||
opts.dump_flags |= NodeDumpOptions::DumpFlags::NodePlacement;
|
||||
}
|
||||
|
||||
opts.filter.name_pattern = Env::Default().GetEnvironmentVar(env_vars::kNameFilter);
|
||||
opts.filter.op_type_pattern = Env::Default().GetEnvironmentVar(env_vars::kOpTypeFilter);
|
||||
|
||||
if (ParseEnvironmentVariableWithDefault<bool>(env_vars::kDumpDataToFiles, false)) {
|
||||
const std::string destination = ParseEnvironmentVariableWithDefault<std::string>(
|
||||
env_vars::kDumpDataDestination, "stdout");
|
||||
|
||||
if (destination == "files") {
|
||||
opts.data_destination = NodeDumpOptions::DataDestination::TensorProtoFiles;
|
||||
}
|
||||
else if (destination == "sqlite") {
|
||||
opts.data_destination = NodeDumpOptions::DataDestination::SqliteDb;
|
||||
}
|
||||
else if (destination != "stdout") {
|
||||
ORT_THROW("Unsupported data destination type: ", destination);
|
||||
}
|
||||
|
||||
if (ParseEnvironmentVariableWithDefault<bool>(env_vars::kAppendRankToFileName, false)) {
|
||||
std::string rank = Env::Default().GetEnvironmentVar("OMPI_COMM_WORLD_RANK");
|
||||
|
|
@ -210,6 +451,10 @@ const NodeDumpOptions& NodeDumpOptionsFromEnvironmentVariables() {
|
|||
|
||||
opts.output_dir = Path::Parse(ToPathString(Env::Default().GetEnvironmentVar(env_vars::kOutputDir)));
|
||||
|
||||
std::string sqlite_db_prefix =
|
||||
ParseEnvironmentVariableWithDefault<std::string>(env_vars::kSqliteDbPrefix, "execution-trace");
|
||||
opts.sqlite_db_prefix = Path::Parse(ToPathString(sqlite_db_prefix));
|
||||
|
||||
// check for confirmation for dumping data to files for all nodes
|
||||
const bool is_input_or_output_requested = ((opts.dump_flags & NodeDumpOptions::DumpFlags::InputData) != 0) ||
|
||||
((opts.dump_flags & NodeDumpOptions::DumpFlags::OutputData) != 0);
|
||||
|
|
@ -241,8 +486,11 @@ static void PrintIf(bool boolean_expression, const std::string& message) {
|
|||
}
|
||||
|
||||
void DumpNodeInputs(
|
||||
const NodeDumpOptions& dump_options,
|
||||
const OpKernelContext& context, const Node& node, const SessionState& session_state) {
|
||||
const NodeDumpOptions& dump_options,
|
||||
const NodeDumpContext& dump_context,
|
||||
const OpKernelContext& context,
|
||||
const Node& node,
|
||||
const SessionState& session_state) {
|
||||
const bool is_any_output_dumped = IsAnyOutputDumped(dump_options);
|
||||
if (!is_any_output_dumped) {
|
||||
return;
|
||||
|
|
@ -250,14 +498,23 @@ void DumpNodeInputs(
|
|||
|
||||
if (!FilterNode(dump_options, node)) return;
|
||||
|
||||
bool should_dump_node_placement = (dump_options.dump_flags & NodeDumpOptions::DumpFlags::NodePlacement) != 0;
|
||||
if (dump_context.iteration == 1 && should_dump_node_placement) {
|
||||
PrintIf(should_dump_node_placement, MakeString(" Placement: ", node.GetExecutionProviderType(), "\n"));
|
||||
#ifdef DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB
|
||||
InsertNodePlacementToSqliteDb(dump_context, node);
|
||||
#endif
|
||||
}
|
||||
|
||||
std::cout << "-----------\n";
|
||||
std::cout << node.OpType() << " node: " << node.Name() << "\n";
|
||||
|
||||
const auto& input_defs = node.InputDefs();
|
||||
TensorMetadata tensor_metadata;
|
||||
|
||||
for (auto i = 0, end = context.InputCount(); i < end; ++i) {
|
||||
if (input_defs[i]->Exists()) {
|
||||
std::cout << "Input " << i << " Name: " << input_defs[i]->Name();
|
||||
std::cout << "Input " << i << " Name: " << input_defs[i]->Name() << "\n";
|
||||
|
||||
const auto* type = context.InputType(i);
|
||||
|
||||
|
|
@ -270,7 +527,10 @@ void DumpNodeInputs(
|
|||
PrintIf(is_shape_set, MakeString(" Shape: ", shape, "\n"));
|
||||
|
||||
if ((dump_options.dump_flags & NodeDumpOptions::DumpFlags::InputData) != 0) {
|
||||
DumpTensor(dump_options, tensor, input_defs[i]->Name(), session_state);
|
||||
tensor_metadata.name = input_defs[i]->Name();
|
||||
tensor_metadata.step = dump_context.iteration;
|
||||
tensor_metadata.consumer = node.Name() + ":" + std::to_string(i);
|
||||
DumpTensor(dump_options, tensor, tensor_metadata, session_state);
|
||||
}
|
||||
} else {
|
||||
std::cout << " is non-tensor type.\n";
|
||||
|
|
@ -286,11 +546,19 @@ void DumpNodeInputs(
|
|||
}
|
||||
|
||||
void DumpNodeInputs(
|
||||
const OpKernelContext& context, const Node& node, const SessionState& session_state) {
|
||||
DumpNodeInputs(NodeDumpOptionsFromEnvironmentVariables(), context, node, session_state);
|
||||
const NodeDumpContext& dump_context,
|
||||
const OpKernelContext& context,
|
||||
const Node& node,
|
||||
const SessionState& session_state) {
|
||||
DumpNodeInputs(NodeDumpOptionsFromEnvironmentVariables(), dump_context, context, node, session_state);
|
||||
}
|
||||
|
||||
void DumpNodeOutputs(const NodeDumpOptions& dump_options, OpKernelContext& context, const Node& node, const SessionState& session_state) {
|
||||
void DumpNodeOutputs(
|
||||
const NodeDumpOptions& dump_options,
|
||||
const NodeDumpContext& dump_context,
|
||||
OpKernelContext& context,
|
||||
const Node& node,
|
||||
const SessionState& session_state) {
|
||||
const bool is_any_output_dumped = IsAnyOutputDumped(dump_options);
|
||||
if (!is_any_output_dumped) {
|
||||
return;
|
||||
|
|
@ -298,12 +566,21 @@ void DumpNodeOutputs(const NodeDumpOptions& dump_options, OpKernelContext& conte
|
|||
|
||||
if (!FilterNode(dump_options, node)) return;
|
||||
|
||||
bool should_dump_node_placement = (dump_options.dump_flags & NodeDumpOptions::DumpFlags::NodePlacement) != 0;
|
||||
if (dump_context.iteration == 1 && should_dump_node_placement) {
|
||||
PrintIf(should_dump_node_placement, MakeString(" Placement: ", node.GetExecutionProviderType(), "\n"));
|
||||
#ifdef DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB
|
||||
InsertNodePlacementToSqliteDb(dump_context, node);
|
||||
#endif
|
||||
}
|
||||
|
||||
std::cout << "-----------\n";
|
||||
const auto& output_defs = node.OutputDefs();
|
||||
TensorMetadata tensor_metadata;
|
||||
|
||||
for (auto i = 0, end = context.OutputCount(); i < end; ++i) {
|
||||
if (output_defs[i]->Exists()) {
|
||||
std::cout << "Output " << i << " Name: " << output_defs[i]->Name();
|
||||
std::cout << "Output " << i << " Name: " << output_defs[i]->Name() << "\n";
|
||||
|
||||
const auto* type = context.OutputType(i);
|
||||
if (type) {
|
||||
|
|
@ -315,7 +592,10 @@ void DumpNodeOutputs(const NodeDumpOptions& dump_options, OpKernelContext& conte
|
|||
PrintIf(is_shape_set, MakeString(" Shape: ", shape, "\n"));
|
||||
|
||||
if ((dump_options.dump_flags & NodeDumpOptions::DumpFlags::OutputData) != 0) {
|
||||
DumpTensor(dump_options, tensor, output_defs[i]->Name(), session_state);
|
||||
tensor_metadata.name = output_defs[i]->Name();
|
||||
tensor_metadata.step = dump_context.iteration;
|
||||
tensor_metadata.producer = node.Name() + ":" + std::to_string(i);
|
||||
DumpTensor(dump_options, tensor, tensor_metadata, session_state);
|
||||
}
|
||||
} else {
|
||||
std::cout << " is non-tensor type.\n";
|
||||
|
|
@ -333,8 +613,11 @@ void DumpNodeOutputs(const NodeDumpOptions& dump_options, OpKernelContext& conte
|
|||
}
|
||||
|
||||
void DumpNodeOutputs(
|
||||
OpKernelContext& context, const Node& node, const SessionState& session_state) {
|
||||
DumpNodeOutputs(NodeDumpOptionsFromEnvironmentVariables(), context, node, session_state);
|
||||
const NodeDumpContext& dump_context,
|
||||
OpKernelContext& context,
|
||||
const Node& node,
|
||||
const SessionState& session_state) {
|
||||
DumpNodeOutputs(NodeDumpOptionsFromEnvironmentVariables(), dump_context, context, node, session_state);
|
||||
}
|
||||
|
||||
} // namespace utils
|
||||
|
|
|
|||
|
|
@ -4,6 +4,14 @@
|
|||
// to create a build with these enabled run the build script with:
|
||||
// --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=1
|
||||
|
||||
// to enable redirect to sqlite database run the build script with:
|
||||
// --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=1 onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB=1
|
||||
//
|
||||
// see orttraining/tools/scripts/sqldb_to_tensors.py for retrieval
|
||||
//
|
||||
// select data dump destination using
|
||||
// ORT_DEBUG_NODE_IO_DUMP_DATA_DESTINATION= one of {stdout, files, sqlite}
|
||||
|
||||
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
|
||||
|
||||
#pragma once
|
||||
|
|
@ -18,10 +26,12 @@ namespace utils {
|
|||
|
||||
// environment variables that control debug node dumping behavior
|
||||
namespace debug_node_inputs_outputs_env_vars {
|
||||
// Shape is printed by default unless it's turned OFF by setting environment
|
||||
// variable ORT_DEBUG_NODE_IO_DUMP_SHAPE_DATA to 0.
|
||||
// set to non-zero to dump shape data
|
||||
// Tensor shape and Node placement is printed by default unless it's turned OFF
|
||||
// by setting the respective environment variables to 0
|
||||
// set to non-zero to dump tensor shape data
|
||||
constexpr const char* kDumpShapeData = "ORT_DEBUG_NODE_IO_DUMP_SHAPE_DATA";
|
||||
// set to non-zero to dump node placement data
|
||||
constexpr const char* kDumpNodePlacement = "ORT_DEBUG_NODE_IO_DUMP_NODE_PLACEMENT";
|
||||
// set to non-zero to dump node input data
|
||||
constexpr const char* kDumpInputData = "ORT_DEBUG_NODE_IO_DUMP_INPUT_DATA";
|
||||
// set to non-zero to dump node output data
|
||||
|
|
@ -32,12 +42,14 @@ constexpr const char* kNameFilter = "ORT_DEBUG_NODE_IO_NAME_FILTER";
|
|||
// specify a node op type filter to limit the nodes that are dumped
|
||||
// see NodeDumpOptions::FilterOptions
|
||||
constexpr const char* kOpTypeFilter = "ORT_DEBUG_NODE_IO_OP_TYPE_FILTER";
|
||||
// set to non-zero to dump data to files instead of stdout
|
||||
constexpr const char* kDumpDataToFiles = "ORT_DEBUG_NODE_IO_DUMP_DATA_TO_FILES";
|
||||
// set to "stdout" or "files" or "sqlite" to select dump destination
|
||||
constexpr const char* kDumpDataDestination = "ORT_DEBUG_NODE_IO_DUMP_DATA_DESTINATION";
|
||||
// set to non-zero to append OpenMPI world rank to filename
|
||||
constexpr const char* kAppendRankToFileName = "ORT_DEBUG_NODE_IO_APPEND_RANK_TO_FILE_NAME";
|
||||
// specify the output directory for any data files produced
|
||||
constexpr const char* kOutputDir = "ORT_DEBUG_NODE_IO_OUTPUT_DIR";
|
||||
// specify the file prefix for sqlite3 db (process id will be appended)
|
||||
constexpr const char* kSqliteDbPrefix = "ORT_DEBUG_NODE_IO_SQLITE_DB_PREFIX";
|
||||
// set to non-zero to confirm that dumping data files for all nodes is acceptable
|
||||
constexpr const char* kDumpingDataToFilesForAllNodesIsOk =
|
||||
"ORT_DEBUG_NODE_IO_DUMPING_DATA_TO_FILES_FOR_ALL_NODES_IS_OK";
|
||||
|
|
@ -51,7 +63,8 @@ struct NodeDumpOptions {
|
|||
Shape = 1 << 0,
|
||||
InputData = 1 << 1,
|
||||
OutputData = 1 << 2,
|
||||
AllData = Shape | InputData | OutputData,
|
||||
NodePlacement = 1 << 3,
|
||||
AllData = Shape | InputData | OutputData | NodePlacement,
|
||||
};
|
||||
|
||||
// specifies the information to dump per node
|
||||
|
|
@ -80,11 +93,22 @@ struct NodeDumpOptions {
|
|||
StdOut,
|
||||
// write to one file per tensor input/output as a TensorProto
|
||||
TensorProtoFiles,
|
||||
// write to one row per tensor input/output in Sqlite table
|
||||
SqliteDb
|
||||
} data_destination{DataDestination::StdOut};
|
||||
|
||||
std::string file_suffix;
|
||||
// the output directory for dumped data files
|
||||
Path output_dir;
|
||||
// the sqlite3 db to append dumped data
|
||||
Path sqlite_db_prefix;
|
||||
};
|
||||
|
||||
struct NodeDumpContext {
|
||||
// which execution pass are we on?
|
||||
size_t iteration;
|
||||
// which node are we on?
|
||||
size_t program_counter;
|
||||
};
|
||||
|
||||
// gets NodeDumpOptions instance configured from environment variable values
|
||||
|
|
@ -93,18 +117,30 @@ const NodeDumpOptions& NodeDumpOptionsFromEnvironmentVariables();
|
|||
// dumps inputs for a node
|
||||
void DumpNodeInputs(
|
||||
const NodeDumpOptions& dump_options,
|
||||
const OpKernelContext& context, const Node& node, const SessionState& session_state);
|
||||
const NodeDumpContext& dump_context,
|
||||
const OpKernelContext& context,
|
||||
const Node& node,
|
||||
const SessionState& session_state);
|
||||
|
||||
void DumpNodeInputs(
|
||||
const OpKernelContext& context, const Node& node, const SessionState& session_state);
|
||||
const NodeDumpContext& dump_context,
|
||||
const OpKernelContext& context,
|
||||
const Node& node,
|
||||
const SessionState& session_state);
|
||||
|
||||
// dumps outputs for a node
|
||||
void DumpNodeOutputs(
|
||||
const NodeDumpOptions& dump_options,
|
||||
OpKernelContext& context, const Node& node, const SessionState& session_state);
|
||||
const NodeDumpContext& dump_context,
|
||||
OpKernelContext& context,
|
||||
const Node& node,
|
||||
const SessionState& session_state);
|
||||
|
||||
void DumpNodeOutputs(
|
||||
OpKernelContext& context, const Node& node, const SessionState& session_state);
|
||||
const NodeDumpContext& dump_context,
|
||||
OpKernelContext& context,
|
||||
const Node& node,
|
||||
const SessionState& session_state);
|
||||
|
||||
} // namespace utils
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -184,6 +184,11 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
|
|||
profile::Color::Black);
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
|
||||
utils::NodeDumpContext dump_context { session_state.GetGraphExecutionCounter(), 0 };
|
||||
#endif
|
||||
|
||||
|
||||
for (size_t program_counter = state_.GetProgramCounterStart();
|
||||
program_counter < state_.GetProgramCounterEnd();
|
||||
program_counter += 1) {
|
||||
|
|
@ -287,7 +292,8 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
|
|||
}
|
||||
}
|
||||
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
|
||||
utils::DumpNodeInputs(op_kernel_context, p_op_kernel->Node(), session_state);
|
||||
dump_context.program_counter = program_counter;
|
||||
utils::DumpNodeInputs(dump_context, op_kernel_context, p_op_kernel->Node(), session_state);
|
||||
#endif
|
||||
|
||||
const std::string node_name_for_profiling = [&]() -> std::string {
|
||||
|
|
@ -440,7 +446,7 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
|
|||
}
|
||||
|
||||
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
|
||||
utils::DumpNodeOutputs(op_kernel_context, p_op_kernel->Node(), session_state);
|
||||
utils::DumpNodeOutputs(dump_context, op_kernel_context, p_op_kernel->Node(), session_state);
|
||||
#endif
|
||||
|
||||
// free ml-values corresponding to this node
|
||||
|
|
|
|||
|
|
@ -186,6 +186,12 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
|
|||
profile::Color::Black);
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
|
||||
size_t program_counter = 0;
|
||||
utils::NodeDumpContext dump_context { session_state.GetGraphExecutionCounter(), program_counter };
|
||||
#endif
|
||||
|
||||
|
||||
for (const auto& node_exec_plan : exec_plan_vec) {
|
||||
if (terminate_flag_) {
|
||||
LOGS(logger, WARNING) << "Exiting due to terminate flag being set to true.";
|
||||
|
|
@ -271,7 +277,8 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
|
|||
}
|
||||
}
|
||||
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
|
||||
utils::DumpNodeInputs(op_kernel_context, p_op_kernel->Node(), session_state);
|
||||
dump_context.program_counter = program_counter++;
|
||||
utils::DumpNodeInputs(dump_context, op_kernel_context, p_op_kernel->Node(), session_state);
|
||||
#endif
|
||||
|
||||
const std::string node_name_for_profiling = [&]() -> std::string {
|
||||
|
|
@ -418,7 +425,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
|
|||
}
|
||||
|
||||
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
|
||||
utils::DumpNodeOutputs(op_kernel_context, p_op_kernel->Node(), session_state);
|
||||
utils::DumpNodeOutputs(dump_context, op_kernel_context, p_op_kernel->Node(), session_state);
|
||||
#endif
|
||||
|
||||
// free ml-values corresponding to this node
|
||||
|
|
|
|||
|
|
@ -101,6 +101,7 @@ class SessionState {
|
|||
use_deterministic_compute_(use_deterministic_compute),
|
||||
enable_mem_reuse_(enable_mem_reuse),
|
||||
prepacked_weights_container_(prepacked_weights_container) {
|
||||
|
||||
SetupAllocators();
|
||||
}
|
||||
|
||||
|
|
@ -317,6 +318,16 @@ class SessionState {
|
|||
return used_shared_pre_packed_weights_counter_;
|
||||
}
|
||||
|
||||
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
|
||||
void IncrementGraphExecutionCounter() {
|
||||
++graph_executions_counter_;
|
||||
}
|
||||
|
||||
size_t GetGraphExecutionCounter() const {
|
||||
return graph_executions_counter_;
|
||||
}
|
||||
#endif
|
||||
|
||||
private:
|
||||
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SessionState);
|
||||
|
||||
|
|
@ -502,6 +513,11 @@ class SessionState {
|
|||
// Counter for number of times a shared version of the pre-packed weight corresponding to
|
||||
// a constant initialized weight was used by the session state
|
||||
size_t used_shared_pre_packed_weights_counter_ = 0;
|
||||
|
||||
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
|
||||
// Counter for number of times the session graph has been executed
|
||||
size_t graph_executions_counter_ = 0;
|
||||
#endif
|
||||
};
|
||||
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -1691,6 +1691,11 @@ Status InferenceSession::PartialRun(onnxruntime::RunOptions& run_options,
|
|||
ORT_ENFORCE(session_options_.execution_mode == ExecutionMode::ORT_SEQUENTIAL, "Only sequential mode is supported.");
|
||||
|
||||
// execute the graph
|
||||
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
|
||||
if (state.GetProgramCounterStart() == 0) {
|
||||
session_state_->IncrementGraphExecutionCounter();
|
||||
}
|
||||
#endif
|
||||
ORT_CHECK_AND_SET_RETVAL(utils::ExecutePartialGraph(*session_state_, feeds_fetches_manager, feeds, fetches,
|
||||
run_logger, state, cache));
|
||||
}
|
||||
|
|
@ -1801,6 +1806,9 @@ Status InferenceSession::Run(const RunOptions& run_options,
|
|||
#endif
|
||||
|
||||
// execute the graph
|
||||
#ifdef DEBUG_NODE_INPUTS_OUTPUTS
|
||||
session_state_->IncrementGraphExecutionCounter();
|
||||
#endif
|
||||
ORT_CHECK_AND_SET_RETVAL(utils::ExecuteGraph(*session_state_, feeds_fetches_manager, feeds, *p_fetches,
|
||||
session_options_.execution_mode, run_options.terminate, run_logger,
|
||||
run_options.only_execute_path_to_fetches));
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ TEST(DebugNodeInputsOutputs, BasicFileOutput) {
|
|||
{env_vars::kDumpOutputData, "1"},
|
||||
{env_vars::kNameFilter, nullopt},
|
||||
{env_vars::kOpTypeFilter, nullopt},
|
||||
{env_vars::kDumpDataToFiles, "1"},
|
||||
{env_vars::kDumpDataDestination, "files"},
|
||||
{env_vars::kAppendRankToFileName, nullopt},
|
||||
{env_vars::kOutputDir, ToMBString(temp_dir.Path())},
|
||||
{env_vars::kDumpingDataToFilesForAllNodesIsOk, "1"},
|
||||
|
|
|
|||
19
orttraining/tools/scripts/sqldb_to_tensors.py
Normal file
19
orttraining/tools/scripts/sqldb_to_tensors.py
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import sqlite3
|
||||
import onnx
|
||||
from onnx import numpy_helper
|
||||
|
||||
connection = sqlite3.connect('<path-to-sqldb-from-tracing>', detect_types=sqlite3.PARSE_DECLTYPES)
|
||||
|
||||
def convert_tensor_proto_to_numpy_array(blob):
|
||||
tensor_proto = onnx.TensorProto()
|
||||
tensor_proto.ParseFromString(blob)
|
||||
return numpy_helper.to_array(tensor_proto)
|
||||
|
||||
sqlite3.register_converter("TensorProto", convert_tensor_proto_to_numpy_array)
|
||||
|
||||
for step, name, value, device, producer, consumers in connection.execute(
|
||||
'Select Step, Name, Value, DeviceType, TracedProducer, TracedConsumers from Tensors'):
|
||||
print(step, name, value.shape, consumers)
|
||||
Loading…
Reference in a new issue