diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index 1901822011..39707cd944 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -762,12 +762,16 @@ class Graph { /** Returns the mutable parent graph if this is a subgraph */ Graph* MutableParentGraph() { return parent_graph_; } + /** Returns the Node containing the GraphProto for this Graph instance if IsSubgraph is true */ + const Node* ParentNode() const { return parent_node_; } + /** Construct a Graph instance for a subgraph that is created from a GraphProto attribute in a Node. Inherits some properties from the parent graph. - @param parent_graph The Graph containing the Node which has a GraphProto attribute. + @param parent_graph The Graph containing the Node that has the GraphProto attribute. + @param parent_node The Node that has the GraphProto attribute. @param subgraph_proto The GraphProto from the Node attribute. */ - Graph(Graph& parent_graph, ONNX_NAMESPACE::GraphProto& subgraph_proto); + Graph(Graph& parent_graph, const Node& parent_node, ONNX_NAMESPACE::GraphProto& subgraph_proto); virtual ~Graph(); @@ -794,6 +798,7 @@ class Graph { Version ir_version, IOnnxRuntimeOpSchemaCollectionPtr schema_registry, Graph* parent_graph, + const Node* parent_node, const std::unordered_map& model_functions = {}); // Add node with specified . @@ -982,6 +987,8 @@ class Graph { // the parent graph if this is a subgraph. Graph* parent_graph_; + // the node containing the graph if parent_graph_ is not nullptr + const Node* parent_node_; // NodeArgs that come from outer scope. Used when building a graph so that // these don't get recorded as graph inputs in the GraphProto. diff --git a/include/onnxruntime/core/graph/graph_viewer.h b/include/onnxruntime/core/graph/graph_viewer.h index 8d6530719d..320d0fb6e1 100644 --- a/include/onnxruntime/core/graph/graph_viewer.h +++ b/include/onnxruntime/core/graph/graph_viewer.h @@ -114,6 +114,9 @@ class GraphViewer { */ bool IsConstantInitializer(const std::string& name, bool check_outer_scope) const; + /** Get the Node containing this Graph if IsSubgraph is true. Returns nullptr otherwise. */ + const Node* ParentNode() const noexcept { return graph_->ParentNode(); } + private: ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(GraphViewer); diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc index 2a9f706110..9521fff6f6 100644 --- a/onnxruntime/core/framework/sequential_executor.cc +++ b/onnxruntime/core/framework/sequential_executor.cc @@ -15,6 +15,16 @@ #include "core/framework/op_kernel_context_internal.h" #include "core/framework/utils.h" +// Define this symbol to create Concurrency Visualizer markers. +// See https://docs.microsoft.com/en-us/visualstudio/profiling/concurrency-visualizer-sdk +// You will need to install Concurrency Visualizer and add the SDK to the project that compiles this file +// via Analyze->Concurrency Visualizer->Add SDK to Project... +// #define CONCURRENCY_VISUALIZER +#ifdef CONCURRENCY_VISUALIZER +#include +using namespace Concurrency; +#endif + namespace onnxruntime { static Status ReleaseNodeMLValues(ExecutionFrame& frame, @@ -45,6 +55,18 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std: // uncomment the line below to dump execution plan //std::cout << std::make_pair(p_seq_exec_plan, &session_state) << "\n"; + const auto* graph_viewer = session_state.GetGraphViewer(); + +#ifdef CONCURRENCY_VISUALIZER + // need unique name for the series. number of nodes should be good enough for a subgraph + char series_name[MaxSeriesNameLengthInChars] = "MainGraph"; + if (graph_viewer->IsSubgraph()) { + auto s = graph_viewer->ParentNode()->Name().substr(0, MaxSeriesNameLengthInChars - 1); + std::copy(s.cbegin(), s.cend(), series_name); + } + + diagnostic::marker_series series(series_name); +#endif for (const auto& node_exec_plan : exec_plan_vec) { if (terminate_flag_) { @@ -53,12 +75,18 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std: } auto node_index = node_exec_plan.node_index; + const auto& node = *graph_viewer->GetNode(node_exec_plan.node_index); + +#ifdef CONCURRENCY_VISUALIZER + series.write_flag(node.Name().c_str()); +#endif + auto p_op_kernel = session_state.GetKernel(node_index); // if a kernel has been added in the session state, it better be NON-null. if (p_op_kernel == nullptr) return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Got nullptr from GetKernel for node: ", - session_state.GetGraphViewer()->GetNode(node_index)->Name()); + node.Name()); // construct OpKernelContext // TODO: log kernel inputs? @@ -117,17 +145,24 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std: kernel_begin_time = session_state.Profiler().StartTime(); } - const auto& compute_status = p_op_kernel->Compute(&op_kernel_context); - if (!compute_status.IsOK()) { - std::ostringstream ss; - ss << "Non-zero status code returned while running Node: " << - p_op_kernel->Node().Name() << - " Status Message: " << - compute_status.ErrorMessage(); - const auto msg_string = ss.str(); - LOGS(logger, ERROR) << msg_string; - return Status(compute_status.Category(), compute_status.Code(), msg_string); +#ifdef CONCURRENCY_VISUALIZER + { + diagnostic::span span(series, "%s.%d", node.OpType().c_str(), node.Index()); +#endif + + Status compute_status = p_op_kernel->Compute(&op_kernel_context); + if (!compute_status.IsOK()) { + std::ostringstream ss; + ss << "Non-zero status code returned while running Node: " << node.Name() + << " Status Message: " << compute_status.ErrorMessage(); + const auto msg_string = ss.str(); + LOGS(logger, ERROR) << msg_string; + return Status(compute_status.Category(), compute_status.Code(), msg_string); + } + +#ifdef CONCURRENCY_VISUALIZER } +#endif if (is_profiler_enabled) { session_state.Profiler().EndTimeAndRecordEvent(profiling::NODE_EVENT, diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 4afff7892e..eb61b8c3a7 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -415,7 +415,7 @@ void Node::CreateSubgraph(const std::string& attr_name) { if (attr != attributes_.cend() && utils::HasGraph(attr->second)) { GraphProto& mutable_graph = *attr->second.mutable_g(); - std::unique_ptr subgraph{new Graph(*graph_, mutable_graph)}; + std::unique_ptr subgraph{new Graph(*graph_, *this, mutable_graph)}; attr_to_subgraph_map_.insert({std::string{attr_name}, gsl::not_null{subgraph.get()}}); subgraphs_.push_back(std::move(subgraph)); } @@ -624,19 +624,20 @@ Graph::Graph(GraphProto* graph_proto, const std::unordered_map& domain_to_version, Version ir_version, IOnnxRuntimeOpSchemaCollectionPtr schema_registry, - const std::unordered_map& model_functions) : Graph(graph_proto, domain_to_version, ir_version, schema_registry, nullptr, model_functions) {} + const std::unordered_map& model_functions) + : Graph(graph_proto, domain_to_version, ir_version, schema_registry, nullptr, nullptr, model_functions) {} Graph::Graph(GraphProto* graph_proto, const std::unordered_map& domain_to_version, Version ir_version, - IOnnxRuntimeOpSchemaCollectionPtr schema_registry, Graph* parent_graph, + IOnnxRuntimeOpSchemaCollectionPtr schema_registry, Graph* parent_graph, const Node* parent_node, const std::unordered_map& model_functions) : graph_proto_{graph_proto}, schema_registry_(schema_registry), graph_resolve_needed_(true), - domain_to_version_(domain_to_version), model_functions_(model_functions), ir_version_(ir_version), - parent_graph_{parent_graph} { + parent_graph_{parent_graph}, + parent_node_{parent_node} { ORT_ENFORCE(graph_proto != nullptr, "graph_proto cannot be null"); ArgNameToTypeMap name_to_type_map; @@ -709,10 +710,11 @@ Graph::Graph(GraphProto* graph_proto, const std::unordered_map } } -Graph::Graph(Graph& parent_graph, ONNX_NAMESPACE::GraphProto& subgraph_proto) +Graph::Graph(Graph& parent_graph, const Node& parent_node, ONNX_NAMESPACE::GraphProto& subgraph_proto) : Graph(&subgraph_proto, parent_graph.DomainToVersionMap(), parent_graph.IrVersion(), parent_graph.schema_registry_, - &parent_graph) { + &parent_graph, + &parent_node) { } Status Graph::VerifyNoDuplicateName() { diff --git a/onnxruntime/test/perftest/performance_runner.cc b/onnxruntime/test/perftest/performance_runner.cc index d55f2d4f20..5fe7c0f9aa 100644 --- a/onnxruntime/test/perftest/performance_runner.cc +++ b/onnxruntime/test/perftest/performance_runner.cc @@ -51,6 +51,7 @@ Status PerformanceRunner::Run() { // TODO: start profiling // if (!performance_test_config_.run_config.profile_file.empty()) + performance_result_.start_ = std::chrono::high_resolution_clock::now(); std::unique_ptr p_ICPUUsage = utils::CreateICPUUsage(); switch (performance_test_config_.run_config.test_mode) { @@ -63,16 +64,20 @@ Status PerformanceRunner::Run() { default: return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "unknown test mode."); } + performance_result_.end_ = std::chrono::high_resolution_clock::now(); performance_result_.average_CPU_usage = p_ICPUUsage->GetUsage(); performance_result_.peak_workingset_size = utils::GetPeakWorkingSetSize(); // TODO: end profiling // if (!performance_test_config_.run_config.profile_file.empty()) session_object->EndProfiling(); + std::chrono::duration duration = performance_result_.end_ - performance_result_.start_; - std::cout << "Total time cost:" << performance_result_.total_time_cost << std::endl + std::cout << "Total time cost:" << performance_result_.total_time_cost << std::endl // sum of time taken by each request << "Total iterations:" << performance_result_.time_costs.size() << std::endl - << "Average time cost:" << performance_result_.total_time_cost / performance_result_.time_costs.size() * 1000 << " ms" << std::endl; + << "Average time cost:" << performance_result_.total_time_cost / performance_result_.time_costs.size() * 1000 << " ms" << std::endl + // Time between start and end of run. Less than Total time cost when running requests in parallel. + << "Total run time:" << duration.count() << " s" << std::endl; return Status::OK(); } @@ -129,32 +134,38 @@ Status PerformanceRunner::RunParallelDuration() { } Status PerformanceRunner::ForkJoinRepeat() { - // Adding trivially simple parallelization to the repeated times test will simply perform - // m instances of n parallel invocations with a synchronized join after each invocation. - // TODO: When the thread pool implementation is done, redo if it has join semantics. - auto tpool = GetDefaultThreadPool(Env::Default()); - std::atomic counter = {0}; + const auto& run_config = performance_test_config_.run_config; + + // create a threadpool with one thread per concurrent request + auto tpool = std::make_unique(run_config.concurrent_session_runs); + std::atomic counter{0}, requests{0}; std::mutex m; std::condition_variable cv; - for (size_t ite = 0; ite < performance_test_config_.run_config.repeated_times; ite++) { - // Fork - counter.load(std::memory_order_seq_cst); - for (size_t i = 0; i != performance_test_config_.run_config.concurrent_session_runs; ++i) { - counter++; - tpool->Schedule([this, &counter, &m, &cv]() { - session_->ThreadSafeRun(); - // Simplified version of Eigen::Barrier - std::lock_guard lg(m); - counter--; - cv.notify_all(); - }); - } + // Fork + counter.load(std::memory_order_seq_cst); + requests.load(std::memory_order_seq_cst); - //Join - std::unique_lock lock(m); - cv.wait(lock, [&counter]() { return counter == 0; }); + for (size_t i = 0; i != run_config.concurrent_session_runs; ++i) { + counter++; + tpool->Schedule([this, &counter, &requests, &m, &cv, &run_config]() { + while (requests++ < static_cast(run_config.repeated_times)) { + auto status = RunOneIteration(); + if (!status.IsOK()) + std::cerr << status.ErrorMessage(); + } + + // Simplified version of Eigen::Barrier + std::lock_guard lg(m); + counter--; + cv.notify_all(); + }); } + + //Join + std::unique_lock lock(m); + cv.wait(lock, [&counter]() { return counter == 0; }); + return Status::OK(); } diff --git a/onnxruntime/test/perftest/performance_runner.h b/onnxruntime/test/perftest/performance_runner.h index 8d9cf1d808..bd3d03e804 100644 --- a/onnxruntime/test/perftest/performance_runner.h +++ b/onnxruntime/test/perftest/performance_runner.h @@ -10,6 +10,7 @@ #include #include #include +#include // onnxruntime dependencies #include #include @@ -27,6 +28,8 @@ namespace onnxruntime { namespace perftest { struct PerformanceResult { + std::chrono::time_point start_; + std::chrono::time_point end_; size_t peak_workingset_size{0}; short average_CPU_usage{0}; double total_time_cost{0}; @@ -58,13 +61,18 @@ struct PerformanceResult { std::sort(sorted_time.begin(), sorted_time.end()); outfile << std::endl; - outfile << "Min Latency is " << sorted_time[0] << "sec" << std::endl; - outfile << "Max Latency is " << sorted_time[total - 1] << "sec" << std::endl; - outfile << "P50 Latency is " << sorted_time[n50] << "sec" << std::endl; - outfile << "P90 Latency is " << sorted_time[n90] << "sec" << std::endl; - outfile << "P95 Latency is " << sorted_time[n95] << "sec" << std::endl; - outfile << "P99 Latency is " << sorted_time[n99] << "sec" << std::endl; - outfile << "P999 Latency is " << sorted_time[n999] << "sec" << std::endl; + auto output_stats = [&](std::ostream& ostream) { + ostream << "Min Latency is " << sorted_time[0] << "sec" << std::endl; + ostream << "Max Latency is " << sorted_time[total - 1] << "sec" << std::endl; + ostream << "P50 Latency is " << sorted_time[n50] << "sec" << std::endl; + ostream << "P90 Latency is " << sorted_time[n90] << "sec" << std::endl; + ostream << "P95 Latency is " << sorted_time[n95] << "sec" << std::endl; + ostream << "P99 Latency is " << sorted_time[n99] << "sec" << std::endl; + ostream << "P999 Latency is " << sorted_time[n999] << "sec" << std::endl; + }; + + output_stats(outfile); + output_stats(std::cout); } outfile.close();