General performance testing tooling improvements (#1577)

* Miscellaneous updates to help with perf testing
2026-07-23 19:32:23 +00:00 · 2019-09-11 19:46:59 +10:00 · 2019-09-11 19:46:59 +10:00 · 3b7f047a49
commit 3b7f047a49
parent 6586afc8eb
6 changed files with 116 additions and 50 deletions
--- a/include/onnxruntime/core/graph/graph.h
+++ b/include/onnxruntime/core/graph/graph.h
@ -762,12 +762,16 @@ class Graph {
  /** Returns the mutable parent graph if this is a subgraph */
  Graph* MutableParentGraph() { return parent_graph_; }

+  /** Returns the Node containing the GraphProto for this Graph instance if IsSubgraph is true */
+  const Node* ParentNode() const { return parent_node_; }
+
  /** Construct a Graph instance for a subgraph that is created from a GraphProto attribute in a Node.
  Inherits some properties from the parent graph.
-  @param parent_graph The Graph containing the Node which has a GraphProto attribute.
+  @param parent_graph The Graph containing the Node that has the GraphProto attribute.
+  @param parent_node The Node that has the GraphProto attribute.
  @param subgraph_proto The GraphProto from the Node attribute.
  */
-  Graph(Graph& parent_graph, ONNX_NAMESPACE::GraphProto& subgraph_proto);
+  Graph(Graph& parent_graph, const Node& parent_node, ONNX_NAMESPACE::GraphProto& subgraph_proto);

  virtual ~Graph();

@ -794,6 +798,7 @@ class Graph {
        Version ir_version,
        IOnnxRuntimeOpSchemaCollectionPtr schema_registry,
        Graph* parent_graph,
+        const Node* parent_node,
        const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_functions = {});

  // Add node with specified <node_proto>.
@ -982,6 +987,8 @@ class Graph {

  // the parent graph if this is a subgraph.
  Graph* parent_graph_;
+  // the node containing the graph if parent_graph_ is not nullptr
+  const Node* parent_node_;

  // NodeArgs that come from outer scope. Used when building a graph so that
  // these don't get recorded as graph inputs in the GraphProto.
--- a/include/onnxruntime/core/graph/graph_viewer.h
+++ b/include/onnxruntime/core/graph/graph_viewer.h
@ -114,6 +114,9 @@ class GraphViewer {
  */
  bool IsConstantInitializer(const std::string& name, bool check_outer_scope) const;

+  /** Get the Node containing this Graph if IsSubgraph is true. Returns nullptr otherwise. */
+  const Node* ParentNode() const noexcept { return graph_->ParentNode(); }
+
 private:
  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(GraphViewer);

--- a/onnxruntime/core/framework/sequential_executor.cc
+++ b/onnxruntime/core/framework/sequential_executor.cc
@ -15,6 +15,16 @@
 #include "core/framework/op_kernel_context_internal.h"
 #include "core/framework/utils.h"

+// Define this symbol to create Concurrency Visualizer markers.
+// See https://docs.microsoft.com/en-us/visualstudio/profiling/concurrency-visualizer-sdk
+// You will need to install Concurrency Visualizer and add the SDK to the project that compiles this file
+// via Analyze->Concurrency Visualizer->Add SDK to Project...
+// #define CONCURRENCY_VISUALIZER
+#ifdef CONCURRENCY_VISUALIZER
+#include <cvmarkersobj.h>
+using namespace Concurrency;
+#endif
+
 namespace onnxruntime {

 static Status ReleaseNodeMLValues(ExecutionFrame& frame,
@ -45,6 +55,18 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:

  // uncomment the line below to dump execution plan
  //std::cout << std::make_pair(p_seq_exec_plan, &session_state) << "\n";
+  const auto* graph_viewer = session_state.GetGraphViewer();
+
+#ifdef CONCURRENCY_VISUALIZER
+  // need unique name for the series. number of nodes should be good enough for a subgraph
+  char series_name[MaxSeriesNameLengthInChars] = "MainGraph";
+  if (graph_viewer->IsSubgraph()) {
+    auto s = graph_viewer->ParentNode()->Name().substr(0, MaxSeriesNameLengthInChars - 1);
+    std::copy(s.cbegin(), s.cend(), series_name);
+  }
+
+  diagnostic::marker_series series(series_name);
+#endif

  for (const auto& node_exec_plan : exec_plan_vec) {
    if (terminate_flag_) {
@ -53,12 +75,18 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
    }

    auto node_index = node_exec_plan.node_index;
+    const auto& node = *graph_viewer->GetNode(node_exec_plan.node_index);
+
+#ifdef CONCURRENCY_VISUALIZER
+    series.write_flag(node.Name().c_str());
+#endif
+
    auto p_op_kernel = session_state.GetKernel(node_index);

    // if a kernel has been added in the session state, it better be NON-null.
    if (p_op_kernel == nullptr)
      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Got nullptr from GetKernel for node: ",
-                             session_state.GetGraphViewer()->GetNode(node_index)->Name());
+                             node.Name());

    // construct OpKernelContext
    // TODO: log kernel inputs?
@ -117,17 +145,24 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
      kernel_begin_time = session_state.Profiler().StartTime();
    }

-    const auto& compute_status = p_op_kernel->Compute(&op_kernel_context);
-    if (!compute_status.IsOK()) {
-      std::ostringstream ss;
-      ss << "Non-zero status code returned while running Node: " <<
-            p_op_kernel->Node().Name() <<
-            " Status Message: " <<
-            compute_status.ErrorMessage();
-      const auto msg_string = ss.str();
-      LOGS(logger, ERROR) << msg_string;
-      return Status(compute_status.Category(), compute_status.Code(), msg_string);
+#ifdef CONCURRENCY_VISUALIZER
+    {
+      diagnostic::span span(series, "%s.%d", node.OpType().c_str(), node.Index());
+#endif
+
+      Status compute_status = p_op_kernel->Compute(&op_kernel_context);
+      if (!compute_status.IsOK()) {
+        std::ostringstream ss;
+        ss << "Non-zero status code returned while running Node: " << node.Name()
+           << " Status Message: " << compute_status.ErrorMessage();
+        const auto msg_string = ss.str();
+        LOGS(logger, ERROR) << msg_string;
+        return Status(compute_status.Category(), compute_status.Code(), msg_string);
+      }
+
+#ifdef CONCURRENCY_VISUALIZER
    }
+#endif

    if (is_profiler_enabled) {
      session_state.Profiler().EndTimeAndRecordEvent(profiling::NODE_EVENT,
--- a/onnxruntime/core/graph/graph.cc
+++ b/onnxruntime/core/graph/graph.cc
@ -415,7 +415,7 @@ void Node::CreateSubgraph(const std::string& attr_name) {

  if (attr != attributes_.cend() && utils::HasGraph(attr->second)) {
    GraphProto& mutable_graph = *attr->second.mutable_g();
-    std::unique_ptr<Graph> subgraph{new Graph(*graph_, mutable_graph)};
+    std::unique_ptr<Graph> subgraph{new Graph(*graph_, *this, mutable_graph)};
    attr_to_subgraph_map_.insert({std::string{attr_name}, gsl::not_null<Graph*>{subgraph.get()}});
    subgraphs_.push_back(std::move(subgraph));
  }
@ -624,19 +624,20 @@ Graph::Graph(GraphProto* graph_proto,
             const std::unordered_map<std::string, int>& domain_to_version,
             Version ir_version,
             IOnnxRuntimeOpSchemaCollectionPtr schema_registry,
-             const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_functions) : Graph(graph_proto, domain_to_version, ir_version, schema_registry, nullptr, model_functions) {}
+             const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_functions)
+    : Graph(graph_proto, domain_to_version, ir_version, schema_registry, nullptr, nullptr, model_functions) {}

 Graph::Graph(GraphProto* graph_proto, const std::unordered_map<std::string, int>& domain_to_version, Version ir_version,
-             IOnnxRuntimeOpSchemaCollectionPtr schema_registry, Graph* parent_graph,
+             IOnnxRuntimeOpSchemaCollectionPtr schema_registry, Graph* parent_graph, const Node* parent_node,
             const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_functions)
    : graph_proto_{graph_proto},
      schema_registry_(schema_registry),
      graph_resolve_needed_(true),
-
      domain_to_version_(domain_to_version),
      model_functions_(model_functions),
      ir_version_(ir_version),
-      parent_graph_{parent_graph} {
+      parent_graph_{parent_graph},
+      parent_node_{parent_node} {
  ORT_ENFORCE(graph_proto != nullptr, "graph_proto cannot be null");
  ArgNameToTypeMap name_to_type_map;

@ -709,10 +710,11 @@ Graph::Graph(GraphProto* graph_proto, const std::unordered_map<std::string, int>
  }
 }

-Graph::Graph(Graph& parent_graph, ONNX_NAMESPACE::GraphProto& subgraph_proto)
+Graph::Graph(Graph& parent_graph, const Node& parent_node, ONNX_NAMESPACE::GraphProto& subgraph_proto)
    : Graph(&subgraph_proto,
            parent_graph.DomainToVersionMap(), parent_graph.IrVersion(), parent_graph.schema_registry_,
-            &parent_graph) {
+            &parent_graph,
+            &parent_node) {
 }

 Status Graph::VerifyNoDuplicateName() {
--- a/onnxruntime/test/perftest/performance_runner.cc
+++ b/onnxruntime/test/perftest/performance_runner.cc
@ -51,6 +51,7 @@ Status PerformanceRunner::Run() {

  // TODO: start profiling
  // if (!performance_test_config_.run_config.profile_file.empty())
+  performance_result_.start_ = std::chrono::high_resolution_clock::now();

  std::unique_ptr<utils::ICPUUsage> p_ICPUUsage = utils::CreateICPUUsage();
  switch (performance_test_config_.run_config.test_mode) {
@ -63,16 +64,20 @@ Status PerformanceRunner::Run() {
    default:
      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "unknown test mode.");
  }
+  performance_result_.end_ = std::chrono::high_resolution_clock::now();

  performance_result_.average_CPU_usage = p_ICPUUsage->GetUsage();
  performance_result_.peak_workingset_size = utils::GetPeakWorkingSetSize();

  // TODO: end profiling
  // if (!performance_test_config_.run_config.profile_file.empty()) session_object->EndProfiling();
+  std::chrono::duration<double> duration = performance_result_.end_ - performance_result_.start_;

-  std::cout << "Total time cost:" << performance_result_.total_time_cost << std::endl
+  std::cout << "Total time cost:" << performance_result_.total_time_cost << std::endl  // sum of time taken by each request
            << "Total iterations:" << performance_result_.time_costs.size() << std::endl
-            << "Average time cost:" << performance_result_.total_time_cost / performance_result_.time_costs.size() * 1000 << " ms" << std::endl;
+            << "Average time cost:" << performance_result_.total_time_cost / performance_result_.time_costs.size() * 1000 << " ms" << std::endl
+            // Time between start and end of run. Less than Total time cost when running requests in parallel.
+            << "Total run time:" << duration.count() << " s" << std::endl;
  return Status::OK();
 }

@ -129,32 +134,38 @@ Status PerformanceRunner::RunParallelDuration() {
 }

 Status PerformanceRunner::ForkJoinRepeat() {
-  // Adding trivially simple parallelization to the repeated times test will simply perform
-  // m instances of n parallel invocations with a synchronized join after each invocation.
-  // TODO: When the thread pool implementation is done, redo if it has join semantics.
-  auto tpool = GetDefaultThreadPool(Env::Default());
-  std::atomic<int> counter = {0};
+  const auto& run_config = performance_test_config_.run_config;
+
+  // create a threadpool with one thread per concurrent request
+  auto tpool = std::make_unique<DefaultThreadPoolType>(run_config.concurrent_session_runs);
+  std::atomic<int> counter{0}, requests{0};
  std::mutex m;
  std::condition_variable cv;

-  for (size_t ite = 0; ite < performance_test_config_.run_config.repeated_times; ite++) {
-    // Fork
-    counter.load(std::memory_order_seq_cst);
-    for (size_t i = 0; i != performance_test_config_.run_config.concurrent_session_runs; ++i) {
-      counter++;
-      tpool->Schedule([this, &counter, &m, &cv]() {
-        session_->ThreadSafeRun();
-        // Simplified version of Eigen::Barrier
-        std::lock_guard<std::mutex> lg(m);
-        counter--;
-        cv.notify_all();
-      });
-    }
+  // Fork
+  counter.load(std::memory_order_seq_cst);
+  requests.load(std::memory_order_seq_cst);

-    //Join
-    std::unique_lock<std::mutex> lock(m);
-    cv.wait(lock, [&counter]() { return counter == 0; });
+  for (size_t i = 0; i != run_config.concurrent_session_runs; ++i) {
+    counter++;
+    tpool->Schedule([this, &counter, &requests, &m, &cv, &run_config]() {
+      while (requests++ < static_cast<int>(run_config.repeated_times)) {
+        auto status = RunOneIteration<false>();
+        if (!status.IsOK())
+          std::cerr << status.ErrorMessage();
+      }
+
+      // Simplified version of Eigen::Barrier
+      std::lock_guard<std::mutex> lg(m);
+      counter--;
+      cv.notify_all();
+    });
  }
+
+  //Join
+  std::unique_lock<std::mutex> lock(m);
+  cv.wait(lock, [&counter]() { return counter == 0; });
+
  return Status::OK();
 }

--- a/onnxruntime/test/perftest/performance_runner.h
+++ b/onnxruntime/test/perftest/performance_runner.h
@ -10,6 +10,7 @@
 #include <mutex>
 #include <iostream>
 #include <random>
+#include <chrono>
 // onnxruntime dependencies
 #include <core/common/common.h>
 #include <core/common/status.h>
@ -27,6 +28,8 @@ namespace onnxruntime {
 namespace perftest {

 struct PerformanceResult {
+  std::chrono::time_point<std::chrono::high_resolution_clock> start_;
+  std::chrono::time_point<std::chrono::high_resolution_clock> end_;
  size_t peak_workingset_size{0};
  short average_CPU_usage{0};
  double total_time_cost{0};
@ -58,13 +61,18 @@ struct PerformanceResult {
      std::sort(sorted_time.begin(), sorted_time.end());

      outfile << std::endl;
-      outfile << "Min Latency is " << sorted_time[0] << "sec" << std::endl;
-      outfile << "Max Latency is " << sorted_time[total - 1] << "sec" << std::endl;
-      outfile << "P50 Latency is " << sorted_time[n50] << "sec" << std::endl;
-      outfile << "P90 Latency is " << sorted_time[n90] << "sec" << std::endl;
-      outfile << "P95 Latency is " << sorted_time[n95] << "sec" << std::endl;
-      outfile << "P99 Latency is " << sorted_time[n99] << "sec" << std::endl;
-      outfile << "P999 Latency is " << sorted_time[n999] << "sec" << std::endl;
+      auto output_stats = [&](std::ostream& ostream) {
+        ostream << "Min Latency is " << sorted_time[0] << "sec" << std::endl;
+        ostream << "Max Latency is " << sorted_time[total - 1] << "sec" << std::endl;
+        ostream << "P50 Latency is " << sorted_time[n50] << "sec" << std::endl;
+        ostream << "P90 Latency is " << sorted_time[n90] << "sec" << std::endl;
+        ostream << "P95 Latency is " << sorted_time[n95] << "sec" << std::endl;
+        ostream << "P99 Latency is " << sorted_time[n99] << "sec" << std::endl;
+        ostream << "P999 Latency is " << sorted_time[n999] << "sec" << std::endl;
+      };
+
+      output_stats(outfile);
+      output_stats(std::cout);
    }

    outfile.close();