diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h
index 1901822011..39707cd944 100644
--- a/include/onnxruntime/core/graph/graph.h
+++ b/include/onnxruntime/core/graph/graph.h
@@ -762,12 +762,16 @@ class Graph {
   /** Returns the mutable parent graph if this is a subgraph */
   Graph* MutableParentGraph() { return parent_graph_; }
 
+  /** Returns the Node containing the GraphProto for this Graph instance if IsSubgraph is true */
+  const Node* ParentNode() const { return parent_node_; }
+
   /** Construct a Graph instance for a subgraph that is created from a GraphProto attribute in a Node.
   Inherits some properties from the parent graph.
-  @param parent_graph The Graph containing the Node which has a GraphProto attribute.
+  @param parent_graph The Graph containing the Node that has the GraphProto attribute.
+  @param parent_node The Node that has the GraphProto attribute.
   @param subgraph_proto The GraphProto from the Node attribute.
   */
-  Graph(Graph& parent_graph, ONNX_NAMESPACE::GraphProto& subgraph_proto);
+  Graph(Graph& parent_graph, const Node& parent_node, ONNX_NAMESPACE::GraphProto& subgraph_proto);
 
   virtual ~Graph();
 
@@ -794,6 +798,7 @@ class Graph {
         Version ir_version,
         IOnnxRuntimeOpSchemaCollectionPtr schema_registry,
         Graph* parent_graph,
+        const Node* parent_node,
         const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_functions = {});
 
   // Add node with specified <node_proto>.
@@ -982,6 +987,8 @@ class Graph {
 
   // the parent graph if this is a subgraph.
   Graph* parent_graph_;
+  // the node containing the graph if parent_graph_ is not nullptr
+  const Node* parent_node_;
 
   // NodeArgs that come from outer scope. Used when building a graph so that
   // these don't get recorded as graph inputs in the GraphProto.
diff --git a/include/onnxruntime/core/graph/graph_viewer.h b/include/onnxruntime/core/graph/graph_viewer.h
index 8d6530719d..320d0fb6e1 100644
--- a/include/onnxruntime/core/graph/graph_viewer.h
+++ b/include/onnxruntime/core/graph/graph_viewer.h
@@ -114,6 +114,9 @@ class GraphViewer {
   */
   bool IsConstantInitializer(const std::string& name, bool check_outer_scope) const;
 
+  /** Get the Node containing this Graph if IsSubgraph is true. Returns nullptr otherwise. */
+  const Node* ParentNode() const noexcept { return graph_->ParentNode(); }
+
  private:
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(GraphViewer);
 
diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc
index 2a9f706110..9521fff6f6 100644
--- a/onnxruntime/core/framework/sequential_executor.cc
+++ b/onnxruntime/core/framework/sequential_executor.cc
@@ -15,6 +15,16 @@
 #include "core/framework/op_kernel_context_internal.h"
 #include "core/framework/utils.h"
 
+// Define this symbol to create Concurrency Visualizer markers.
+// See https://docs.microsoft.com/en-us/visualstudio/profiling/concurrency-visualizer-sdk
+// You will need to install Concurrency Visualizer and add the SDK to the project that compiles this file
+// via Analyze->Concurrency Visualizer->Add SDK to Project...
+// #define CONCURRENCY_VISUALIZER
+#ifdef CONCURRENCY_VISUALIZER
+#include <cvmarkersobj.h>
+using namespace Concurrency;
+#endif
+
 namespace onnxruntime {
 
 static Status ReleaseNodeMLValues(ExecutionFrame& frame,
@@ -45,6 +55,18 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
 
   // uncomment the line below to dump execution plan
   //std::cout << std::make_pair(p_seq_exec_plan, &session_state) << "\n";
+  const auto* graph_viewer = session_state.GetGraphViewer();
+
+#ifdef CONCURRENCY_VISUALIZER
+  // need unique name for the series. number of nodes should be good enough for a subgraph
+  char series_name[MaxSeriesNameLengthInChars] = "MainGraph";
+  if (graph_viewer->IsSubgraph()) {
+    auto s = graph_viewer->ParentNode()->Name().substr(0, MaxSeriesNameLengthInChars - 1);
+    std::copy(s.cbegin(), s.cend(), series_name);
+  }
+
+  diagnostic::marker_series series(series_name);
+#endif
 
   for (const auto& node_exec_plan : exec_plan_vec) {
     if (terminate_flag_) {
@@ -53,12 +75,18 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
     }
 
     auto node_index = node_exec_plan.node_index;
+    const auto& node = *graph_viewer->GetNode(node_exec_plan.node_index);
+
+#ifdef CONCURRENCY_VISUALIZER
+    series.write_flag(node.Name().c_str());
+#endif
+
     auto p_op_kernel = session_state.GetKernel(node_index);
 
     // if a kernel has been added in the session state, it better be NON-null.
     if (p_op_kernel == nullptr)
       return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Got nullptr from GetKernel for node: ",
-                             session_state.GetGraphViewer()->GetNode(node_index)->Name());
+                             node.Name());
 
     // construct OpKernelContext
     // TODO: log kernel inputs?
@@ -117,17 +145,24 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
       kernel_begin_time = session_state.Profiler().StartTime();
     }
 
-    const auto& compute_status = p_op_kernel->Compute(&op_kernel_context);
-    if (!compute_status.IsOK()) {
-      std::ostringstream ss;
-      ss << "Non-zero status code returned while running Node: " <<
-            p_op_kernel->Node().Name() <<
-            " Status Message: " <<
-            compute_status.ErrorMessage();
-      const auto msg_string = ss.str();
-      LOGS(logger, ERROR) << msg_string;
-      return Status(compute_status.Category(), compute_status.Code(), msg_string);
+#ifdef CONCURRENCY_VISUALIZER
+    {
+      diagnostic::span span(series, "%s.%d", node.OpType().c_str(), node.Index());
+#endif
+
+      Status compute_status = p_op_kernel->Compute(&op_kernel_context);
+      if (!compute_status.IsOK()) {
+        std::ostringstream ss;
+        ss << "Non-zero status code returned while running Node: " << node.Name()
+           << " Status Message: " << compute_status.ErrorMessage();
+        const auto msg_string = ss.str();
+        LOGS(logger, ERROR) << msg_string;
+        return Status(compute_status.Category(), compute_status.Code(), msg_string);
+      }
+
+#ifdef CONCURRENCY_VISUALIZER
     }
+#endif
 
     if (is_profiler_enabled) {
       session_state.Profiler().EndTimeAndRecordEvent(profiling::NODE_EVENT,
diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc
index 4afff7892e..eb61b8c3a7 100644
--- a/onnxruntime/core/graph/graph.cc
+++ b/onnxruntime/core/graph/graph.cc
@@ -415,7 +415,7 @@ void Node::CreateSubgraph(const std::string& attr_name) {
 
   if (attr != attributes_.cend() && utils::HasGraph(attr->second)) {
     GraphProto& mutable_graph = *attr->second.mutable_g();
-    std::unique_ptr<Graph> subgraph{new Graph(*graph_, mutable_graph)};
+    std::unique_ptr<Graph> subgraph{new Graph(*graph_, *this, mutable_graph)};
     attr_to_subgraph_map_.insert({std::string{attr_name}, gsl::not_null<Graph*>{subgraph.get()}});
     subgraphs_.push_back(std::move(subgraph));
   }
@@ -624,19 +624,20 @@ Graph::Graph(GraphProto* graph_proto,
              const std::unordered_map<std::string, int>& domain_to_version,
              Version ir_version,
              IOnnxRuntimeOpSchemaCollectionPtr schema_registry,
-             const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_functions) : Graph(graph_proto, domain_to_version, ir_version, schema_registry, nullptr, model_functions) {}
+             const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_functions)
+    : Graph(graph_proto, domain_to_version, ir_version, schema_registry, nullptr, nullptr, model_functions) {}
 
 Graph::Graph(GraphProto* graph_proto, const std::unordered_map<std::string, int>& domain_to_version, Version ir_version,
-             IOnnxRuntimeOpSchemaCollectionPtr schema_registry, Graph* parent_graph,
+             IOnnxRuntimeOpSchemaCollectionPtr schema_registry, Graph* parent_graph, const Node* parent_node,
              const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_functions)
     : graph_proto_{graph_proto},
       schema_registry_(schema_registry),
       graph_resolve_needed_(true),
-
       domain_to_version_(domain_to_version),
       model_functions_(model_functions),
       ir_version_(ir_version),
-      parent_graph_{parent_graph} {
+      parent_graph_{parent_graph},
+      parent_node_{parent_node} {
   ORT_ENFORCE(graph_proto != nullptr, "graph_proto cannot be null");
   ArgNameToTypeMap name_to_type_map;
 
@@ -709,10 +710,11 @@ Graph::Graph(GraphProto* graph_proto, const std::unordered_map<std::string, int>
   }
 }
 
-Graph::Graph(Graph& parent_graph, ONNX_NAMESPACE::GraphProto& subgraph_proto)
+Graph::Graph(Graph& parent_graph, const Node& parent_node, ONNX_NAMESPACE::GraphProto& subgraph_proto)
     : Graph(&subgraph_proto,
             parent_graph.DomainToVersionMap(), parent_graph.IrVersion(), parent_graph.schema_registry_,
-            &parent_graph) {
+            &parent_graph,
+            &parent_node) {
 }
 
 Status Graph::VerifyNoDuplicateName() {
diff --git a/onnxruntime/test/perftest/performance_runner.cc b/onnxruntime/test/perftest/performance_runner.cc
index d55f2d4f20..5fe7c0f9aa 100644
--- a/onnxruntime/test/perftest/performance_runner.cc
+++ b/onnxruntime/test/perftest/performance_runner.cc
@@ -51,6 +51,7 @@ Status PerformanceRunner::Run() {
 
   // TODO: start profiling
   // if (!performance_test_config_.run_config.profile_file.empty())
+  performance_result_.start_ = std::chrono::high_resolution_clock::now();
 
   std::unique_ptr<utils::ICPUUsage> p_ICPUUsage = utils::CreateICPUUsage();
   switch (performance_test_config_.run_config.test_mode) {
@@ -63,16 +64,20 @@ Status PerformanceRunner::Run() {
     default:
       return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "unknown test mode.");
   }
+  performance_result_.end_ = std::chrono::high_resolution_clock::now();
 
   performance_result_.average_CPU_usage = p_ICPUUsage->GetUsage();
   performance_result_.peak_workingset_size = utils::GetPeakWorkingSetSize();
 
   // TODO: end profiling
   // if (!performance_test_config_.run_config.profile_file.empty()) session_object->EndProfiling();
+  std::chrono::duration<double> duration = performance_result_.end_ - performance_result_.start_;
 
-  std::cout << "Total time cost:" << performance_result_.total_time_cost << std::endl
+  std::cout << "Total time cost:" << performance_result_.total_time_cost << std::endl  // sum of time taken by each request
             << "Total iterations:" << performance_result_.time_costs.size() << std::endl
-            << "Average time cost:" << performance_result_.total_time_cost / performance_result_.time_costs.size() * 1000 << " ms" << std::endl;
+            << "Average time cost:" << performance_result_.total_time_cost / performance_result_.time_costs.size() * 1000 << " ms" << std::endl
+            // Time between start and end of run. Less than Total time cost when running requests in parallel.
+            << "Total run time:" << duration.count() << " s" << std::endl;
   return Status::OK();
 }
 
@@ -129,32 +134,38 @@ Status PerformanceRunner::RunParallelDuration() {
 }
 
 Status PerformanceRunner::ForkJoinRepeat() {
-  // Adding trivially simple parallelization to the repeated times test will simply perform
-  // m instances of n parallel invocations with a synchronized join after each invocation.
-  // TODO: When the thread pool implementation is done, redo if it has join semantics.
-  auto tpool = GetDefaultThreadPool(Env::Default());
-  std::atomic<int> counter = {0};
+  const auto& run_config = performance_test_config_.run_config;
+
+  // create a threadpool with one thread per concurrent request
+  auto tpool = std::make_unique<DefaultThreadPoolType>(run_config.concurrent_session_runs);
+  std::atomic<int> counter{0}, requests{0};
   std::mutex m;
   std::condition_variable cv;
 
-  for (size_t ite = 0; ite < performance_test_config_.run_config.repeated_times; ite++) {
-    // Fork
-    counter.load(std::memory_order_seq_cst);
-    for (size_t i = 0; i != performance_test_config_.run_config.concurrent_session_runs; ++i) {
-      counter++;
-      tpool->Schedule([this, &counter, &m, &cv]() {
-        session_->ThreadSafeRun();
-        // Simplified version of Eigen::Barrier
-        std::lock_guard<std::mutex> lg(m);
-        counter--;
-        cv.notify_all();
-      });
-    }
+  // Fork
+  counter.load(std::memory_order_seq_cst);
+  requests.load(std::memory_order_seq_cst);
 
-    //Join
-    std::unique_lock<std::mutex> lock(m);
-    cv.wait(lock, [&counter]() { return counter == 0; });
+  for (size_t i = 0; i != run_config.concurrent_session_runs; ++i) {
+    counter++;
+    tpool->Schedule([this, &counter, &requests, &m, &cv, &run_config]() {
+      while (requests++ < static_cast<int>(run_config.repeated_times)) {
+        auto status = RunOneIteration<false>();
+        if (!status.IsOK())
+          std::cerr << status.ErrorMessage();
+      }
+
+      // Simplified version of Eigen::Barrier
+      std::lock_guard<std::mutex> lg(m);
+      counter--;
+      cv.notify_all();
+    });
   }
+
+  //Join
+  std::unique_lock<std::mutex> lock(m);
+  cv.wait(lock, [&counter]() { return counter == 0; });
+
   return Status::OK();
 }
 
diff --git a/onnxruntime/test/perftest/performance_runner.h b/onnxruntime/test/perftest/performance_runner.h
index 8d9cf1d808..bd3d03e804 100644
--- a/onnxruntime/test/perftest/performance_runner.h
+++ b/onnxruntime/test/perftest/performance_runner.h
@@ -10,6 +10,7 @@
 #include <mutex>
 #include <iostream>
 #include <random>
+#include <chrono>
 // onnxruntime dependencies
 #include <core/common/common.h>
 #include <core/common/status.h>
@@ -27,6 +28,8 @@ namespace onnxruntime {
 namespace perftest {
 
 struct PerformanceResult {
+  std::chrono::time_point<std::chrono::high_resolution_clock> start_;
+  std::chrono::time_point<std::chrono::high_resolution_clock> end_;
   size_t peak_workingset_size{0};
   short average_CPU_usage{0};
   double total_time_cost{0};
@@ -58,13 +61,18 @@ struct PerformanceResult {
       std::sort(sorted_time.begin(), sorted_time.end());
 
       outfile << std::endl;
-      outfile << "Min Latency is " << sorted_time[0] << "sec" << std::endl;
-      outfile << "Max Latency is " << sorted_time[total - 1] << "sec" << std::endl;
-      outfile << "P50 Latency is " << sorted_time[n50] << "sec" << std::endl;
-      outfile << "P90 Latency is " << sorted_time[n90] << "sec" << std::endl;
-      outfile << "P95 Latency is " << sorted_time[n95] << "sec" << std::endl;
-      outfile << "P99 Latency is " << sorted_time[n99] << "sec" << std::endl;
-      outfile << "P999 Latency is " << sorted_time[n999] << "sec" << std::endl;
+      auto output_stats = [&](std::ostream& ostream) {
+        ostream << "Min Latency is " << sorted_time[0] << "sec" << std::endl;
+        ostream << "Max Latency is " << sorted_time[total - 1] << "sec" << std::endl;
+        ostream << "P50 Latency is " << sorted_time[n50] << "sec" << std::endl;
+        ostream << "P90 Latency is " << sorted_time[n90] << "sec" << std::endl;
+        ostream << "P95 Latency is " << sorted_time[n95] << "sec" << std::endl;
+        ostream << "P99 Latency is " << sorted_time[n99] << "sec" << std::endl;
+        ostream << "P999 Latency is " << sorted_time[n999] << "sec" << std::endl;
+      };
+
+      output_stats(outfile);
+      output_stats(std::cout);
     }
 
     outfile.close();