General performance testing tooling improvements (#1577)

* Miscellaneous updates to help with perf testing
This commit is contained in:
Scott McKay 2019-09-11 19:46:59 +10:00 committed by GitHub
parent 6586afc8eb
commit 3b7f047a49
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 116 additions and 50 deletions

View file

@ -762,12 +762,16 @@ class Graph {
/** Returns the mutable parent graph if this is a subgraph */
Graph* MutableParentGraph() { return parent_graph_; }
/** Returns the Node containing the GraphProto for this Graph instance if IsSubgraph is true */
const Node* ParentNode() const { return parent_node_; }
/** Construct a Graph instance for a subgraph that is created from a GraphProto attribute in a Node.
Inherits some properties from the parent graph.
@param parent_graph The Graph containing the Node which has a GraphProto attribute.
@param parent_graph The Graph containing the Node that has the GraphProto attribute.
@param parent_node The Node that has the GraphProto attribute.
@param subgraph_proto The GraphProto from the Node attribute.
*/
Graph(Graph& parent_graph, ONNX_NAMESPACE::GraphProto& subgraph_proto);
Graph(Graph& parent_graph, const Node& parent_node, ONNX_NAMESPACE::GraphProto& subgraph_proto);
virtual ~Graph();
@ -794,6 +798,7 @@ class Graph {
Version ir_version,
IOnnxRuntimeOpSchemaCollectionPtr schema_registry,
Graph* parent_graph,
const Node* parent_node,
const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_functions = {});
// Add node with specified <node_proto>.
@ -982,6 +987,8 @@ class Graph {
// the parent graph if this is a subgraph.
Graph* parent_graph_;
// the node containing the graph if parent_graph_ is not nullptr
const Node* parent_node_;
// NodeArgs that come from outer scope. Used when building a graph so that
// these don't get recorded as graph inputs in the GraphProto.

View file

@ -114,6 +114,9 @@ class GraphViewer {
*/
bool IsConstantInitializer(const std::string& name, bool check_outer_scope) const;
/** Get the Node containing this Graph if IsSubgraph is true. Returns nullptr otherwise. */
const Node* ParentNode() const noexcept { return graph_->ParentNode(); }
private:
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(GraphViewer);

View file

@ -15,6 +15,16 @@
#include "core/framework/op_kernel_context_internal.h"
#include "core/framework/utils.h"
// Define this symbol to create Concurrency Visualizer markers.
// See https://docs.microsoft.com/en-us/visualstudio/profiling/concurrency-visualizer-sdk
// You will need to install Concurrency Visualizer and add the SDK to the project that compiles this file
// via Analyze->Concurrency Visualizer->Add SDK to Project...
// #define CONCURRENCY_VISUALIZER
#ifdef CONCURRENCY_VISUALIZER
#include <cvmarkersobj.h>
using namespace Concurrency;
#endif
namespace onnxruntime {
static Status ReleaseNodeMLValues(ExecutionFrame& frame,
@ -45,6 +55,18 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
// uncomment the line below to dump execution plan
//std::cout << std::make_pair(p_seq_exec_plan, &session_state) << "\n";
const auto* graph_viewer = session_state.GetGraphViewer();
#ifdef CONCURRENCY_VISUALIZER
// need unique name for the series. number of nodes should be good enough for a subgraph
char series_name[MaxSeriesNameLengthInChars] = "MainGraph";
if (graph_viewer->IsSubgraph()) {
auto s = graph_viewer->ParentNode()->Name().substr(0, MaxSeriesNameLengthInChars - 1);
std::copy(s.cbegin(), s.cend(), series_name);
}
diagnostic::marker_series series(series_name);
#endif
for (const auto& node_exec_plan : exec_plan_vec) {
if (terminate_flag_) {
@ -53,12 +75,18 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
}
auto node_index = node_exec_plan.node_index;
const auto& node = *graph_viewer->GetNode(node_exec_plan.node_index);
#ifdef CONCURRENCY_VISUALIZER
series.write_flag(node.Name().c_str());
#endif
auto p_op_kernel = session_state.GetKernel(node_index);
// if a kernel has been added in the session state, it better be NON-null.
if (p_op_kernel == nullptr)
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Got nullptr from GetKernel for node: ",
session_state.GetGraphViewer()->GetNode(node_index)->Name());
node.Name());
// construct OpKernelContext
// TODO: log kernel inputs?
@ -117,17 +145,24 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
kernel_begin_time = session_state.Profiler().StartTime();
}
const auto& compute_status = p_op_kernel->Compute(&op_kernel_context);
if (!compute_status.IsOK()) {
std::ostringstream ss;
ss << "Non-zero status code returned while running Node: " <<
p_op_kernel->Node().Name() <<
" Status Message: " <<
compute_status.ErrorMessage();
const auto msg_string = ss.str();
LOGS(logger, ERROR) << msg_string;
return Status(compute_status.Category(), compute_status.Code(), msg_string);
#ifdef CONCURRENCY_VISUALIZER
{
diagnostic::span span(series, "%s.%d", node.OpType().c_str(), node.Index());
#endif
Status compute_status = p_op_kernel->Compute(&op_kernel_context);
if (!compute_status.IsOK()) {
std::ostringstream ss;
ss << "Non-zero status code returned while running Node: " << node.Name()
<< " Status Message: " << compute_status.ErrorMessage();
const auto msg_string = ss.str();
LOGS(logger, ERROR) << msg_string;
return Status(compute_status.Category(), compute_status.Code(), msg_string);
}
#ifdef CONCURRENCY_VISUALIZER
}
#endif
if (is_profiler_enabled) {
session_state.Profiler().EndTimeAndRecordEvent(profiling::NODE_EVENT,

View file

@ -415,7 +415,7 @@ void Node::CreateSubgraph(const std::string& attr_name) {
if (attr != attributes_.cend() && utils::HasGraph(attr->second)) {
GraphProto& mutable_graph = *attr->second.mutable_g();
std::unique_ptr<Graph> subgraph{new Graph(*graph_, mutable_graph)};
std::unique_ptr<Graph> subgraph{new Graph(*graph_, *this, mutable_graph)};
attr_to_subgraph_map_.insert({std::string{attr_name}, gsl::not_null<Graph*>{subgraph.get()}});
subgraphs_.push_back(std::move(subgraph));
}
@ -624,19 +624,20 @@ Graph::Graph(GraphProto* graph_proto,
const std::unordered_map<std::string, int>& domain_to_version,
Version ir_version,
IOnnxRuntimeOpSchemaCollectionPtr schema_registry,
const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_functions) : Graph(graph_proto, domain_to_version, ir_version, schema_registry, nullptr, model_functions) {}
const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_functions)
: Graph(graph_proto, domain_to_version, ir_version, schema_registry, nullptr, nullptr, model_functions) {}
Graph::Graph(GraphProto* graph_proto, const std::unordered_map<std::string, int>& domain_to_version, Version ir_version,
IOnnxRuntimeOpSchemaCollectionPtr schema_registry, Graph* parent_graph,
IOnnxRuntimeOpSchemaCollectionPtr schema_registry, Graph* parent_graph, const Node* parent_node,
const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_functions)
: graph_proto_{graph_proto},
schema_registry_(schema_registry),
graph_resolve_needed_(true),
domain_to_version_(domain_to_version),
model_functions_(model_functions),
ir_version_(ir_version),
parent_graph_{parent_graph} {
parent_graph_{parent_graph},
parent_node_{parent_node} {
ORT_ENFORCE(graph_proto != nullptr, "graph_proto cannot be null");
ArgNameToTypeMap name_to_type_map;
@ -709,10 +710,11 @@ Graph::Graph(GraphProto* graph_proto, const std::unordered_map<std::string, int>
}
}
Graph::Graph(Graph& parent_graph, ONNX_NAMESPACE::GraphProto& subgraph_proto)
Graph::Graph(Graph& parent_graph, const Node& parent_node, ONNX_NAMESPACE::GraphProto& subgraph_proto)
: Graph(&subgraph_proto,
parent_graph.DomainToVersionMap(), parent_graph.IrVersion(), parent_graph.schema_registry_,
&parent_graph) {
&parent_graph,
&parent_node) {
}
Status Graph::VerifyNoDuplicateName() {

View file

@ -51,6 +51,7 @@ Status PerformanceRunner::Run() {
// TODO: start profiling
// if (!performance_test_config_.run_config.profile_file.empty())
performance_result_.start_ = std::chrono::high_resolution_clock::now();
std::unique_ptr<utils::ICPUUsage> p_ICPUUsage = utils::CreateICPUUsage();
switch (performance_test_config_.run_config.test_mode) {
@ -63,16 +64,20 @@ Status PerformanceRunner::Run() {
default:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "unknown test mode.");
}
performance_result_.end_ = std::chrono::high_resolution_clock::now();
performance_result_.average_CPU_usage = p_ICPUUsage->GetUsage();
performance_result_.peak_workingset_size = utils::GetPeakWorkingSetSize();
// TODO: end profiling
// if (!performance_test_config_.run_config.profile_file.empty()) session_object->EndProfiling();
std::chrono::duration<double> duration = performance_result_.end_ - performance_result_.start_;
std::cout << "Total time cost:" << performance_result_.total_time_cost << std::endl
std::cout << "Total time cost:" << performance_result_.total_time_cost << std::endl // sum of time taken by each request
<< "Total iterations:" << performance_result_.time_costs.size() << std::endl
<< "Average time cost:" << performance_result_.total_time_cost / performance_result_.time_costs.size() * 1000 << " ms" << std::endl;
<< "Average time cost:" << performance_result_.total_time_cost / performance_result_.time_costs.size() * 1000 << " ms" << std::endl
// Time between start and end of run. Less than Total time cost when running requests in parallel.
<< "Total run time:" << duration.count() << " s" << std::endl;
return Status::OK();
}
@ -129,32 +134,38 @@ Status PerformanceRunner::RunParallelDuration() {
}
Status PerformanceRunner::ForkJoinRepeat() {
// Adding trivially simple parallelization to the repeated times test will simply perform
// m instances of n parallel invocations with a synchronized join after each invocation.
// TODO: When the thread pool implementation is done, redo if it has join semantics.
auto tpool = GetDefaultThreadPool(Env::Default());
std::atomic<int> counter = {0};
const auto& run_config = performance_test_config_.run_config;
// create a threadpool with one thread per concurrent request
auto tpool = std::make_unique<DefaultThreadPoolType>(run_config.concurrent_session_runs);
std::atomic<int> counter{0}, requests{0};
std::mutex m;
std::condition_variable cv;
for (size_t ite = 0; ite < performance_test_config_.run_config.repeated_times; ite++) {
// Fork
counter.load(std::memory_order_seq_cst);
for (size_t i = 0; i != performance_test_config_.run_config.concurrent_session_runs; ++i) {
counter++;
tpool->Schedule([this, &counter, &m, &cv]() {
session_->ThreadSafeRun();
// Simplified version of Eigen::Barrier
std::lock_guard<std::mutex> lg(m);
counter--;
cv.notify_all();
});
}
// Fork
counter.load(std::memory_order_seq_cst);
requests.load(std::memory_order_seq_cst);
//Join
std::unique_lock<std::mutex> lock(m);
cv.wait(lock, [&counter]() { return counter == 0; });
for (size_t i = 0; i != run_config.concurrent_session_runs; ++i) {
counter++;
tpool->Schedule([this, &counter, &requests, &m, &cv, &run_config]() {
while (requests++ < static_cast<int>(run_config.repeated_times)) {
auto status = RunOneIteration<false>();
if (!status.IsOK())
std::cerr << status.ErrorMessage();
}
// Simplified version of Eigen::Barrier
std::lock_guard<std::mutex> lg(m);
counter--;
cv.notify_all();
});
}
//Join
std::unique_lock<std::mutex> lock(m);
cv.wait(lock, [&counter]() { return counter == 0; });
return Status::OK();
}

View file

@ -10,6 +10,7 @@
#include <mutex>
#include <iostream>
#include <random>
#include <chrono>
// onnxruntime dependencies
#include <core/common/common.h>
#include <core/common/status.h>
@ -27,6 +28,8 @@ namespace onnxruntime {
namespace perftest {
struct PerformanceResult {
std::chrono::time_point<std::chrono::high_resolution_clock> start_;
std::chrono::time_point<std::chrono::high_resolution_clock> end_;
size_t peak_workingset_size{0};
short average_CPU_usage{0};
double total_time_cost{0};
@ -58,13 +61,18 @@ struct PerformanceResult {
std::sort(sorted_time.begin(), sorted_time.end());
outfile << std::endl;
outfile << "Min Latency is " << sorted_time[0] << "sec" << std::endl;
outfile << "Max Latency is " << sorted_time[total - 1] << "sec" << std::endl;
outfile << "P50 Latency is " << sorted_time[n50] << "sec" << std::endl;
outfile << "P90 Latency is " << sorted_time[n90] << "sec" << std::endl;
outfile << "P95 Latency is " << sorted_time[n95] << "sec" << std::endl;
outfile << "P99 Latency is " << sorted_time[n99] << "sec" << std::endl;
outfile << "P999 Latency is " << sorted_time[n999] << "sec" << std::endl;
auto output_stats = [&](std::ostream& ostream) {
ostream << "Min Latency is " << sorted_time[0] << "sec" << std::endl;
ostream << "Max Latency is " << sorted_time[total - 1] << "sec" << std::endl;
ostream << "P50 Latency is " << sorted_time[n50] << "sec" << std::endl;
ostream << "P90 Latency is " << sorted_time[n90] << "sec" << std::endl;
ostream << "P95 Latency is " << sorted_time[n95] << "sec" << std::endl;
ostream << "P99 Latency is " << sorted_time[n99] << "sec" << std::endl;
ostream << "P999 Latency is " << sorted_time[n999] << "sec" << std::endl;
};
output_stats(outfile);
output_stats(std::cout);
}
outfile.close();