mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-03 03:58:54 +00:00
General performance testing tooling improvements (#1577)
* Miscellaneous updates to help with perf testing
This commit is contained in:
parent
6586afc8eb
commit
3b7f047a49
6 changed files with 116 additions and 50 deletions
|
|
@ -762,12 +762,16 @@ class Graph {
|
|||
/** Returns the mutable parent graph if this is a subgraph */
|
||||
Graph* MutableParentGraph() { return parent_graph_; }
|
||||
|
||||
/** Returns the Node containing the GraphProto for this Graph instance if IsSubgraph is true */
|
||||
const Node* ParentNode() const { return parent_node_; }
|
||||
|
||||
/** Construct a Graph instance for a subgraph that is created from a GraphProto attribute in a Node.
|
||||
Inherits some properties from the parent graph.
|
||||
@param parent_graph The Graph containing the Node which has a GraphProto attribute.
|
||||
@param parent_graph The Graph containing the Node that has the GraphProto attribute.
|
||||
@param parent_node The Node that has the GraphProto attribute.
|
||||
@param subgraph_proto The GraphProto from the Node attribute.
|
||||
*/
|
||||
Graph(Graph& parent_graph, ONNX_NAMESPACE::GraphProto& subgraph_proto);
|
||||
Graph(Graph& parent_graph, const Node& parent_node, ONNX_NAMESPACE::GraphProto& subgraph_proto);
|
||||
|
||||
virtual ~Graph();
|
||||
|
||||
|
|
@ -794,6 +798,7 @@ class Graph {
|
|||
Version ir_version,
|
||||
IOnnxRuntimeOpSchemaCollectionPtr schema_registry,
|
||||
Graph* parent_graph,
|
||||
const Node* parent_node,
|
||||
const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_functions = {});
|
||||
|
||||
// Add node with specified <node_proto>.
|
||||
|
|
@ -982,6 +987,8 @@ class Graph {
|
|||
|
||||
// the parent graph if this is a subgraph.
|
||||
Graph* parent_graph_;
|
||||
// the node containing the graph if parent_graph_ is not nullptr
|
||||
const Node* parent_node_;
|
||||
|
||||
// NodeArgs that come from outer scope. Used when building a graph so that
|
||||
// these don't get recorded as graph inputs in the GraphProto.
|
||||
|
|
|
|||
|
|
@ -114,6 +114,9 @@ class GraphViewer {
|
|||
*/
|
||||
bool IsConstantInitializer(const std::string& name, bool check_outer_scope) const;
|
||||
|
||||
/** Get the Node containing this Graph if IsSubgraph is true. Returns nullptr otherwise. */
|
||||
const Node* ParentNode() const noexcept { return graph_->ParentNode(); }
|
||||
|
||||
private:
|
||||
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(GraphViewer);
|
||||
|
||||
|
|
|
|||
|
|
@ -15,6 +15,16 @@
|
|||
#include "core/framework/op_kernel_context_internal.h"
|
||||
#include "core/framework/utils.h"
|
||||
|
||||
// Define this symbol to create Concurrency Visualizer markers.
|
||||
// See https://docs.microsoft.com/en-us/visualstudio/profiling/concurrency-visualizer-sdk
|
||||
// You will need to install Concurrency Visualizer and add the SDK to the project that compiles this file
|
||||
// via Analyze->Concurrency Visualizer->Add SDK to Project...
|
||||
// #define CONCURRENCY_VISUALIZER
|
||||
#ifdef CONCURRENCY_VISUALIZER
|
||||
#include <cvmarkersobj.h>
|
||||
using namespace Concurrency;
|
||||
#endif
|
||||
|
||||
namespace onnxruntime {
|
||||
|
||||
static Status ReleaseNodeMLValues(ExecutionFrame& frame,
|
||||
|
|
@ -45,6 +55,18 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
|
|||
|
||||
// uncomment the line below to dump execution plan
|
||||
//std::cout << std::make_pair(p_seq_exec_plan, &session_state) << "\n";
|
||||
const auto* graph_viewer = session_state.GetGraphViewer();
|
||||
|
||||
#ifdef CONCURRENCY_VISUALIZER
|
||||
// need unique name for the series. number of nodes should be good enough for a subgraph
|
||||
char series_name[MaxSeriesNameLengthInChars] = "MainGraph";
|
||||
if (graph_viewer->IsSubgraph()) {
|
||||
auto s = graph_viewer->ParentNode()->Name().substr(0, MaxSeriesNameLengthInChars - 1);
|
||||
std::copy(s.cbegin(), s.cend(), series_name);
|
||||
}
|
||||
|
||||
diagnostic::marker_series series(series_name);
|
||||
#endif
|
||||
|
||||
for (const auto& node_exec_plan : exec_plan_vec) {
|
||||
if (terminate_flag_) {
|
||||
|
|
@ -53,12 +75,18 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
|
|||
}
|
||||
|
||||
auto node_index = node_exec_plan.node_index;
|
||||
const auto& node = *graph_viewer->GetNode(node_exec_plan.node_index);
|
||||
|
||||
#ifdef CONCURRENCY_VISUALIZER
|
||||
series.write_flag(node.Name().c_str());
|
||||
#endif
|
||||
|
||||
auto p_op_kernel = session_state.GetKernel(node_index);
|
||||
|
||||
// if a kernel has been added in the session state, it better be NON-null.
|
||||
if (p_op_kernel == nullptr)
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Got nullptr from GetKernel for node: ",
|
||||
session_state.GetGraphViewer()->GetNode(node_index)->Name());
|
||||
node.Name());
|
||||
|
||||
// construct OpKernelContext
|
||||
// TODO: log kernel inputs?
|
||||
|
|
@ -117,17 +145,24 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
|
|||
kernel_begin_time = session_state.Profiler().StartTime();
|
||||
}
|
||||
|
||||
const auto& compute_status = p_op_kernel->Compute(&op_kernel_context);
|
||||
if (!compute_status.IsOK()) {
|
||||
std::ostringstream ss;
|
||||
ss << "Non-zero status code returned while running Node: " <<
|
||||
p_op_kernel->Node().Name() <<
|
||||
" Status Message: " <<
|
||||
compute_status.ErrorMessage();
|
||||
const auto msg_string = ss.str();
|
||||
LOGS(logger, ERROR) << msg_string;
|
||||
return Status(compute_status.Category(), compute_status.Code(), msg_string);
|
||||
#ifdef CONCURRENCY_VISUALIZER
|
||||
{
|
||||
diagnostic::span span(series, "%s.%d", node.OpType().c_str(), node.Index());
|
||||
#endif
|
||||
|
||||
Status compute_status = p_op_kernel->Compute(&op_kernel_context);
|
||||
if (!compute_status.IsOK()) {
|
||||
std::ostringstream ss;
|
||||
ss << "Non-zero status code returned while running Node: " << node.Name()
|
||||
<< " Status Message: " << compute_status.ErrorMessage();
|
||||
const auto msg_string = ss.str();
|
||||
LOGS(logger, ERROR) << msg_string;
|
||||
return Status(compute_status.Category(), compute_status.Code(), msg_string);
|
||||
}
|
||||
|
||||
#ifdef CONCURRENCY_VISUALIZER
|
||||
}
|
||||
#endif
|
||||
|
||||
if (is_profiler_enabled) {
|
||||
session_state.Profiler().EndTimeAndRecordEvent(profiling::NODE_EVENT,
|
||||
|
|
|
|||
|
|
@ -415,7 +415,7 @@ void Node::CreateSubgraph(const std::string& attr_name) {
|
|||
|
||||
if (attr != attributes_.cend() && utils::HasGraph(attr->second)) {
|
||||
GraphProto& mutable_graph = *attr->second.mutable_g();
|
||||
std::unique_ptr<Graph> subgraph{new Graph(*graph_, mutable_graph)};
|
||||
std::unique_ptr<Graph> subgraph{new Graph(*graph_, *this, mutable_graph)};
|
||||
attr_to_subgraph_map_.insert({std::string{attr_name}, gsl::not_null<Graph*>{subgraph.get()}});
|
||||
subgraphs_.push_back(std::move(subgraph));
|
||||
}
|
||||
|
|
@ -624,19 +624,20 @@ Graph::Graph(GraphProto* graph_proto,
|
|||
const std::unordered_map<std::string, int>& domain_to_version,
|
||||
Version ir_version,
|
||||
IOnnxRuntimeOpSchemaCollectionPtr schema_registry,
|
||||
const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_functions) : Graph(graph_proto, domain_to_version, ir_version, schema_registry, nullptr, model_functions) {}
|
||||
const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_functions)
|
||||
: Graph(graph_proto, domain_to_version, ir_version, schema_registry, nullptr, nullptr, model_functions) {}
|
||||
|
||||
Graph::Graph(GraphProto* graph_proto, const std::unordered_map<std::string, int>& domain_to_version, Version ir_version,
|
||||
IOnnxRuntimeOpSchemaCollectionPtr schema_registry, Graph* parent_graph,
|
||||
IOnnxRuntimeOpSchemaCollectionPtr schema_registry, Graph* parent_graph, const Node* parent_node,
|
||||
const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_functions)
|
||||
: graph_proto_{graph_proto},
|
||||
schema_registry_(schema_registry),
|
||||
graph_resolve_needed_(true),
|
||||
|
||||
domain_to_version_(domain_to_version),
|
||||
model_functions_(model_functions),
|
||||
ir_version_(ir_version),
|
||||
parent_graph_{parent_graph} {
|
||||
parent_graph_{parent_graph},
|
||||
parent_node_{parent_node} {
|
||||
ORT_ENFORCE(graph_proto != nullptr, "graph_proto cannot be null");
|
||||
ArgNameToTypeMap name_to_type_map;
|
||||
|
||||
|
|
@ -709,10 +710,11 @@ Graph::Graph(GraphProto* graph_proto, const std::unordered_map<std::string, int>
|
|||
}
|
||||
}
|
||||
|
||||
Graph::Graph(Graph& parent_graph, ONNX_NAMESPACE::GraphProto& subgraph_proto)
|
||||
Graph::Graph(Graph& parent_graph, const Node& parent_node, ONNX_NAMESPACE::GraphProto& subgraph_proto)
|
||||
: Graph(&subgraph_proto,
|
||||
parent_graph.DomainToVersionMap(), parent_graph.IrVersion(), parent_graph.schema_registry_,
|
||||
&parent_graph) {
|
||||
&parent_graph,
|
||||
&parent_node) {
|
||||
}
|
||||
|
||||
Status Graph::VerifyNoDuplicateName() {
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ Status PerformanceRunner::Run() {
|
|||
|
||||
// TODO: start profiling
|
||||
// if (!performance_test_config_.run_config.profile_file.empty())
|
||||
performance_result_.start_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
std::unique_ptr<utils::ICPUUsage> p_ICPUUsage = utils::CreateICPUUsage();
|
||||
switch (performance_test_config_.run_config.test_mode) {
|
||||
|
|
@ -63,16 +64,20 @@ Status PerformanceRunner::Run() {
|
|||
default:
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "unknown test mode.");
|
||||
}
|
||||
performance_result_.end_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
performance_result_.average_CPU_usage = p_ICPUUsage->GetUsage();
|
||||
performance_result_.peak_workingset_size = utils::GetPeakWorkingSetSize();
|
||||
|
||||
// TODO: end profiling
|
||||
// if (!performance_test_config_.run_config.profile_file.empty()) session_object->EndProfiling();
|
||||
std::chrono::duration<double> duration = performance_result_.end_ - performance_result_.start_;
|
||||
|
||||
std::cout << "Total time cost:" << performance_result_.total_time_cost << std::endl
|
||||
std::cout << "Total time cost:" << performance_result_.total_time_cost << std::endl // sum of time taken by each request
|
||||
<< "Total iterations:" << performance_result_.time_costs.size() << std::endl
|
||||
<< "Average time cost:" << performance_result_.total_time_cost / performance_result_.time_costs.size() * 1000 << " ms" << std::endl;
|
||||
<< "Average time cost:" << performance_result_.total_time_cost / performance_result_.time_costs.size() * 1000 << " ms" << std::endl
|
||||
// Time between start and end of run. Less than Total time cost when running requests in parallel.
|
||||
<< "Total run time:" << duration.count() << " s" << std::endl;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
|
@ -129,32 +134,38 @@ Status PerformanceRunner::RunParallelDuration() {
|
|||
}
|
||||
|
||||
Status PerformanceRunner::ForkJoinRepeat() {
|
||||
// Adding trivially simple parallelization to the repeated times test will simply perform
|
||||
// m instances of n parallel invocations with a synchronized join after each invocation.
|
||||
// TODO: When the thread pool implementation is done, redo if it has join semantics.
|
||||
auto tpool = GetDefaultThreadPool(Env::Default());
|
||||
std::atomic<int> counter = {0};
|
||||
const auto& run_config = performance_test_config_.run_config;
|
||||
|
||||
// create a threadpool with one thread per concurrent request
|
||||
auto tpool = std::make_unique<DefaultThreadPoolType>(run_config.concurrent_session_runs);
|
||||
std::atomic<int> counter{0}, requests{0};
|
||||
std::mutex m;
|
||||
std::condition_variable cv;
|
||||
|
||||
for (size_t ite = 0; ite < performance_test_config_.run_config.repeated_times; ite++) {
|
||||
// Fork
|
||||
counter.load(std::memory_order_seq_cst);
|
||||
for (size_t i = 0; i != performance_test_config_.run_config.concurrent_session_runs; ++i) {
|
||||
counter++;
|
||||
tpool->Schedule([this, &counter, &m, &cv]() {
|
||||
session_->ThreadSafeRun();
|
||||
// Simplified version of Eigen::Barrier
|
||||
std::lock_guard<std::mutex> lg(m);
|
||||
counter--;
|
||||
cv.notify_all();
|
||||
});
|
||||
}
|
||||
// Fork
|
||||
counter.load(std::memory_order_seq_cst);
|
||||
requests.load(std::memory_order_seq_cst);
|
||||
|
||||
//Join
|
||||
std::unique_lock<std::mutex> lock(m);
|
||||
cv.wait(lock, [&counter]() { return counter == 0; });
|
||||
for (size_t i = 0; i != run_config.concurrent_session_runs; ++i) {
|
||||
counter++;
|
||||
tpool->Schedule([this, &counter, &requests, &m, &cv, &run_config]() {
|
||||
while (requests++ < static_cast<int>(run_config.repeated_times)) {
|
||||
auto status = RunOneIteration<false>();
|
||||
if (!status.IsOK())
|
||||
std::cerr << status.ErrorMessage();
|
||||
}
|
||||
|
||||
// Simplified version of Eigen::Barrier
|
||||
std::lock_guard<std::mutex> lg(m);
|
||||
counter--;
|
||||
cv.notify_all();
|
||||
});
|
||||
}
|
||||
|
||||
//Join
|
||||
std::unique_lock<std::mutex> lock(m);
|
||||
cv.wait(lock, [&counter]() { return counter == 0; });
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@
|
|||
#include <mutex>
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
#include <chrono>
|
||||
// onnxruntime dependencies
|
||||
#include <core/common/common.h>
|
||||
#include <core/common/status.h>
|
||||
|
|
@ -27,6 +28,8 @@ namespace onnxruntime {
|
|||
namespace perftest {
|
||||
|
||||
struct PerformanceResult {
|
||||
std::chrono::time_point<std::chrono::high_resolution_clock> start_;
|
||||
std::chrono::time_point<std::chrono::high_resolution_clock> end_;
|
||||
size_t peak_workingset_size{0};
|
||||
short average_CPU_usage{0};
|
||||
double total_time_cost{0};
|
||||
|
|
@ -58,13 +61,18 @@ struct PerformanceResult {
|
|||
std::sort(sorted_time.begin(), sorted_time.end());
|
||||
|
||||
outfile << std::endl;
|
||||
outfile << "Min Latency is " << sorted_time[0] << "sec" << std::endl;
|
||||
outfile << "Max Latency is " << sorted_time[total - 1] << "sec" << std::endl;
|
||||
outfile << "P50 Latency is " << sorted_time[n50] << "sec" << std::endl;
|
||||
outfile << "P90 Latency is " << sorted_time[n90] << "sec" << std::endl;
|
||||
outfile << "P95 Latency is " << sorted_time[n95] << "sec" << std::endl;
|
||||
outfile << "P99 Latency is " << sorted_time[n99] << "sec" << std::endl;
|
||||
outfile << "P999 Latency is " << sorted_time[n999] << "sec" << std::endl;
|
||||
auto output_stats = [&](std::ostream& ostream) {
|
||||
ostream << "Min Latency is " << sorted_time[0] << "sec" << std::endl;
|
||||
ostream << "Max Latency is " << sorted_time[total - 1] << "sec" << std::endl;
|
||||
ostream << "P50 Latency is " << sorted_time[n50] << "sec" << std::endl;
|
||||
ostream << "P90 Latency is " << sorted_time[n90] << "sec" << std::endl;
|
||||
ostream << "P95 Latency is " << sorted_time[n95] << "sec" << std::endl;
|
||||
ostream << "P99 Latency is " << sorted_time[n99] << "sec" << std::endl;
|
||||
ostream << "P999 Latency is " << sorted_time[n999] << "sec" << std::endl;
|
||||
};
|
||||
|
||||
output_stats(outfile);
|
||||
output_stats(std::cout);
|
||||
}
|
||||
|
||||
outfile.close();
|
||||
|
|
|
|||
Loading…
Reference in a new issue