mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-20 02:07:56 +00:00
334 lines
12 KiB
C++
334 lines
12 KiB
C++
// Copyright (c) Microsoft Corporation. All rights reserved.
|
|
// Licensed under the MIT License.
|
|
|
|
// TODO: Remove when removing Eigen
|
|
#if defined(_MSC_VER)
|
|
#pragma warning(disable : 4267)
|
|
#endif
|
|
|
|
#include "performance_runner.h"
|
|
#include <iostream>
|
|
|
|
#include "TestCase.h"
|
|
#include "TFModelInfo.h"
|
|
#include "utils.h"
|
|
#include "ort_test_session.h"
|
|
#ifdef HAVE_TENSORFLOW
|
|
#include "tf_test_session.h"
|
|
#endif
|
|
using onnxruntime::Status;
|
|
|
|
// TODO: Temporary, while we bring up the threadpool impl...
|
|
#include "core/platform/threadpool.h"
|
|
#if defined(__GNUC__)
|
|
#pragma GCC diagnostic push
|
|
#pragma GCC diagnostic ignored "-Wunused-parameter"
|
|
#pragma GCC diagnostic ignored "-Wunused-result"
|
|
#endif
|
|
#include <unsupported/Eigen/CXX11/ThreadPool>
|
|
#if defined(__GNUC__)
|
|
#pragma GCC diagnostic pop
|
|
#endif
|
|
using DefaultThreadPoolType = Eigen::ThreadPool;
|
|
static std::unique_ptr<DefaultThreadPoolType> default_pool;
|
|
static std::once_flag default_pool_init;
|
|
Eigen::ThreadPoolInterface* GetDefaultThreadPool(const onnxruntime::Env& env) {
|
|
std::call_once(default_pool_init, [&env] {
|
|
int core_num = env.GetNumCpuCores();
|
|
default_pool.reset(new DefaultThreadPoolType(core_num));
|
|
});
|
|
return default_pool.get();
|
|
}
|
|
|
|
namespace onnxruntime {
|
|
namespace perftest {
|
|
|
|
void PerformanceResult::DumpToFile(const std::basic_string<ORTCHAR_T>& path, bool f_include_statistics) const {
|
|
bool have_file = !path.empty();
|
|
std::ofstream outfile;
|
|
|
|
if (have_file) {
|
|
outfile.open(path, std::ofstream::out | std::ofstream::app);
|
|
if (!outfile.good()) {
|
|
// at least provide some info on the run
|
|
std::cerr << "failed to open result file '" << ToMBString(path.c_str()) << "'. will dump stats to output.\n";
|
|
have_file = false;
|
|
f_include_statistics = true;
|
|
}
|
|
}
|
|
|
|
if (have_file) {
|
|
for (size_t runs = 0; runs < time_costs.size(); runs++) {
|
|
outfile << model_name << "," << time_costs[runs] << "," << peak_workingset_size << ","
|
|
<< average_CPU_usage << "," << runs << std::endl;
|
|
}
|
|
} else {
|
|
// match formatting of the initial output from PerformanceRunner::Run
|
|
std::cout << "Avg CPU usage:" << average_CPU_usage
|
|
<< "\nPeak working set size:" << peak_workingset_size
|
|
<< "\nRuns:" << time_costs.size() << std::endl;
|
|
}
|
|
|
|
if (!time_costs.empty() && f_include_statistics) {
|
|
std::vector<double> sorted_time = time_costs;
|
|
|
|
size_t total = sorted_time.size();
|
|
size_t n50 = static_cast<size_t>(total * 0.5);
|
|
size_t n90 = static_cast<size_t>(total * 0.9);
|
|
size_t n95 = static_cast<size_t>(total * 0.95);
|
|
size_t n99 = static_cast<size_t>(total * 0.99);
|
|
size_t n999 = static_cast<size_t>(total * 0.999);
|
|
|
|
std::sort(sorted_time.begin(), sorted_time.end());
|
|
|
|
auto output_stats = [&](std::ostream& ostream) {
|
|
ostream << "Min Latency: " << sorted_time[0] << " s\n";
|
|
ostream << "Max Latency: " << sorted_time[total - 1] << " s\n";
|
|
ostream << "P50 Latency: " << sorted_time[n50] << " s\n";
|
|
ostream << "P90 Latency: " << sorted_time[n90] << " s\n";
|
|
ostream << "P95 Latency: " << sorted_time[n95] << " s\n";
|
|
ostream << "P99 Latency: " << sorted_time[n99] << " s\n";
|
|
ostream << "P999 Latency: " << sorted_time[n999] << " s" << std::endl;
|
|
};
|
|
|
|
if (have_file) {
|
|
outfile << std::endl;
|
|
output_stats(outfile);
|
|
}
|
|
|
|
output_stats(std::cout);
|
|
}
|
|
}
|
|
|
|
Status PerformanceRunner::Run() {
|
|
if (!Initialize()) {
|
|
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "failed to initialize.");
|
|
}
|
|
|
|
// warm up
|
|
RunOneIteration<true>();
|
|
|
|
// TODO: start profiling
|
|
// if (!performance_test_config_.run_config.profile_file.empty())
|
|
performance_result_.start = std::chrono::high_resolution_clock::now();
|
|
|
|
std::unique_ptr<utils::ICPUUsage> p_ICPUUsage = utils::CreateICPUUsage();
|
|
switch (performance_test_config_.run_config.test_mode) {
|
|
case TestMode::kFixDurationMode:
|
|
ORT_RETURN_IF_ERROR(FixDurationTest());
|
|
break;
|
|
case TestMode::KFixRepeatedTimesMode:
|
|
ORT_RETURN_IF_ERROR(RepeatedTimesTest());
|
|
break;
|
|
default:
|
|
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "unknown test mode.");
|
|
}
|
|
performance_result_.end = std::chrono::high_resolution_clock::now();
|
|
|
|
performance_result_.average_CPU_usage = p_ICPUUsage->GetUsage();
|
|
performance_result_.peak_workingset_size = utils::GetPeakWorkingSetSize();
|
|
|
|
std::chrono::duration<double> session_create_duration = session_create_end_ - session_create_start_;
|
|
// TODO: end profiling
|
|
// if (!performance_test_config_.run_config.profile_file.empty()) session_object->EndProfiling();
|
|
std::chrono::duration<double> inference_duration = performance_result_.end - performance_result_.start;
|
|
|
|
std::cout << "Session creation time cost: " << session_create_duration.count() << " s\n"
|
|
<< "Total inference time cost: " << performance_result_.total_time_cost << " s\n" // sum of time taken by each request
|
|
<< "Total inference requests: " << performance_result_.time_costs.size() << "\n"
|
|
<< "Average inference time cost: " << performance_result_.total_time_cost / performance_result_.time_costs.size() * 1000 << " ms\n"
|
|
// Time between start and end of run. Less than Total time cost when running requests in parallel.
|
|
<< "Total inference run time: " << inference_duration.count() << " s\n"
|
|
<< "Avg CPU usage: " << performance_result_.average_CPU_usage << " %\n"
|
|
<< "Peak working set size: " << performance_result_.peak_workingset_size << " bytes"
|
|
<< std::endl;
|
|
|
|
return Status::OK();
|
|
}
|
|
|
|
Status PerformanceRunner::FixDurationTest() {
|
|
if (performance_test_config_.run_config.concurrent_session_runs <= 1) {
|
|
return RunFixDuration();
|
|
}
|
|
|
|
return RunParallelDuration();
|
|
}
|
|
|
|
Status PerformanceRunner::RepeatedTimesTest() {
|
|
if (performance_test_config_.run_config.concurrent_session_runs <= 1) {
|
|
return RunRepeatedTimes();
|
|
}
|
|
|
|
return ForkJoinRepeat();
|
|
}
|
|
|
|
Status PerformanceRunner::RunParallelDuration() {
|
|
// Simple method to continually queue parallel work until the timer has run down.
|
|
// TODO: Make each thread enqueue a new worker.
|
|
auto tpool = GetDefaultThreadPool(Env::Default());
|
|
std::atomic<int> counter = {0};
|
|
OrtMutex m;
|
|
OrtCondVar cv;
|
|
|
|
auto start = std::chrono::high_resolution_clock::now();
|
|
auto end = start;
|
|
std::chrono::duration<double> duration_seconds;
|
|
do {
|
|
// We will queue work as deep as requested, ignoring the size of the threadpool itself
|
|
int count = counter.load(std::memory_order_seq_cst);
|
|
while (count < static_cast<int>(performance_test_config_.run_config.concurrent_session_runs)) {
|
|
count++;
|
|
counter++;
|
|
tpool->Schedule([this, &counter, &m, &cv]() {
|
|
session_->ThreadSafeRun();
|
|
// Simplified version of Eigen::Barrier
|
|
std::lock_guard<OrtMutex> lg(m);
|
|
counter--;
|
|
cv.notify_all();
|
|
});
|
|
}
|
|
end = std::chrono::high_resolution_clock::now();
|
|
duration_seconds = end - start;
|
|
} while (duration_seconds.count() < performance_test_config_.run_config.duration_in_seconds);
|
|
|
|
//Join
|
|
std::unique_lock<OrtMutex> lock(m);
|
|
cv.wait(lock, [&counter]() { return counter == 0; });
|
|
|
|
return Status::OK();
|
|
}
|
|
|
|
Status PerformanceRunner::ForkJoinRepeat() {
|
|
const auto& run_config = performance_test_config_.run_config;
|
|
|
|
// create a threadpool with one thread per concurrent request
|
|
auto tpool = std::make_unique<DefaultThreadPoolType>(run_config.concurrent_session_runs);
|
|
std::atomic<int> counter{0}, requests{0};
|
|
OrtMutex m;
|
|
OrtCondVar cv;
|
|
|
|
// Fork
|
|
for (size_t i = 0; i != run_config.concurrent_session_runs; ++i) {
|
|
counter++;
|
|
tpool->Schedule([this, &counter, &requests, &m, &cv, &run_config]() {
|
|
while (requests++ < static_cast<int>(run_config.repeated_times)) {
|
|
auto status = RunOneIteration<false>();
|
|
if (!status.IsOK())
|
|
std::cerr << status.ErrorMessage();
|
|
}
|
|
|
|
// Simplified version of Eigen::Barrier
|
|
std::lock_guard<OrtMutex> lg(m);
|
|
counter--;
|
|
cv.notify_all();
|
|
});
|
|
}
|
|
|
|
//Join
|
|
std::unique_lock<OrtMutex> lock(m);
|
|
cv.wait(lock, [&counter]() { return counter == 0; });
|
|
|
|
return Status::OK();
|
|
}
|
|
|
|
static std::unique_ptr<TestModelInfo> CreateModelInfo(const PerformanceTestConfig& performance_test_config_) {
|
|
if (CompareCString(performance_test_config_.backend.c_str(), ORT_TSTR("ort")) == 0) {
|
|
const auto& file_path = performance_test_config_.model_info.model_file_path;
|
|
#if !defined(ORT_MINIMAL_BUILD)
|
|
if (HasExtensionOf(file_path, ORT_TSTR("onnx"))) {
|
|
return TestModelInfo::LoadOnnxModel(performance_test_config_.model_info.model_file_path.c_str());
|
|
}
|
|
#endif
|
|
|
|
#if defined(ENABLE_ORT_FORMAT_LOAD)
|
|
if (HasExtensionOf(file_path, ORT_TSTR("ort"))) {
|
|
return TestModelInfo::LoadOrtModel(performance_test_config_.model_info.model_file_path.c_str());
|
|
}
|
|
#endif
|
|
|
|
ORT_NOT_IMPLEMENTED(ToMBString(file_path), " is not supported");
|
|
}
|
|
|
|
if (CompareCString(performance_test_config_.backend.c_str(), ORT_TSTR("tf")) == 0) {
|
|
return TFModelInfo::Create(performance_test_config_.model_info.model_file_path.c_str());
|
|
}
|
|
|
|
ORT_NOT_IMPLEMENTED(ToMBString(performance_test_config_.backend), " is not supported");
|
|
}
|
|
|
|
static std::unique_ptr<TestSession> CreateSession(Ort::Env& env, std::random_device& rd,
|
|
const PerformanceTestConfig& performance_test_config_,
|
|
const TestModelInfo& test_model_info) {
|
|
if (CompareCString(performance_test_config_.backend.c_str(), ORT_TSTR("ort")) == 0) {
|
|
return std::unique_ptr<TestSession>(
|
|
new OnnxRuntimeTestSession(env, rd, performance_test_config_, test_model_info));
|
|
}
|
|
#ifdef HAVE_TENSORFLOW
|
|
if (CompareCString(performance_test_config_.backend.c_str(), ORT_TSTR("tf")) == 0) {
|
|
return new TensorflowTestSession(rd, performance_test_config_, test_model_info);
|
|
}
|
|
#endif
|
|
ORT_NOT_IMPLEMENTED(ToMBString(performance_test_config_.backend), " is not supported");
|
|
}
|
|
|
|
PerformanceRunner::PerformanceRunner(Ort::Env& env, const PerformanceTestConfig& test_config, std::random_device& rd)
|
|
: performance_test_config_(test_config),
|
|
test_model_info_(CreateModelInfo(test_config)) {
|
|
session_create_start_ = std::chrono::high_resolution_clock::now();
|
|
session_ = CreateSession(env, rd, test_config, *test_model_info_);
|
|
session_create_end_ = std::chrono::high_resolution_clock::now();
|
|
}
|
|
|
|
PerformanceRunner::~PerformanceRunner() = default;
|
|
|
|
bool PerformanceRunner::Initialize() {
|
|
std::basic_string<PATH_CHAR_TYPE> test_case_dir;
|
|
auto st = GetDirNameFromFilePath(performance_test_config_.model_info.model_file_path, test_case_dir);
|
|
if (!st.IsOK()) {
|
|
printf("input path is not a valid model\n");
|
|
return false;
|
|
}
|
|
std::basic_string<PATH_CHAR_TYPE> model_name = GetLastComponent(test_case_dir);
|
|
// TODO: remove the input and model name's dependency on directory tree
|
|
if (CompareCString(model_name.c_str(), ORT_TSTR("test_")) == 0) {
|
|
model_name = model_name.substr(5);
|
|
}
|
|
std::string narrow_model_name = ToMBString(model_name);
|
|
performance_result_.model_name = narrow_model_name;
|
|
|
|
// ownership semantics are a little unexpected here as the test case takes ownership of the model info
|
|
TestModelInfo* test_model_info = test_model_info_.get();
|
|
test_case_ = CreateOnnxTestCase(narrow_model_name, std::move(test_model_info_), 0.0, 0.0);
|
|
|
|
if (performance_test_config_.run_config.generate_model_input_binding) {
|
|
return static_cast<OnnxRuntimeTestSession*>(session_.get())->PopulateGeneratedInputTestData();
|
|
}
|
|
|
|
// TODO: Place input tensor on cpu memory if dnnl provider type to avoid CopyTensor logic in CopyInputAcrossDevices
|
|
size_t test_data_count = test_case_->GetDataCount();
|
|
if (test_data_count == 0) {
|
|
std::cout << "there is no test data for model " << test_case_->GetTestCaseName() << std::endl;
|
|
return false;
|
|
}
|
|
for (size_t test_data_id = 0; test_data_id != test_data_count; ++test_data_id) {
|
|
std::unordered_map<std::string, Ort::Value> feeds;
|
|
test_case_->LoadTestData(test_data_id /* id */, b_, feeds, true);
|
|
// Discard the names in feeds
|
|
int input_count = test_model_info->GetInputCount();
|
|
for (int i = 0; i != input_count; ++i) {
|
|
auto iter = feeds.find(test_model_info->GetInputName(i));
|
|
if (iter == feeds.end()) {
|
|
std::cout << "there is no test input data for input " << test_model_info->GetInputName(i) << " and model "
|
|
<< test_case_->GetTestCaseName() << std::endl;
|
|
return false;
|
|
}
|
|
session_->PreLoadTestData(test_data_id, static_cast<size_t>(i), std::move(iter->second));
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
} // namespace perftest
|
|
|
|
} // namespace onnxruntime
|