TensorRT EP - timing cache (#14767)

### Description

This will enable a user to use a TensorRT timing cache based on #10297
to accelerate build times on a device with the same compute capability.
This will work across models as it simply stores kernel runtimes for
specific configurations. Those files are usually very small (only a few
MB), which makes them very easy to ship with an application to accelerate
the build time on the user's end.

### Motivation and Context
Especially for workstation use cases, TRT build times can be a roadblock.
With a few models from the ONNX model zoo, I evaluated speedups when a timing
cache is present.
`./build/onnxruntime_perf_test -e tensorrt -I -t 5 -i
"trt_timing_cache_enable|true" <onnx_path>`

|Model | no Cache | with Cache|
| ------------- | ------------- | ------------- |
|efficientnet-lite4-11 | 34.6 s | 7.7 s|
|yolov4 | 108.62 s | 9.4 s|

To capture this, I had to modify onnxruntime_perf_test. The time is
sometimes not captured within "Session creation time cost:", which is why
I introduced "First inference time cost:".

---------

Co-authored-by: Chi Lo <Chi.Lo@microsoft.com>
This commit is contained in:
Maximilian Müller 2023-03-10 18:02:27 +01:00 committed by GitHub
parent acbb7ad453
commit ad4db12699
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
15 changed files with 303 additions and 18 deletions

View file

@ -455,7 +455,7 @@ if (onnxruntime_USE_CUDA)
list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDA_HOME}/x64/lib64)
else()
if(onnxruntime_CUDNN_HOME)
list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib64)
list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib ${onnxruntime_CUDNN_HOME}/lib64)
endif()
list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDA_HOME}/lib64)
endif()

View file

@ -5,7 +5,7 @@
/// <summary>
/// Options for the TensorRT provider that are passed to SessionOptionsAppendExecutionProvider_TensorRT_V2.
/// Please note that this struct is *similar* to OrtTensorRTProviderOptions but only to be used internally.
/// Please note that this struct is *similar* to OrtTensorRTProviderOptions but only to be used internally.
/// Going forward, new trt provider options are to be supported via this struct and usage of the publicly defined
/// OrtTensorRTProviderOptions will be deprecated over time.
/// User can only get the instance of OrtTensorRTProviderOptionsV2 via CreateTensorRTProviderOptions.
@ -31,4 +31,7 @@ struct OrtTensorRTProviderOptionsV2 {
int trt_force_sequential_engine_build; // force building TensorRT engine sequentially. Default 0 = false, nonzero = true
int trt_context_memory_sharing_enable; // enable context memory sharing between subgraphs. Default 0 = false, nonzero = true
int trt_layer_norm_fp32_fallback; // force Pow + Reduce ops in layer norm to FP32. Default 0 = false, nonzero = true
int trt_timing_cache_enable; // enable TensorRT timing cache. Default 0 = false, nonzero = true
int trt_force_timing_cache; // force the TensorRT cache to be used even if device profile does not match. Default 0 = false, nonzero = true
int trt_detailed_build_log; // Enable detailed build step logging on TensorRT EP with timing for each engine build. Default 0 = false, nonzero = true
};

View file

@ -117,6 +117,32 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_map<s
}
return true;
}
// Reads the serialized TensorRT timing cache stored at `inFileName`.
//
// Returns the raw bytes of the file. An empty vector is returned (and a
// warning is logged) when the file cannot be opened, its size cannot be
// determined, or it cannot be read completely. The callers in this file hand
// the returned buffer to createTimingCache(), so an empty result stands in for
// "no usable cache on disk".
inline std::vector<char> loadTimingCacheFile(const std::string& inFileName) {
  std::ifstream iFile(inFileName, std::ios::in | std::ios::binary);
  if (!iFile) {
    LOGS_DEFAULT(WARNING) << "[TensorRT EP] Could not read timing cache from: " << inFileName
                          << ". A new timing cache will be generated and written.";
    return std::vector<char>();
  }

  // Determine the file size by seeking to the end.
  iFile.seekg(0, std::ifstream::end);
  const std::streamoff fsize = iFile.tellg();
  if (fsize < 0) {
    // tellg() reports -1 on failure; converting that to size_t would request
    // an enormous allocation, so bail out with an empty cache instead.
    LOGS_DEFAULT(WARNING) << "[TensorRT EP] Could not determine size of timing cache: " << inFileName
                          << ". A new timing cache will be generated and written.";
    return std::vector<char>();
  }
  iFile.seekg(0, std::ifstream::beg);

  std::vector<char> content(static_cast<size_t>(fsize));
  if (fsize > 0 && !iFile.read(content.data(), static_cast<std::streamsize>(fsize))) {
    // A short read would leave the tail of the buffer zero-filled; do not pass
    // such a buffer on as if it were a valid cache.
    LOGS_DEFAULT(WARNING) << "[TensorRT EP] Could not read complete timing cache from: " << inFileName
                          << ". A new timing cache will be generated and written.";
    return std::vector<char>();
  }
  return content;
}
// Writes the serialized timing cache held by `blob` to `outFileName`,
// replacing any existing file content.
//
// This is best effort: failures are reported through a warning log and never
// propagated to the caller. A null `blob` is rejected before the file is
// opened so that an existing cache file is not truncated for nothing.
inline void saveTimingCacheFile(const std::string& outFileName, const nvinfer1::IHostMemory* blob) {
  if (blob == nullptr) {
    LOGS_DEFAULT(WARNING) << "[TensorRT EP] No timing cache data to write to: " << outFileName;
    return;
  }
  std::ofstream oFile(outFileName, std::ios::out | std::ios::binary);
  if (!oFile) {
    LOGS_DEFAULT(WARNING) << "[TensorRT EP] Could not write timing cache to: " << outFileName;
    return;
  }
  oFile.write(static_cast<const char*>(blob->data()), static_cast<std::streamsize>(blob->size()));
  oFile.close();
  // close() flushes the buffer; a failed stream here means the file on disk is
  // incomplete (e.g. disk full).
  if (!oFile) {
    LOGS_DEFAULT(WARNING) << "[TensorRT EP] Failed while writing timing cache to: " << outFileName;
  }
}
} // namespace
namespace google {
@ -312,7 +338,10 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
}
dump_subgraphs_ = info.dump_subgraphs;
engine_cache_enable_ = info.engine_cache_enable;
if (engine_cache_enable_ || int8_enable_) {
timing_cache_enable_ = info.timing_cache_enable;
force_timing_cache_match_ = info.force_timing_cache;
detailed_build_log_ = info.detailed_build_log;
if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) {
cache_path_ = info.engine_cache_path;
}
engine_decryption_enable_ = info.engine_decryption_enable;
@ -386,7 +415,22 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
engine_cache_enable_ = (std::stoi(engine_cache_enable_env) == 0 ? false : true);
}
if (engine_cache_enable_ || int8_enable_) {
const std::string timing_cache_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kTimingCacheEnable);
if (!timing_cache_enable_env.empty()) {
timing_cache_enable_ = (std::stoi(timing_cache_enable_env) == 0 ? false : true);
}
const std::string detailed_build_log_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDetailedBuildLog);
if (!detailed_build_log_env.empty()) {
detailed_build_log_ = (std::stoi(detailed_build_log_env) == 0 ? false : true);
}
const std::string timing_force_match_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kForceTimingCache);
if (!timing_force_match_env.empty()) {
force_timing_cache_match_ = (std::stoi(timing_force_match_env) == 0 ? false : true);
}
if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) {
const std::string engine_cache_path = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kEngineCachePath);
cache_path_ = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kCachePath);
if (!engine_cache_path.empty() && cache_path_.empty()) {
@ -438,7 +482,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
dla_core_ = 0;
}
if (engine_cache_enable_ || int8_enable_) {
if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) {
if (!cache_path_.empty() && !fs::is_directory(cache_path_)) {
if (!fs::create_directory(cache_path_)) {
throw std::runtime_error("Failed to create directory " + cache_path_);
@ -1373,6 +1417,12 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
if (!has_dynamic_shape) {
const std::string cache_path = GetCachePath(cache_path_, trt_node_name_with_precision);
const std::string engine_cache_path = cache_path + ".engine";
std::string timing_cache_path = "";
if (timing_cache_enable_) {
cudaDeviceProp prop;
CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_));
timing_cache_path = GetTimingCachePath(cache_path_, prop);
}
{
// ifstream file check, engine serialization/deserialization and engine build are in critical section. It needs lock protection to prevent race condition when inferencing with multithreading.
auto lock = GetApiLock();
@ -1419,12 +1469,36 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
}
}
// Load timing cache from file. Create a fresh cache if the file doesn't exist
std::unique_ptr<nvinfer1::ITimingCache> timing_cache = nullptr;
if (timing_cache_enable_) {
std::vector<char> loaded_timing_cache = loadTimingCacheFile(timing_cache_path);
timing_cache.reset(trt_config->createTimingCache(static_cast<const void*>(loaded_timing_cache.data()), loaded_timing_cache.size()));
if (timing_cache == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not create timing cache: " + timing_cache_path);
}
trt_config->setTimingCache(*timing_cache, force_timing_cache_match_);
if (detailed_build_log_) {
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Deserialized timing cache from " + timing_cache_path;
}
}
// Build engine
std::chrono::steady_clock::time_point engine_build_start;
if (detailed_build_log_) {
engine_build_start = std::chrono::steady_clock::now();
}
trt_engine = std::unique_ptr<nvinfer1::ICudaEngine>(trt_builder->buildEngineWithConfig(*trt_network, *trt_config));
if (trt_engine == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not build engine for fused node: " + fused_node.Name());
}
if (detailed_build_log_) {
auto engine_build_stop = std::chrono::steady_clock::now();
LOGS_DEFAULT(INFO) << "TensorRT engine build for " << trt_node_name_with_precision << " took: " <<
std::chrono::duration_cast<std::chrono::milliseconds>(engine_build_stop - engine_build_start).count() << "ms" << std::endl;
}
if (engine_cache_enable_) {
std::unique_ptr<nvinfer1::IHostMemory> serializedModel(trt_engine->serialize());
size_t engine_size = serializedModel->size();
@ -1438,7 +1512,20 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
std::ofstream file(engine_cache_path, std::ios::binary | std::ios::out);
file.write(reinterpret_cast<char*>(serializedModel->data()), engine_size);
}
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path;
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized engine " + engine_cache_path;
}
// serialize and save timing cache
if (timing_cache_enable_) {
auto timing_cache = trt_config->getTimingCache();
std::unique_ptr<nvinfer1::IHostMemory> timingCacheHostData{timing_cache->serialize()};
if (timingCacheHostData == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not serialize timing cache: " + timing_cache_path);
}
saveTimingCacheFile(timing_cache_path, timingCacheHostData.get());
if (detailed_build_log_) {
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized timing cache " + timing_cache_path;
}
}
}
}
@ -1504,7 +1591,8 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_,
dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_,
runtime_.get(), nullptr, allocator_, context_memory_sharing_enable_, &max_ctx_mem_size_, &context_memory_,
dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_};
dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_, timing_cache_enable_,
force_timing_cache_match_, detailed_build_log_};
*state = p.release();
return 0;
};
@ -1545,6 +1633,12 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
const std::string cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision);
const std::string engine_cache_path = cache_path + ".engine";
const std::string profile_cache_path = cache_path + ".profile";
std::string timing_cache_path = "";
if (timing_cache_enable_) {
cudaDeviceProp prop;
CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_));
timing_cache_path = GetTimingCachePath(cache_path_, prop);
}
if (trt_state->engine_cache_enable && trt_engine == nullptr) {
std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in);
std::ifstream profile_file(profile_cache_path, std::ios::binary | std::ios::in);
@ -1779,11 +1873,35 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
trt_config->setDLACore(trt_state->dla_core);
}
// Load timing cache from file. Create a fresh cache if the file doesn't exist
std::unique_ptr<nvinfer1::ITimingCache> timing_cache = nullptr;
if (trt_state->timing_cache_enable) {
std::vector<char> loaded_timing_cache = loadTimingCacheFile(timing_cache_path);
timing_cache.reset(trt_config->createTimingCache(static_cast<const void*>(loaded_timing_cache.data()), loaded_timing_cache.size()));
if (timing_cache == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not create timing cache: " + timing_cache_path);
}
trt_config->setTimingCache(*timing_cache, force_timing_cache_match_);
if (detailed_build_log_) {
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Deserialized timing cache from " + timing_cache_path;
}
}
// Build engine
{
auto lock = GetApiLock();
std::chrono::steady_clock::time_point engine_build_start;
if (detailed_build_log_) {
engine_build_start = std::chrono::steady_clock::now();
}
*(trt_state->engine) = std::unique_ptr<nvinfer1::ICudaEngine>(
trt_builder->buildEngineWithConfig(*trt_state->network->get(), *trt_config));
if (detailed_build_log_) {
auto engine_build_stop = std::chrono::steady_clock::now();
LOGS_DEFAULT(INFO) << "TensorRT engine build for " << trt_state->trt_node_name_with_precision << " took: " <<
std::chrono::duration_cast<std::chrono::milliseconds>(engine_build_stop - engine_build_start).count() << "ms" << std::endl;
}
}
if (trt_state->engine == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine.");
@ -1809,6 +1927,20 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
}
}
// serialize and save timing cache
if (trt_state->timing_cache_enable) {
auto timing_cache = trt_config->getTimingCache();
std::unique_ptr<nvinfer1::IHostMemory> timingCacheHostData{timing_cache->serialize()};
if (timingCacheHostData == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not serialize timing cache: " + timing_cache_path);
}
saveTimingCacheFile(timing_cache_path, timingCacheHostData.get());
if (detailed_build_log_) {
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized timing cache " + timing_cache_path;
}
}
// Build context
if (trt_state->context_memory_sharing_enable) {
*(trt_state->context) = std::unique_ptr<nvinfer1::IExecutionContext>(

View file

@ -30,6 +30,9 @@ static const std::string kDecryptionLibPath = "ORT_TENSORRT_ENGINE_DECRYPTION_LI
static const std::string kForceSequentialEngineBuild = "ORT_TENSORRT_FORCE_SEQUENTIAL_ENGINE_BUILD";
static const std::string kContextMemorySharingEnable = "ORT_TENSORRT_CONTEXT_MEMORY_SHARING_ENABLE";
static const std::string kLayerNormFP32Fallback = "ORT_TENSORRT_LAYER_NORM_FP32_FALLBACK";
static const std::string kTimingCacheEnable = "ORT_TENSORRT_TIMING_CACHE_ENABLE";
static const std::string kForceTimingCache = "ORT_TENSORRT_FORCE_TIMING_CACHE_ENABLE";
static const std::string kDetailedBuildLog = "ORT_TENSORRT_DETAILED_BUILD_LOG_ENABLE";
// Old env variable for backward compatibility
static const std::string kEngineCachePath = "ORT_TENSORRT_ENGINE_CACHE_PATH";
} // namespace tensorrt_env_vars
@ -114,6 +117,9 @@ struct TensorrtFuncState {
bool engine_decryption_enable = false;
int (*engine_decryption)(const char*, char*, size_t*) = nullptr;
int (*engine_encryption)(const char*, char*, size_t) = nullptr;
bool timing_cache_enable = true;
bool force_timing_cache = false;
bool detailed_build_log = false;
};
// Logical device representation.
@ -176,6 +182,9 @@ class TensorrtExecutionProvider : public IExecutionProvider {
bool engine_decryption_enable_ = false;
int (*engine_decryption_)(const char*, char*, size_t*) = nullptr;
int (*engine_encryption_)(const char*, char*, size_t) = nullptr;
bool timing_cache_enable_ = false;
bool force_timing_cache_match_ = false;
bool detailed_build_log_ = false;
std::unordered_set<std::string> control_flow_op_set_ = {"If", "Loop", "Scan"};
std::unordered_map<std::string, tensorrt_ptr::unique_pointer<nvonnxparser::IParser>> parsers_;

View file

@ -27,11 +27,14 @@ constexpr const char* kCachePath = "trt_engine_cache_path";
constexpr const char* kDecryptionEnable = "trt_engine_decryption_enable";
constexpr const char* kDecryptionLibPath = "trt_engine_decryption_lib_path";
constexpr const char* kForceSequentialEngineBuild = "trt_force_sequential_engine_build";
// add new provider option name here.
// add new provider option name here.
constexpr const char* kContextMemorySharingEnable = "trt_context_memory_sharing_enable";
constexpr const char* kLayerNormFP32Fallback = "trt_layer_norm_fp32_fallback";
constexpr const char* kTimingCacheEnable = "trt_timing_cache_enable";
constexpr const char* kForceTimingCacheMatch = "trt_force_timing_cache_match";
constexpr const char* kDetailedBuildLog = "trt_detailed_build_log";
} // namespace provider_option_names
} // namespace tensorrt
} // namespace tensorrt
TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions(const ProviderOptions& options) {
TensorrtExecutionProviderInfo info{};
@ -57,15 +60,17 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions
.AddAssignmentToReference(tensorrt::provider_option_names::kInt8CalibTable, info.int8_calibration_table_name)
.AddAssignmentToReference(tensorrt::provider_option_names::kInt8UseNativeCalibTable, info.int8_use_native_calibration_table)
.AddAssignmentToReference(tensorrt::provider_option_names::kDLAEnable, info.dla_enable)
.AddAssignmentToReference(tensorrt::provider_option_names::kDLACore, info.dla_core)
.AddAssignmentToReference(tensorrt::provider_option_names::kDLACore, info.dla_core)
.AddAssignmentToReference(tensorrt::provider_option_names::kDumpSubgraphs, info.dump_subgraphs)
.AddAssignmentToReference(tensorrt::provider_option_names::kEngineCacheEnable, info.engine_cache_enable)
.AddAssignmentToReference(tensorrt::provider_option_names::kCachePath, info.engine_cache_path)
.AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionEnable, info.engine_decryption_enable)
.AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionLibPath, info.engine_decryption_lib_path)
.AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionLibPath, info.engine_decryption_lib_path)
.AddAssignmentToReference(tensorrt::provider_option_names::kForceSequentialEngineBuild, info.force_sequential_engine_build)
.AddAssignmentToReference(tensorrt::provider_option_names::kContextMemorySharingEnable, info.context_memory_sharing_enable)
.AddAssignmentToReference(tensorrt::provider_option_names::kLayerNormFP32Fallback, info.layer_norm_fp32_fallback)
.AddAssignmentToReference(tensorrt::provider_option_names::kTimingCacheEnable, info.timing_cache_enable)
.AddAssignmentToReference(tensorrt::provider_option_names::kForceTimingCacheMatch, info.force_timing_cache)
.Parse(options)); // add new provider option here.
return info;
@ -93,6 +98,7 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const TensorrtE
// add new provider option here.
{tensorrt::provider_option_names::kContextMemorySharingEnable, MakeStringWithClassicLocale(info.context_memory_sharing_enable)},
{tensorrt::provider_option_names::kLayerNormFP32Fallback, MakeStringWithClassicLocale(info.layer_norm_fp32_fallback)},
{tensorrt::provider_option_names::kTimingCacheEnable, MakeStringWithClassicLocale(info.timing_cache_enable)},
};
return options;
}

View file

@ -17,10 +17,10 @@ struct TensorrtExecutionProviderInfo {
void* user_compute_stream{nullptr};
bool has_trt_options{false};
int max_partition_iterations{1000};
int min_subgraph_size{1};
int min_subgraph_size{1};
size_t max_workspace_size{1 << 30};
bool fp16_enable{false};
bool int8_enable{false};
bool int8_enable{false};
std::string int8_calibration_table_name{""};
bool int8_use_native_calibration_table{false};
bool dla_enable{false};
@ -33,6 +33,9 @@ struct TensorrtExecutionProviderInfo {
bool force_sequential_engine_build{false};
bool context_memory_sharing_enable{false};
bool layer_norm_fp32_fallback{false};
bool timing_cache_enable{false};
bool force_timing_cache{false};
bool detailed_build_log{false};
static TensorrtExecutionProviderInfo FromProviderOptions(const ProviderOptions& options);
static ProviderOptions ToProviderOptions(const TensorrtExecutionProviderInfo& info);

View file

@ -168,6 +168,17 @@ std::string GetCachePath(const std::string& root, const std::string& name) {
}
}
/*
 * Get the timing cache file path for a GPU, keyed by its compute capability.
 *
 * The file name is "TensorrtExecutionProvider_cache_cc<major * 10 + minor>.timing"
 * and is resolved against `root` via GetCachePath().
 */
std::string GetTimingCachePath(const std::string& root, cudaDeviceProp prop) {
  // The compute capability is part of the file name because (per the original
  // author's note) a different compute capability invalidates the cache and TRT
  // will throw when loading it.
  const int compute_capability = prop.major * 10 + prop.minor;
  std::string file_name{"TensorrtExecutionProvider_cache_cc"};
  file_name += std::to_string(compute_capability);
  file_name += ".timing";
  return GetCachePath(root, file_name);
}
/*
* Get cache by type
*

View file

@ -70,6 +70,8 @@ struct Tensorrt_Provider : Provider {
info.force_sequential_engine_build = options.trt_force_sequential_engine_build != 0;
info.context_memory_sharing_enable = options.trt_context_memory_sharing_enable != 0;
info.layer_norm_fp32_fallback = options.trt_layer_norm_fp32_fallback != 0;
info.timing_cache_enable = options.trt_timing_cache_enable != 0;
info.detailed_build_log = options.trt_detailed_build_log != 0;
return std::make_shared<TensorrtProviderFactory>(info);
}
@ -137,6 +139,8 @@ struct Tensorrt_Provider : Provider {
trt_options.trt_force_sequential_engine_build = internal_options.force_sequential_engine_build;
trt_options.trt_context_memory_sharing_enable = internal_options.context_memory_sharing_enable;
trt_options.trt_layer_norm_fp32_fallback = internal_options.layer_norm_fp32_fallback;
trt_options.trt_timing_cache_enable = internal_options.timing_cache_enable;
trt_options.trt_force_timing_cache = internal_options.force_timing_cache;
}
ProviderOptions GetProviderOptions(const void* provider_options) override {

View file

@ -1274,7 +1274,10 @@ OrtTensorRTProviderOptionsV2 OrtTensorRTProviderOptionsToOrtTensorRTProviderOpti
trt_options_converted.trt_engine_decryption_lib_path = legacy_trt_options->trt_engine_decryption_lib_path;
trt_options_converted.trt_force_sequential_engine_build = legacy_trt_options->trt_force_sequential_engine_build;
// Add new provider option below
// Use default value as this field is not available in OrtTensorRTProviderOptionsV
// Use default value as this field is not available in OrtTensorRTProviderOptions
trt_options_converted.trt_timing_cache_enable = 0;
trt_options_converted.trt_force_timing_cache = 0;
trt_options_converted.trt_detailed_build_log = 0;
trt_options_converted.trt_context_memory_sharing_enable = 0;
trt_options_converted.trt_layer_norm_fp32_fallback = 0;
return trt_options_converted;
@ -1601,6 +1604,9 @@ ORT_API_STATUS_IMPL(OrtApis::CreateTensorRTProviderOptions, _Outptr_ OrtTensorRT
(*out)->trt_force_sequential_engine_build = false;
(*out)->trt_context_memory_sharing_enable = false;
(*out)->trt_layer_norm_fp32_fallback = false;
(*out)->trt_timing_cache_enable = false;
(*out)->trt_force_timing_cache = false;
(*out)->trt_detailed_build_log = false;
return nullptr;
#else
ORT_UNUSED_PARAMETER(out);

View file

@ -368,6 +368,9 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
nullptr,
0,
0,
0,
0,
0,
0};
for (auto option : it->second) {
if (option.first == "device_id") {

View file

@ -121,6 +121,9 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
bool trt_force_sequential_engine_build = false;
bool trt_context_memory_sharing_enable = false;
bool trt_layer_norm_fp32_fallback = false;
bool trt_timing_cache_enable = false;
bool trt_force_timing_cache = false;
bool trt_detailed_build_log = false;
#ifdef _MSC_VER
std::string ov_string = ToUTF8String(performance_test_config.run_config.ep_runtime_config_string);
@ -268,6 +271,30 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
} else {
ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_layer_norm_fp32_fallback' should be a boolean i.e. true or false. Default value is false.\n");
}
} else if (key == "trt_timing_cache_enable") {
if (value == "true" || value == "True") {
trt_timing_cache_enable = true;
} else if (value == "false" || value == "False") {
trt_timing_cache_enable = false;
} else {
ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_timing_cache_enable' should be a boolean i.e. true or false. Default value is false.\n");
}
} else if (key == "trt_force_timing_cache") {
if (value == "true" || value == "True") {
trt_force_timing_cache = true;
} else if (value == "false" || value == "False") {
trt_force_timing_cache = false;
} else {
ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_force_timing_cache' should be a boolean i.e. true or false. Default value is false.\n");
}
} else if (key == "trt_detailed_build_log") {
if (value == "true" || value == "True") {
trt_detailed_build_log = true;
} else if (value == "false" || value == "False") {
trt_detailed_build_log = false;
} else {
ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_detailed_build_log' should be a boolean i.e. true or false. Default value is false.\n");
}
} else {
ORT_THROW("[ERROR] [TensorRT] wrong key type entered. Choose from the following runtime key options that are available for TensorRT. ['device_id', 'trt_max_partition_iterations', 'trt_min_subgraph_size', 'trt_max_workspace_size', 'trt_fp16_enable', 'trt_int8_enable', 'trt_int8_calibration_table_name', 'trt_int8_use_native_calibration_table', 'trt_dla_enable', 'trt_dla_core', 'trt_dump_subgraphs', 'trt_engine_cache_enable', 'trt_engine_cache_path', 'trt_engine_decryption_enable', 'trt_engine_decryption_lib_path', 'trt_force_sequential_engine_build', 'trt_context_memory_sharing_enable', 'trt_layer_norm_fp32_fallback'] \n");
}
@ -293,6 +320,9 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
tensorrt_options.trt_force_sequential_engine_build = trt_force_sequential_engine_build;
tensorrt_options.trt_context_memory_sharing_enable = trt_context_memory_sharing_enable;
tensorrt_options.trt_layer_norm_fp32_fallback = trt_layer_norm_fp32_fallback;
tensorrt_options.trt_timing_cache_enable = trt_timing_cache_enable;
tensorrt_options.trt_force_timing_cache = trt_force_timing_cache;
tensorrt_options.trt_detailed_build_log = trt_detailed_build_log;
session_options.AppendExecutionProvider_TensorRT_V2(tensorrt_options);
OrtCUDAProviderOptions cuda_options;

View file

@ -114,7 +114,9 @@ Status PerformanceRunner::Run() {
}
// warm up
initial_inference_result_.start = std::chrono::high_resolution_clock::now();
ORT_RETURN_IF_ERROR(RunOneIteration<true>());
initial_inference_result_.end = std::chrono::high_resolution_clock::now();
// TODO: start profiling
// if (!performance_test_config_.run_config.profile_file.empty())
@ -139,9 +141,12 @@ Status PerformanceRunner::Run() {
std::chrono::duration<double> session_create_duration = session_create_end_ - session_create_start_;
// TODO: end profiling
// if (!performance_test_config_.run_config.profile_file.empty()) session_object->EndProfiling();
auto first_inference_duration =
std::chrono::duration_cast<std::chrono::milliseconds>(initial_inference_result_.end - initial_inference_result_.start).count();
std::chrono::duration<double> inference_duration = performance_result_.end - performance_result_.start;
std::cout << "Session creation time cost: " << session_create_duration.count() << " s\n"
<< "First inference time cost: " << first_inference_duration << " ms\n"
<< "Total inference time cost: " << performance_result_.total_time_cost << " s\n" // sum of time taken by each request
<< "Total inference requests: " << performance_result_.time_costs.size() << "\n"
<< "Average inference time cost: " << performance_result_.total_time_cost / performance_result_.time_costs.size() * 1000 << " ms\n"

View file

@ -106,6 +106,7 @@ class PerformanceRunner {
private:
std::chrono::time_point<std::chrono::high_resolution_clock> session_create_start_;
std::chrono::time_point<std::chrono::high_resolution_clock> session_create_end_;
PerformanceResult initial_inference_result_;
PerformanceResult performance_result_;
PerformanceTestConfig performance_test_config_;
std::unique_ptr<TestModelInfo> test_model_info_;

View file

@ -683,7 +683,7 @@ TEST_P(ModelTest, Run) {
if (test_case_name.find(ORT_TSTR("FLOAT16")) != std::string::npos) {
OrtTensorRTProviderOptionsV2 params{0, 0, nullptr, 1000, 1, 1 << 30,
1, // enable fp16
0, nullptr, 0, 0, 0, 0, 0, nullptr, 0, nullptr, 0, 0, 0};
0, nullptr, 0, 0, 0, 0, 0, nullptr, 0, nullptr, 0, 0, 0, 0, 0, 0};
ortso.AppendExecutionProvider_TensorRT_V2(params);
} else {
OrtTensorRTProviderOptionsV2* ep_option = nullptr;

View file

@ -11,6 +11,8 @@
#include "core/providers/tensorrt/tensorrt_execution_provider_utils.h"
#include <string>
#include <thread>
#include <filesystem>
#include <chrono>
using namespace std;
using namespace ONNX_NAMESPACE;
@ -151,6 +153,9 @@ void RunWithOneSessionSingleThreadInference(std::string model_name, std::string
nullptr,
0,
0,
0,
0,
0,
0};
params.trt_engine_cache_enable = 1;
@ -222,6 +227,9 @@ void RunWithOneSessionMultiThreadsInference(std::string model_name, std::string
nullptr,
0,
0,
0,
0,
0,
0};
params.trt_engine_cache_enable = 1;
@ -386,6 +394,9 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) {
nullptr,
0,
0,
0,
0,
0,
0};
if (cache_type.compare("engine") == 0) {
@ -498,9 +509,68 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) {
}
} else if (cache_type.compare("timing") == 0) {
// add test code here
/* Following code block tests the functionality of the timing cache of ORT TRT, including:
* - timing cache serialization/de-serialization
* - benefit of using a timing cache no matter if dynamic / static input
*/
uint64_t compilation_without_cache_ms, compilation_with_cache_ms;
params.trt_timing_cache_enable = 1;
// std::chrono
{
auto start = chrono::steady_clock::now();
std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params);
EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK());
auto status = session_object.Load(model_name);
ASSERT_TRUE(status.IsOK());
status = session_object.Initialize();
ASSERT_TRUE(status.IsOK());
// run inference
// TRT timing cache should be created under the situation of non-dynamic/dynamic shape input
status = session_object.Run(run_options, feeds, output_names, &fetches);
auto end = chrono::steady_clock::now();
ASSERT_TRUE(status.IsOK());
VerifyOutputs(fetches, expected_dims_mul_m, expected_values_mul_m);
ASSERT_TRUE(IsCacheExistedByType("./", ".timing"));
compilation_without_cache_ms = chrono::duration_cast<chrono::microseconds>(end - start).count();
}
// get new session and reinitialize model
// the second, identical inference should reuse the cache and therefore have a faster build
if (input_type.compare("static") == 0) {
{
InferenceSession session_object_new{so, GetEnvironment()};
{
auto start = chrono::steady_clock::now();
std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params);
EXPECT_TRUE(session_object_new.RegisterExecutionProvider(std::move(execution_provider)).IsOK());
auto status = session_object_new.Load(model_name);
ASSERT_TRUE(status.IsOK());
status = session_object_new.Initialize();
ASSERT_TRUE(status.IsOK());
// run inference
// TRT timing cache should be created under the situation of non-dynamic/dynamic shape input
status = session_object_new.Run(run_options, feeds, output_names, &fetches);
// TODO narrow down actual compilation section
auto end = chrono::steady_clock::now();
ASSERT_TRUE(status.IsOK());
VerifyOutputs(fetches, expected_dims_mul_m, expected_values_mul_m);
ASSERT_TRUE(IsCacheExistedByType("./", ".timing"));
compilation_with_cache_ms = chrono::duration_cast<chrono::microseconds>(end - start).count();
}
}
ASSERT_TRUE(compilation_with_cache_ms <= compilation_without_cache_ms);
} else {
// TODO test dynamic shapes
}
}
// clean up caches
RemoveCachesByType("./", ".timing");
RemoveCachesByType("./", ".engine");
RemoveCachesByType("./", ".profile");
}
@ -515,11 +585,13 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) {
* We have following test parameters:
* - engine_static: engine cache enabled with non-dynamic input shape
* - engine_dynamic: engine cache enabled with dynamic input shape
* - timing_static: will be added
* - timing_dynamic: will be added
* - timing_static: timing cache enabled, static input shape
* - timing_dynamic: timing cache enabled, dynamic input shape
*/
INSTANTIATE_TEST_SUITE_P(TensorrtExecutionProviderCacheTests, TensorrtExecutionProviderCacheTest, testing::Values("engine_static",
"engine_dynamic"),
"engine_dynamic",
"timing_static",
"timing_dynamic"),
[](const ::testing::TestParamInfo<TensorrtExecutionProviderCacheTest::ParamType>& info) {return info.param;});
TEST(TensorrtExecutionProviderTest, FunctionTest) {