mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-03 23:49:44 +00:00
TensorRT EP - timing cache (#14767)
### Description This will enable a user to use a TensorRT timing cache based on #10297 to accelerate build times on a device with the same compute capability. This will work across models as it simply stores kernel runtimes for specific configurations. Those files are usually very small (only a few MB), which makes them very easy to ship with an application to accelerate the build time on the user end. ### Motivation and Context Especially for workstation use cases, TRT build times can be a roadblock. With a few models from the ONNX model zoo I evaluated speedups when a timing cache is present. `./build/onnxruntime_perf_test -e tensorrt -I -t 5 -i "trt_timing_cache_enable|true" <onnx_path>` |Model | no Cache | with Cache| | ------------- | ------------- | ------------- | |efficientnet-lite4-11 | 34.6 s | 7.7 s| |yolov4 | 108.62 s | 9.4 s| To capture this I had to modify the onnxruntime_perf_test. The time is sometimes not captured within "Session creation time cost:", which is why I introduced "First inference time cost:". --------- Co-authored-by: Chi Lo <Chi.Lo@microsoft.com>
This commit is contained in:
parent
acbb7ad453
commit
ad4db12699
15 changed files with 303 additions and 18 deletions
|
|
@ -455,7 +455,7 @@ if (onnxruntime_USE_CUDA)
|
|||
list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDA_HOME}/x64/lib64)
|
||||
else()
|
||||
if(onnxruntime_CUDNN_HOME)
|
||||
list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib64)
|
||||
list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib ${onnxruntime_CUDNN_HOME}/lib64)
|
||||
endif()
|
||||
list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDA_HOME}/lib64)
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@
|
|||
|
||||
/// <summary>
|
||||
/// Options for the TensorRT provider that are passed to SessionOptionsAppendExecutionProvider_TensorRT_V2.
|
||||
/// Please note that this struct is *similar* to OrtTensorRTProviderOptions but only to be used internally.
|
||||
/// Please note that this struct is *similar* to OrtTensorRTProviderOptions but only to be used internally.
|
||||
/// Going forward, new trt provider options are to be supported via this struct and usage of the publicly defined
|
||||
/// OrtTensorRTProviderOptions will be deprecated over time.
|
||||
/// User can only get the instance of OrtTensorRTProviderOptionsV2 via CreateTensorRTProviderOptions.
|
||||
|
|
@ -31,4 +31,7 @@ struct OrtTensorRTProviderOptionsV2 {
|
|||
int trt_force_sequential_engine_build; // force building TensorRT engine sequentially. Default 0 = false, nonzero = true
|
||||
int trt_context_memory_sharing_enable; // enable context memory sharing between subgraphs. Default 0 = false, nonzero = true
|
||||
int trt_layer_norm_fp32_fallback; // force Pow + Reduce ops in layer norm to FP32. Default 0 = false, nonzero = true
|
||||
int trt_timing_cache_enable; // enable TensorRT timing cache. Default 0 = false, nonzero = true
|
||||
int trt_force_timing_cache; // force the TensorRT cache to be used even if device profile does not match. Default 0 = false, nonzero = true
|
||||
int trt_detailed_build_log; // Enable detailed build step logging on TensorRT EP with timing for each engine build. Default 0 = false, nonzero = true
|
||||
};
|
||||
|
|
|
|||
|
|
@ -117,6 +117,32 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_map<s
|
|||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
inline std::vector<char> loadTimingCacheFile(const std::string inFileName) {
|
||||
std::ifstream iFile(inFileName, std::ios::in | std::ios::binary);
|
||||
if (!iFile) {
|
||||
LOGS_DEFAULT(WARNING) << "[TensorRT EP] Could not read timing cache from: " << inFileName
|
||||
<< ". A new timing cache will be generated and written.";
|
||||
return std::vector<char>();
|
||||
}
|
||||
iFile.seekg(0, std::ifstream::end);
|
||||
size_t fsize = iFile.tellg();
|
||||
iFile.seekg(0, std::ifstream::beg);
|
||||
std::vector<char> content(fsize);
|
||||
iFile.read(content.data(), fsize);
|
||||
iFile.close();
|
||||
return content;
|
||||
}
|
||||
|
||||
// Writes the serialized timing cache held by `blob` to `outFileName` as a
// binary file.
//
// Saving is best effort: a null `blob`, a file that cannot be opened, or a
// failed write only produces a warning and never throws.
inline void saveTimingCacheFile(const std::string& outFileName, const nvinfer1::IHostMemory* blob) {
  if (blob == nullptr) {
    LOGS_DEFAULT(WARNING) << "[TensorRT EP] No timing cache data to write to: " << outFileName;
    return;
  }

  std::ofstream oFile(outFileName, std::ios::out | std::ios::binary);
  if (!oFile) {
    LOGS_DEFAULT(WARNING) << "[TensorRT EP] Could not write timing cache to: " << outFileName;
    return;
  }

  oFile.write(static_cast<const char*>(blob->data()), static_cast<std::streamsize>(blob->size()));
  // close() flushes the stream; check afterwards so that write and flush
  // failures are both reported.
  oFile.close();
  if (!oFile) {
    LOGS_DEFAULT(WARNING) << "[TensorRT EP] Could not write timing cache to: " << outFileName;
  }
}
|
||||
} // namespace
|
||||
|
||||
namespace google {
|
||||
|
|
@ -312,7 +338,10 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
|
|||
}
|
||||
dump_subgraphs_ = info.dump_subgraphs;
|
||||
engine_cache_enable_ = info.engine_cache_enable;
|
||||
if (engine_cache_enable_ || int8_enable_) {
|
||||
timing_cache_enable_ = info.timing_cache_enable;
|
||||
force_timing_cache_match_ = info.force_timing_cache;
|
||||
detailed_build_log_ = info.detailed_build_log;
|
||||
if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) {
|
||||
cache_path_ = info.engine_cache_path;
|
||||
}
|
||||
engine_decryption_enable_ = info.engine_decryption_enable;
|
||||
|
|
@ -386,7 +415,22 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
|
|||
engine_cache_enable_ = (std::stoi(engine_cache_enable_env) == 0 ? false : true);
|
||||
}
|
||||
|
||||
if (engine_cache_enable_ || int8_enable_) {
|
||||
const std::string timing_cache_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kTimingCacheEnable);
|
||||
if (!timing_cache_enable_env.empty()) {
|
||||
timing_cache_enable_ = (std::stoi(timing_cache_enable_env) == 0 ? false : true);
|
||||
}
|
||||
|
||||
const std::string detailed_build_log_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDetailedBuildLog);
|
||||
if (!detailed_build_log_env.empty()) {
|
||||
detailed_build_log_ = (std::stoi(detailed_build_log_env) == 0 ? false : true);
|
||||
}
|
||||
|
||||
const std::string timing_force_match_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kForceTimingCache);
|
||||
if (!timing_force_match_env.empty()) {
|
||||
force_timing_cache_match_ = (std::stoi(timing_force_match_env) == 0 ? false : true);
|
||||
}
|
||||
|
||||
if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) {
|
||||
const std::string engine_cache_path = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kEngineCachePath);
|
||||
cache_path_ = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kCachePath);
|
||||
if (!engine_cache_path.empty() && cache_path_.empty()) {
|
||||
|
|
@ -438,7 +482,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
|
|||
dla_core_ = 0;
|
||||
}
|
||||
|
||||
if (engine_cache_enable_ || int8_enable_) {
|
||||
if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) {
|
||||
if (!cache_path_.empty() && !fs::is_directory(cache_path_)) {
|
||||
if (!fs::create_directory(cache_path_)) {
|
||||
throw std::runtime_error("Failed to create directory " + cache_path_);
|
||||
|
|
@ -1373,6 +1417,12 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
|
|||
if (!has_dynamic_shape) {
|
||||
const std::string cache_path = GetCachePath(cache_path_, trt_node_name_with_precision);
|
||||
const std::string engine_cache_path = cache_path + ".engine";
|
||||
std::string timing_cache_path = "";
|
||||
if (timing_cache_enable_) {
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_));
|
||||
timing_cache_path = GetTimingCachePath(cache_path_, prop);
|
||||
}
|
||||
{
|
||||
// ifstream file check, engine serialization/deserialization and engine build are in critical section. It needs lock protection to prevent race condition when inferencing with multithreading.
|
||||
auto lock = GetApiLock();
|
||||
|
|
@ -1419,12 +1469,36 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
|
|||
}
|
||||
}
|
||||
|
||||
// Load timing cache from file. Create a fresh cache if the file doesn't exist
|
||||
std::unique_ptr<nvinfer1::ITimingCache> timing_cache = nullptr;
|
||||
if (timing_cache_enable_) {
|
||||
std::vector<char> loaded_timing_cache = loadTimingCacheFile(timing_cache_path);
|
||||
timing_cache.reset(trt_config->createTimingCache(static_cast<const void*>(loaded_timing_cache.data()), loaded_timing_cache.size()));
|
||||
if (timing_cache == nullptr) {
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
|
||||
"TensorRT EP could not create timing cache: " + timing_cache_path);
|
||||
}
|
||||
trt_config->setTimingCache(*timing_cache, force_timing_cache_match_);
|
||||
if (detailed_build_log_) {
|
||||
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Deserialized timing cache from " + timing_cache_path;
|
||||
}
|
||||
}
|
||||
|
||||
// Build engine
|
||||
std::chrono::steady_clock::time_point engine_build_start;
|
||||
if (detailed_build_log_) {
|
||||
engine_build_start = std::chrono::steady_clock::now();
|
||||
}
|
||||
trt_engine = std::unique_ptr<nvinfer1::ICudaEngine>(trt_builder->buildEngineWithConfig(*trt_network, *trt_config));
|
||||
if (trt_engine == nullptr) {
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
|
||||
"TensorRT EP could not build engine for fused node: " + fused_node.Name());
|
||||
}
|
||||
if (detailed_build_log_) {
|
||||
auto engine_build_stop = std::chrono::steady_clock::now();
|
||||
LOGS_DEFAULT(INFO) << "TensorRT engine build for " << trt_node_name_with_precision << " took: " <<
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(engine_build_stop - engine_build_start).count() << "ms" << std::endl;
|
||||
}
|
||||
if (engine_cache_enable_) {
|
||||
std::unique_ptr<nvinfer1::IHostMemory> serializedModel(trt_engine->serialize());
|
||||
size_t engine_size = serializedModel->size();
|
||||
|
|
@ -1438,7 +1512,20 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
|
|||
std::ofstream file(engine_cache_path, std::ios::binary | std::ios::out);
|
||||
file.write(reinterpret_cast<char*>(serializedModel->data()), engine_size);
|
||||
}
|
||||
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path;
|
||||
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized engine " + engine_cache_path;
|
||||
}
|
||||
// serialize and save timing cache
|
||||
if (timing_cache_enable_) {
|
||||
auto timing_cache = trt_config->getTimingCache();
|
||||
std::unique_ptr<nvinfer1::IHostMemory> timingCacheHostData{timing_cache->serialize()};
|
||||
if (timingCacheHostData == nullptr) {
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
|
||||
"TensorRT EP could not serialize timing cache: " + timing_cache_path);
|
||||
}
|
||||
saveTimingCacheFile(timing_cache_path, timingCacheHostData.get());
|
||||
if (detailed_build_log_) {
|
||||
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized timing cache " + timing_cache_path;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1504,7 +1591,8 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
|
|||
input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_,
|
||||
dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_,
|
||||
runtime_.get(), nullptr, allocator_, context_memory_sharing_enable_, &max_ctx_mem_size_, &context_memory_,
|
||||
dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_};
|
||||
dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_, timing_cache_enable_,
|
||||
force_timing_cache_match_, detailed_build_log_};
|
||||
*state = p.release();
|
||||
return 0;
|
||||
};
|
||||
|
|
@ -1545,6 +1633,12 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
|
|||
const std::string cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision);
|
||||
const std::string engine_cache_path = cache_path + ".engine";
|
||||
const std::string profile_cache_path = cache_path + ".profile";
|
||||
std::string timing_cache_path = "";
|
||||
if (timing_cache_enable_) {
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_));
|
||||
timing_cache_path = GetTimingCachePath(cache_path_, prop);
|
||||
}
|
||||
if (trt_state->engine_cache_enable && trt_engine == nullptr) {
|
||||
std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in);
|
||||
std::ifstream profile_file(profile_cache_path, std::ios::binary | std::ios::in);
|
||||
|
|
@ -1779,11 +1873,35 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
|
|||
trt_config->setDLACore(trt_state->dla_core);
|
||||
}
|
||||
|
||||
// Load timing cache from file. Create a fresh cache if the file doesn't exist
|
||||
std::unique_ptr<nvinfer1::ITimingCache> timing_cache = nullptr;
|
||||
if (trt_state->timing_cache_enable) {
|
||||
std::vector<char> loaded_timing_cache = loadTimingCacheFile(timing_cache_path);
|
||||
timing_cache.reset(trt_config->createTimingCache(static_cast<const void*>(loaded_timing_cache.data()), loaded_timing_cache.size()));
|
||||
if (timing_cache == nullptr) {
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
|
||||
"TensorRT EP could not create timing cache: " + timing_cache_path);
|
||||
}
|
||||
trt_config->setTimingCache(*timing_cache, force_timing_cache_match_);
|
||||
if (detailed_build_log_) {
|
||||
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Deserialized timing cache from " + timing_cache_path;
|
||||
}
|
||||
}
|
||||
|
||||
// Build engine
|
||||
{
|
||||
auto lock = GetApiLock();
|
||||
std::chrono::steady_clock::time_point engine_build_start;
|
||||
if (detailed_build_log_) {
|
||||
engine_build_start = std::chrono::steady_clock::now();
|
||||
}
|
||||
*(trt_state->engine) = std::unique_ptr<nvinfer1::ICudaEngine>(
|
||||
trt_builder->buildEngineWithConfig(*trt_state->network->get(), *trt_config));
|
||||
if (detailed_build_log_) {
|
||||
auto engine_build_stop = std::chrono::steady_clock::now();
|
||||
LOGS_DEFAULT(INFO) << "TensorRT engine build for " << trt_state->trt_node_name_with_precision << " took: " <<
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(engine_build_stop - engine_build_start).count() << "ms" << std::endl;
|
||||
}
|
||||
}
|
||||
if (trt_state->engine == nullptr) {
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine.");
|
||||
|
|
@ -1809,6 +1927,20 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
|
|||
}
|
||||
}
|
||||
|
||||
// serialize and save timing cache
|
||||
if (trt_state->timing_cache_enable) {
|
||||
auto timing_cache = trt_config->getTimingCache();
|
||||
std::unique_ptr<nvinfer1::IHostMemory> timingCacheHostData{timing_cache->serialize()};
|
||||
if (timingCacheHostData == nullptr) {
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
|
||||
"TensorRT EP could not serialize timing cache: " + timing_cache_path);
|
||||
}
|
||||
saveTimingCacheFile(timing_cache_path, timingCacheHostData.get());
|
||||
if (detailed_build_log_) {
|
||||
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized timing cache " + timing_cache_path;
|
||||
}
|
||||
}
|
||||
|
||||
// Build context
|
||||
if (trt_state->context_memory_sharing_enable) {
|
||||
*(trt_state->context) = std::unique_ptr<nvinfer1::IExecutionContext>(
|
||||
|
|
|
|||
|
|
@ -30,6 +30,9 @@ static const std::string kDecryptionLibPath = "ORT_TENSORRT_ENGINE_DECRYPTION_LI
|
|||
static const std::string kForceSequentialEngineBuild = "ORT_TENSORRT_FORCE_SEQUENTIAL_ENGINE_BUILD";
|
||||
static const std::string kContextMemorySharingEnable = "ORT_TENSORRT_CONTEXT_MEMORY_SHARING_ENABLE";
|
||||
static const std::string kLayerNormFP32Fallback = "ORT_TENSORRT_LAYER_NORM_FP32_FALLBACK";
|
||||
static const std::string kTimingCacheEnable = "ORT_TENSORRT_TIMING_CACHE_ENABLE";
|
||||
static const std::string kForceTimingCache = "ORT_TENSORRT_FORCE_TIMING_CACHE_ENABLE";
|
||||
static const std::string kDetailedBuildLog = "ORT_TENSORRT_DETAILED_BUILD_LOG_ENABLE";
|
||||
// Old env variable for backward compatibility
|
||||
static const std::string kEngineCachePath = "ORT_TENSORRT_ENGINE_CACHE_PATH";
|
||||
} // namespace tensorrt_env_vars
|
||||
|
|
@ -114,6 +117,9 @@ struct TensorrtFuncState {
|
|||
bool engine_decryption_enable = false;
|
||||
int (*engine_decryption)(const char*, char*, size_t*) = nullptr;
|
||||
int (*engine_encryption)(const char*, char*, size_t) = nullptr;
|
||||
bool timing_cache_enable = true;
|
||||
bool force_timing_cache = false;
|
||||
bool detailed_build_log = false;
|
||||
};
|
||||
|
||||
// Logical device representation.
|
||||
|
|
@ -176,6 +182,9 @@ class TensorrtExecutionProvider : public IExecutionProvider {
|
|||
bool engine_decryption_enable_ = false;
|
||||
int (*engine_decryption_)(const char*, char*, size_t*) = nullptr;
|
||||
int (*engine_encryption_)(const char*, char*, size_t) = nullptr;
|
||||
bool timing_cache_enable_ = false;
|
||||
bool force_timing_cache_match_ = false;
|
||||
bool detailed_build_log_ = false;
|
||||
|
||||
std::unordered_set<std::string> control_flow_op_set_ = {"If", "Loop", "Scan"};
|
||||
std::unordered_map<std::string, tensorrt_ptr::unique_pointer<nvonnxparser::IParser>> parsers_;
|
||||
|
|
|
|||
|
|
@ -27,11 +27,14 @@ constexpr const char* kCachePath = "trt_engine_cache_path";
|
|||
constexpr const char* kDecryptionEnable = "trt_engine_decryption_enable";
|
||||
constexpr const char* kDecryptionLibPath = "trt_engine_decryption_lib_path";
|
||||
constexpr const char* kForceSequentialEngineBuild = "trt_force_sequential_engine_build";
|
||||
// add new provider option name here.
|
||||
// add new provider option name here.
|
||||
constexpr const char* kContextMemorySharingEnable = "trt_context_memory_sharing_enable";
|
||||
constexpr const char* kLayerNormFP32Fallback = "trt_layer_norm_fp32_fallback";
|
||||
constexpr const char* kTimingCacheEnable = "trt_timing_cache_enable";
|
||||
constexpr const char* kForceTimingCacheMatch = "trt_force_timing_cache_match";
|
||||
constexpr const char* kDetailedBuildLog = "trt_detailed_build_log";
|
||||
} // namespace provider_option_names
|
||||
} // namespace tensorrt
|
||||
} // namespace tensorrt
|
||||
|
||||
TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions(const ProviderOptions& options) {
|
||||
TensorrtExecutionProviderInfo info{};
|
||||
|
|
@ -57,15 +60,17 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions
|
|||
.AddAssignmentToReference(tensorrt::provider_option_names::kInt8CalibTable, info.int8_calibration_table_name)
|
||||
.AddAssignmentToReference(tensorrt::provider_option_names::kInt8UseNativeCalibTable, info.int8_use_native_calibration_table)
|
||||
.AddAssignmentToReference(tensorrt::provider_option_names::kDLAEnable, info.dla_enable)
|
||||
.AddAssignmentToReference(tensorrt::provider_option_names::kDLACore, info.dla_core)
|
||||
.AddAssignmentToReference(tensorrt::provider_option_names::kDLACore, info.dla_core)
|
||||
.AddAssignmentToReference(tensorrt::provider_option_names::kDumpSubgraphs, info.dump_subgraphs)
|
||||
.AddAssignmentToReference(tensorrt::provider_option_names::kEngineCacheEnable, info.engine_cache_enable)
|
||||
.AddAssignmentToReference(tensorrt::provider_option_names::kCachePath, info.engine_cache_path)
|
||||
.AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionEnable, info.engine_decryption_enable)
|
||||
.AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionLibPath, info.engine_decryption_lib_path)
|
||||
.AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionLibPath, info.engine_decryption_lib_path)
|
||||
.AddAssignmentToReference(tensorrt::provider_option_names::kForceSequentialEngineBuild, info.force_sequential_engine_build)
|
||||
.AddAssignmentToReference(tensorrt::provider_option_names::kContextMemorySharingEnable, info.context_memory_sharing_enable)
|
||||
.AddAssignmentToReference(tensorrt::provider_option_names::kLayerNormFP32Fallback, info.layer_norm_fp32_fallback)
|
||||
.AddAssignmentToReference(tensorrt::provider_option_names::kTimingCacheEnable, info.timing_cache_enable)
|
||||
.AddAssignmentToReference(tensorrt::provider_option_names::kForceTimingCacheMatch, info.force_timing_cache)
|
||||
.Parse(options)); // add new provider option here.
|
||||
|
||||
return info;
|
||||
|
|
@ -93,6 +98,7 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const TensorrtE
|
|||
// add new provider option here.
|
||||
{tensorrt::provider_option_names::kContextMemorySharingEnable, MakeStringWithClassicLocale(info.context_memory_sharing_enable)},
|
||||
{tensorrt::provider_option_names::kLayerNormFP32Fallback, MakeStringWithClassicLocale(info.layer_norm_fp32_fallback)},
|
||||
{tensorrt::provider_option_names::kTimingCacheEnable, MakeStringWithClassicLocale(info.timing_cache_enable)},
|
||||
};
|
||||
return options;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -17,10 +17,10 @@ struct TensorrtExecutionProviderInfo {
|
|||
void* user_compute_stream{nullptr};
|
||||
bool has_trt_options{false};
|
||||
int max_partition_iterations{1000};
|
||||
int min_subgraph_size{1};
|
||||
int min_subgraph_size{1};
|
||||
size_t max_workspace_size{1 << 30};
|
||||
bool fp16_enable{false};
|
||||
bool int8_enable{false};
|
||||
bool int8_enable{false};
|
||||
std::string int8_calibration_table_name{""};
|
||||
bool int8_use_native_calibration_table{false};
|
||||
bool dla_enable{false};
|
||||
|
|
@ -33,6 +33,9 @@ struct TensorrtExecutionProviderInfo {
|
|||
bool force_sequential_engine_build{false};
|
||||
bool context_memory_sharing_enable{false};
|
||||
bool layer_norm_fp32_fallback{false};
|
||||
bool timing_cache_enable{false};
|
||||
bool force_timing_cache{false};
|
||||
bool detailed_build_log{false};
|
||||
|
||||
static TensorrtExecutionProviderInfo FromProviderOptions(const ProviderOptions& options);
|
||||
static ProviderOptions ToProviderOptions(const TensorrtExecutionProviderInfo& info);
|
||||
|
|
|
|||
|
|
@ -168,6 +168,17 @@ std::string GetCachePath(const std::string& root, const std::string& name) {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Get Timing by compute capability
|
||||
*
|
||||
*/
|
||||
std::string GetTimingCachePath(const std::string& root, cudaDeviceProp prop) {
|
||||
// append compute capability of the GPU as this invalidates the cache and TRT will throw when loading the cache
|
||||
const std::string timing_cache_name = "TensorrtExecutionProvider_cache_cc" +
|
||||
std::to_string(prop.major * 10 + prop.minor) + ".timing";
|
||||
return GetCachePath(root, timing_cache_name);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get cache by type
|
||||
*
|
||||
|
|
|
|||
|
|
@ -70,6 +70,8 @@ struct Tensorrt_Provider : Provider {
|
|||
info.force_sequential_engine_build = options.trt_force_sequential_engine_build != 0;
|
||||
info.context_memory_sharing_enable = options.trt_context_memory_sharing_enable != 0;
|
||||
info.layer_norm_fp32_fallback = options.trt_layer_norm_fp32_fallback != 0;
|
||||
info.timing_cache_enable = options.trt_timing_cache_enable != 0;
|
||||
info.detailed_build_log = options.trt_detailed_build_log != 0;
|
||||
return std::make_shared<TensorrtProviderFactory>(info);
|
||||
}
|
||||
|
||||
|
|
@ -137,6 +139,8 @@ struct Tensorrt_Provider : Provider {
|
|||
trt_options.trt_force_sequential_engine_build = internal_options.force_sequential_engine_build;
|
||||
trt_options.trt_context_memory_sharing_enable = internal_options.context_memory_sharing_enable;
|
||||
trt_options.trt_layer_norm_fp32_fallback = internal_options.layer_norm_fp32_fallback;
|
||||
trt_options.trt_timing_cache_enable = internal_options.timing_cache_enable;
|
||||
trt_options.trt_force_timing_cache = internal_options.force_timing_cache;
|
||||
}
|
||||
|
||||
ProviderOptions GetProviderOptions(const void* provider_options) override {
|
||||
|
|
|
|||
|
|
@ -1274,7 +1274,10 @@ OrtTensorRTProviderOptionsV2 OrtTensorRTProviderOptionsToOrtTensorRTProviderOpti
|
|||
trt_options_converted.trt_engine_decryption_lib_path = legacy_trt_options->trt_engine_decryption_lib_path;
|
||||
trt_options_converted.trt_force_sequential_engine_build = legacy_trt_options->trt_force_sequential_engine_build;
|
||||
// Add new provider option below
|
||||
// Use default value as this field is not available in OrtTensorRTProviderOptionsV
|
||||
// Use default value as this field is not available in OrtTensorRTProviderOptions
|
||||
trt_options_converted.trt_timing_cache_enable = 0;
|
||||
trt_options_converted.trt_force_timing_cache = 0;
|
||||
trt_options_converted.trt_detailed_build_log = 0;
|
||||
trt_options_converted.trt_context_memory_sharing_enable = 0;
|
||||
trt_options_converted.trt_layer_norm_fp32_fallback = 0;
|
||||
return trt_options_converted;
|
||||
|
|
@ -1601,6 +1604,9 @@ ORT_API_STATUS_IMPL(OrtApis::CreateTensorRTProviderOptions, _Outptr_ OrtTensorRT
|
|||
(*out)->trt_force_sequential_engine_build = false;
|
||||
(*out)->trt_context_memory_sharing_enable = false;
|
||||
(*out)->trt_layer_norm_fp32_fallback = false;
|
||||
(*out)->trt_timing_cache_enable = false;
|
||||
(*out)->trt_force_timing_cache = false;
|
||||
(*out)->trt_detailed_build_log = false;
|
||||
return nullptr;
|
||||
#else
|
||||
ORT_UNUSED_PARAMETER(out);
|
||||
|
|
|
|||
|
|
@ -368,6 +368,9 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
|
|||
nullptr,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0};
|
||||
for (auto option : it->second) {
|
||||
if (option.first == "device_id") {
|
||||
|
|
|
|||
|
|
@ -121,6 +121,9 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
|
|||
bool trt_force_sequential_engine_build = false;
|
||||
bool trt_context_memory_sharing_enable = false;
|
||||
bool trt_layer_norm_fp32_fallback = false;
|
||||
bool trt_timing_cache_enable = false;
|
||||
bool trt_force_timing_cache = false;
|
||||
bool trt_detailed_build_log = false;
|
||||
|
||||
#ifdef _MSC_VER
|
||||
std::string ov_string = ToUTF8String(performance_test_config.run_config.ep_runtime_config_string);
|
||||
|
|
@ -268,6 +271,30 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
|
|||
} else {
|
||||
ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_layer_norm_fp32_fallback' should be a boolean i.e. true or false. Default value is false.\n");
|
||||
}
|
||||
} else if (key == "trt_timing_cache_enable") {
|
||||
if (value == "true" || value == "True") {
|
||||
trt_timing_cache_enable = true;
|
||||
} else if (value == "false" || value == "False") {
|
||||
trt_timing_cache_enable = false;
|
||||
} else {
|
||||
ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_timing_cache_enable' should be a boolean i.e. true or false. Default value is false.\n");
|
||||
}
|
||||
} else if (key == "trt_force_timing_cache") {
|
||||
if (value == "true" || value == "True") {
|
||||
trt_force_timing_cache = true;
|
||||
} else if (value == "false" || value == "False") {
|
||||
trt_force_timing_cache = false;
|
||||
} else {
|
||||
ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_force_timing_cache' should be a boolean i.e. true or false. Default value is false.\n");
|
||||
}
|
||||
} else if (key == "trt_detailed_build_log") {
|
||||
if (value == "true" || value == "True") {
|
||||
trt_detailed_build_log = true;
|
||||
} else if (value == "false" || value == "False") {
|
||||
trt_detailed_build_log = false;
|
||||
} else {
|
||||
ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_detailed_build_log' should be a boolean i.e. true or false. Default value is false.\n");
|
||||
}
|
||||
} else {
|
||||
ORT_THROW("[ERROR] [TensorRT] wrong key type entered. Choose from the following runtime key options that are available for TensorRT. ['device_id', 'trt_max_partition_iterations', 'trt_min_subgraph_size', 'trt_max_workspace_size', 'trt_fp16_enable', 'trt_int8_enable', 'trt_int8_calibration_table_name', 'trt_int8_use_native_calibration_table', 'trt_dla_enable', 'trt_dla_core', 'trt_dump_subgraphs', 'trt_engine_cache_enable', 'trt_engine_cache_path', 'trt_engine_decryption_enable', 'trt_engine_decryption_lib_path', 'trt_force_sequential_engine_build', 'trt_context_memory_sharing_enable', 'trt_layer_norm_fp32_fallback'] \n");
|
||||
}
|
||||
|
|
@ -293,6 +320,9 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
|
|||
tensorrt_options.trt_force_sequential_engine_build = trt_force_sequential_engine_build;
|
||||
tensorrt_options.trt_context_memory_sharing_enable = trt_context_memory_sharing_enable;
|
||||
tensorrt_options.trt_layer_norm_fp32_fallback = trt_layer_norm_fp32_fallback;
|
||||
tensorrt_options.trt_timing_cache_enable = trt_timing_cache_enable;
|
||||
tensorrt_options.trt_force_timing_cache = trt_force_timing_cache;
|
||||
tensorrt_options.trt_detailed_build_log = trt_detailed_build_log;
|
||||
session_options.AppendExecutionProvider_TensorRT_V2(tensorrt_options);
|
||||
|
||||
OrtCUDAProviderOptions cuda_options;
|
||||
|
|
|
|||
|
|
@ -114,7 +114,9 @@ Status PerformanceRunner::Run() {
|
|||
}
|
||||
|
||||
// warm up
|
||||
initial_inference_result_.start = std::chrono::high_resolution_clock::now();
|
||||
ORT_RETURN_IF_ERROR(RunOneIteration<true>());
|
||||
initial_inference_result_.end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// TODO: start profiling
|
||||
// if (!performance_test_config_.run_config.profile_file.empty())
|
||||
|
|
@ -139,9 +141,12 @@ Status PerformanceRunner::Run() {
|
|||
std::chrono::duration<double> session_create_duration = session_create_end_ - session_create_start_;
|
||||
// TODO: end profiling
|
||||
// if (!performance_test_config_.run_config.profile_file.empty()) session_object->EndProfiling();
|
||||
auto first_inference_duration =
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(initial_inference_result_.end - initial_inference_result_.start).count();
|
||||
std::chrono::duration<double> inference_duration = performance_result_.end - performance_result_.start;
|
||||
|
||||
std::cout << "Session creation time cost: " << session_create_duration.count() << " s\n"
|
||||
<< "First inference time cost: " << first_inference_duration << " ms\n"
|
||||
<< "Total inference time cost: " << performance_result_.total_time_cost << " s\n" // sum of time taken by each request
|
||||
<< "Total inference requests: " << performance_result_.time_costs.size() << "\n"
|
||||
<< "Average inference time cost: " << performance_result_.total_time_cost / performance_result_.time_costs.size() * 1000 << " ms\n"
|
||||
|
|
|
|||
|
|
@ -106,6 +106,7 @@ class PerformanceRunner {
|
|||
private:
|
||||
std::chrono::time_point<std::chrono::high_resolution_clock> session_create_start_;
|
||||
std::chrono::time_point<std::chrono::high_resolution_clock> session_create_end_;
|
||||
PerformanceResult initial_inference_result_;
|
||||
PerformanceResult performance_result_;
|
||||
PerformanceTestConfig performance_test_config_;
|
||||
std::unique_ptr<TestModelInfo> test_model_info_;
|
||||
|
|
|
|||
|
|
@ -683,7 +683,7 @@ TEST_P(ModelTest, Run) {
|
|||
if (test_case_name.find(ORT_TSTR("FLOAT16")) != std::string::npos) {
|
||||
OrtTensorRTProviderOptionsV2 params{0, 0, nullptr, 1000, 1, 1 << 30,
|
||||
1, // enable fp16
|
||||
0, nullptr, 0, 0, 0, 0, 0, nullptr, 0, nullptr, 0, 0, 0};
|
||||
0, nullptr, 0, 0, 0, 0, 0, nullptr, 0, nullptr, 0, 0, 0, 0, 0, 0};
|
||||
ortso.AppendExecutionProvider_TensorRT_V2(params);
|
||||
} else {
|
||||
OrtTensorRTProviderOptionsV2* ep_option = nullptr;
|
||||
|
|
|
|||
|
|
@ -11,6 +11,8 @@
|
|||
#include "core/providers/tensorrt/tensorrt_execution_provider_utils.h"
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <filesystem>
|
||||
#include <chrono>
|
||||
|
||||
using namespace std;
|
||||
using namespace ONNX_NAMESPACE;
|
||||
|
|
@ -151,6 +153,9 @@ void RunWithOneSessionSingleThreadInference(std::string model_name, std::string
|
|||
nullptr,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0};
|
||||
|
||||
params.trt_engine_cache_enable = 1;
|
||||
|
|
@ -222,6 +227,9 @@ void RunWithOneSessionMultiThreadsInference(std::string model_name, std::string
|
|||
nullptr,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0};
|
||||
|
||||
params.trt_engine_cache_enable = 1;
|
||||
|
|
@ -386,6 +394,9 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) {
|
|||
nullptr,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0};
|
||||
|
||||
if (cache_type.compare("engine") == 0) {
|
||||
|
|
@ -498,9 +509,68 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) {
|
|||
}
|
||||
} else if (cache_type.compare("timing") == 0) {
|
||||
// add test code here
|
||||
|
||||
/* Following code block tests the functionality of the timing cache of ORT TRT, including:
|
||||
* - timing cache serialization/de-serialization
|
||||
* - benefit of using a timing cache regardless of dynamic / static input
|
||||
*/
|
||||
uint64_t compilation_without_cache_ms, compilation_with_cache_ms;
|
||||
|
||||
params.trt_timing_cache_enable = 1;
|
||||
// std::chrono
|
||||
{
|
||||
auto start = chrono::steady_clock::now();
|
||||
std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(¶ms);
|
||||
EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK());
|
||||
auto status = session_object.Load(model_name);
|
||||
ASSERT_TRUE(status.IsOK());
|
||||
status = session_object.Initialize();
|
||||
ASSERT_TRUE(status.IsOK());
|
||||
|
||||
// run inference
|
||||
// TRT timing cache should be created under the situation of non-dynamic/dynamic shape input
|
||||
status = session_object.Run(run_options, feeds, output_names, &fetches);
|
||||
auto end = chrono::steady_clock::now();
|
||||
ASSERT_TRUE(status.IsOK());
|
||||
VerifyOutputs(fetches, expected_dims_mul_m, expected_values_mul_m);
|
||||
ASSERT_TRUE(IsCacheExistedByType("./", ".timing"));
|
||||
compilation_without_cache_ms = chrono::duration_cast<chrono::microseconds>(end - start).count();
|
||||
}
|
||||
|
||||
// get new session and reinitialize model
|
||||
// a second, identical inference should reuse the cache and therefore have a faster build
|
||||
if (input_type.compare("static") == 0) {
|
||||
{
|
||||
InferenceSession session_object_new{so, GetEnvironment()};
|
||||
{
|
||||
auto start = chrono::steady_clock::now();
|
||||
std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(¶ms);
|
||||
EXPECT_TRUE(session_object_new.RegisterExecutionProvider(std::move(execution_provider)).IsOK());
|
||||
auto status = session_object_new.Load(model_name);
|
||||
ASSERT_TRUE(status.IsOK());
|
||||
status = session_object_new.Initialize();
|
||||
ASSERT_TRUE(status.IsOK());
|
||||
|
||||
// run inference
|
||||
// TRT timing cache should be created under the situation of non-dynamic/dynamic shape input
|
||||
status = session_object_new.Run(run_options, feeds, output_names, &fetches);
|
||||
// TODO narrow down actual compilation section
|
||||
auto end = chrono::steady_clock::now();
|
||||
|
||||
ASSERT_TRUE(status.IsOK());
|
||||
VerifyOutputs(fetches, expected_dims_mul_m, expected_values_mul_m);
|
||||
ASSERT_TRUE(IsCacheExistedByType("./", ".timing"));
|
||||
compilation_with_cache_ms = chrono::duration_cast<chrono::microseconds>(end - start).count();
|
||||
}
|
||||
}
|
||||
ASSERT_TRUE(compilation_with_cache_ms <= compilation_without_cache_ms);
|
||||
} else {
|
||||
// TODO test dynamic shapes
|
||||
}
|
||||
}
|
||||
|
||||
// clean up caches
|
||||
RemoveCachesByType("./", ".timing");
|
||||
RemoveCachesByType("./", ".engine");
|
||||
RemoveCachesByType("./", ".profile");
|
||||
}
|
||||
|
|
@ -515,11 +585,13 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) {
|
|||
* We have following test parameters:
|
||||
* - engine_static: engine cache enabled with non-dynamic input shape
|
||||
* - engine_dynamic: engine cache enabled with dynamic input shape
|
||||
* - timing_static: will be added
|
||||
* - timing_dynamic: will be added
|
||||
* - timing_static: timing cache enabled, static input shape
|
||||
* - timing_dynamic: timing cache enabled, dynamic input shape
|
||||
*/
|
||||
INSTANTIATE_TEST_SUITE_P(TensorrtExecutionProviderCacheTests, TensorrtExecutionProviderCacheTest, testing::Values("engine_static",
|
||||
"engine_dynamic"),
|
||||
"engine_dynamic",
|
||||
"timing_static",
|
||||
"timing_dynamic"),
|
||||
[](const ::testing::TestParamInfo<TensorrtExecutionProviderCacheTest::ParamType>& info) {return info.param;});
|
||||
|
||||
TEST(TensorrtExecutionProviderTest, FunctionTest) {
|
||||
|
|
|
|||
Loading…
Reference in a new issue