From fbe88fccbde56b551ec0fbe5d7616437ae9a07c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20M=C3=BCller?= <44298237+gedoensmax@users.noreply.github.com> Date: Fri, 14 Apr 2023 18:47:36 +0200 Subject: [PATCH] Exposing new TRT build options (#15089) ### Description This will add a few TRT options, some of which are only available on TRT 8.6: - heuristics - sparsity - optimization level (8.6 only) - auxiliary stream (8.6 only) - tactic source selection I am not sure yet which tests I should add for these options. As those are mostly simple TRT flags, I am not sure to what level I should test. For heuristics, something similar to https://github.com/gedoensmax/onnxruntime/blob/44dda08b51e98bd86f6eafd09b7d00c3680c78a8/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc#L510-L538 should be possible, but for all the others we would essentially only be testing whether there is a crash when the option is set. Also, if I forgot an option that would be good to have, feel free to speak up!
--- .../tensorrt/tensorrt_provider_options.h | 6 + .../core/session/onnxruntime_c_api.h | 2 +- .../tensorrt/tensorrt_execution_provider.cc | 184 +++++++++++++++++- .../tensorrt/tensorrt_execution_provider.h | 16 ++ .../tensorrt_execution_provider_info.cc | 29 ++- .../tensorrt_execution_provider_info.h | 7 +- .../tensorrt/tensorrt_provider_factory.cc | 25 ++- .../core/session/provider_bridge_ort.cc | 13 +- .../python/onnxruntime_pybind_state.cc | 7 +- onnxruntime/test/perftest/ort_test_session.cc | 46 ++++- onnxruntime/test/providers/cpu/model_tests.cc | 3 +- .../providers/tensorrt/tensorrt_basic_test.cc | 21 +- onnxruntime/test/shared_lib/test_inference.cc | 24 ++- 13 files changed, 363 insertions(+), 20 deletions(-) diff --git a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h index ddf390db3d..8fc06cf2c2 100644 --- a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h +++ b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h @@ -34,4 +34,10 @@ struct OrtTensorRTProviderOptionsV2 { int trt_timing_cache_enable; // enable TensorRT timing cache. Default 0 = false, nonzero = true int trt_force_timing_cache; // force the TensorRT cache to be used even if device profile does not match. Default 0 = false, nonzero = true int trt_detailed_build_log; // Enable detailed build step logging on TensorRT EP with timing for each engine build. Default 0 = false, nonzero = true + int trt_build_heuristics_enable; // Build engine using heuristics to reduce build time. Default 0 = false, nonzero = true + int trt_sparsity_enable; // Control if sparsity can be used by TRT. Default 0 = false, 1 = true + int trt_builder_optimization_level; // Set the builder optimization level. WARNING: levels below 2 do not guarantee good engine performance, but greatly improve build time. 
Default 2, valid range [0-4] + int trt_auxiliary_streams; // Set maximum number of auxiliary streams per inference stream. Setting this value to 0 will lead to optimal memory usage. Default -1 = heuristics + const char* trt_tactic_sources; // pecify the tactics to be used by adding (+) or removing (-) tactics from the default + // tactic sources (default = all available tactics) e.g. "-CUDNN,+CUBLAS" available keys: "CUBLAS"|"CUBLAS_LT"|"CUDNN"|"EDGE_MASK_CONVOLUTIONS" }; diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index b6e313bd20..7ce75d4622 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -2873,7 +2873,7 @@ struct OrtApi { * * For example, "trt_max_workspace_size=2147483648;trt_max_partition_iterations=10;trt_int8_enable=1;......" * - * \param tensorrt_options - OrTensorRTProviderOptionsV2 instance + * \param tensorrt_options - OrtTensorRTProviderOptionsV2 instance * \param allocator - a ptr to an instance of OrtAllocator obtained with OrtApi::CreateAllocator or OrtApi::GetAllocatorWithDefaultOptions * the specified allocator will be used to allocate continuous buffers for output strings and lengths. * \param ptr - is a UTF-8 null terminated string allocated using 'allocator'. The caller is responsible for using the same allocator to free it. 
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 6502719eaa..7cf7898d2d 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -118,6 +118,67 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_map SplitToStringVec(std::string const& s, char separator) { + std::vector splitted; + + for (size_t start = 0; start < s.length();) { + size_t separatorIndex = s.find(separator, start); + if (separatorIndex == std::string::npos) { + separatorIndex = s.length(); + } + splitted.emplace_back(s.substr(start, separatorIndex - start)); + start = separatorIndex + 1; + } + + return splitted; +} + +nvinfer1::TacticSources GetTacticSourceFromString(std::string& tactic_sting) { + nvinfer1::TacticSources disabledTactics = 0; + nvinfer1::TacticSources enabledTactics = 0; + std::vector tacticList = SplitToStringVec(tactic_sting, ','); + for (auto& t : tacticList) { + bool enable{false}; + if (t.front() == '+') { + enable = true; + } else if (t.front() != '-') { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Tactic source must be prefixed with + or - skipping: " << t; + } + t.erase(0, 1); + + const auto toUpper = [](std::string& sourceName) { + std::transform( + sourceName.begin(), sourceName.end(), sourceName.begin(), [](char c) { return std::toupper(c); }); + return sourceName; + }; + + nvinfer1::TacticSource source{}; + t = toUpper(t); + if (t == "CUBLAS") { + source = nvinfer1::TacticSource::kCUBLAS; + } else if (t == "CUBLASLT" || t == "CUBLAS_LT") { + source = nvinfer1::TacticSource::kCUBLAS_LT; + } else if (t == "CUDNN") { + source = nvinfer1::TacticSource::kCUDNN; + } else if (t == "EDGE_MASK_CONVOLUTIONS") { + source = nvinfer1::TacticSource::kEDGE_MASK_CONVOLUTIONS; + } else if (t == "JIT_CONVOLUTIONS") { + source = 
nvinfer1::TacticSource::kJIT_CONVOLUTIONS; + } else { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Tactic source was not found with name: " << t; + } + + uint32_t sourceBit = 1U << static_cast(source); + + if (enable) { + enabledTactics |= sourceBit; + } else { + disabledTactics |= sourceBit; + } + } + return enabledTactics & ~disabledTactics; +} + inline std::vector loadTimingCacheFile(const std::string inFileName) { std::ifstream iFile(inFileName, std::ios::in | std::ios::binary); if (!iFile) { @@ -353,6 +414,11 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv if (fp16_enable_) { layer_norm_fp32_fallback_ = info.layer_norm_fp32_fallback; } + build_heuristics_enable_ = info.build_heuristics_enable; + sparsity_enable_ = info.sparsity_enable; + builder_optimization_level_ = info.builder_optimization_level; + auxiliary_streams_ = info.auxiliary_streams; + tactic_sources_ = info.tactic_sources; } else { const std::string max_partition_iterations_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kMaxPartitionIterations); if (!max_partition_iterations_env.empty()) { @@ -462,6 +528,31 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv if (!layer_norm_fp32_fallback_env.empty()) { layer_norm_fp32_fallback_ = (std::stoi(layer_norm_fp32_fallback_env) == 0 ? false : true); } + + const std::string build_heuristics_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kBuildHeuristics); + if (!build_heuristics_env.empty()) { + build_heuristics_enable_ = (std::stoi(build_heuristics_env) == 0 ? false : true); + } + + const std::string sparsity_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kSparsityEnable); + if (!sparsity_enable_env.empty()) { + sparsity_enable_ = (std::stoi(sparsity_enable_env) == 0 ? 
false : true); + } + + const std::string builder_optimization_level_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kBuilderOptimizationLevel); + if (!builder_optimization_level_env.empty()) { + builder_optimization_level_ = std::stoi(builder_optimization_level_env); + } + + const std::string auxiliary_streams_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kAuxiliaryStreams); + if (!auxiliary_streams_env.empty()) { + auxiliary_streams_ = std::stoi(auxiliary_streams_env); + } + + const std::string tactic_sources_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kTacticSources); + if (!tactic_sources_env.empty()) { + tactic_sources_ = tactic_sources_env; + } } // Validate setting @@ -527,7 +618,12 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv << ", trt_engine_decryption_lib_path: " << engine_decryption_lib_path_ << ", trt_force_sequential_engine_build: " << force_sequential_engine_build_ << ", trt_context_memory_sharing_enable: " << context_memory_sharing_enable_ - << ", trt_layer_norm_fp32_fallback: " << layer_norm_fp32_fallback_; + << ", trt_layer_norm_fp32_fallback: " << layer_norm_fp32_fallback_ + << ", trt_build_heuristics_enable: " << build_heuristics_enable_ + << ", trt_sparsity_enable: " << sparsity_enable_ + << ", trt_builder_optimization_level: " << builder_optimization_level_ + << ", trt_auxiliary_streams: " << auxiliary_streams_ + << ", trt_tactic_sources: " << tactic_sources_; } TensorrtExecutionProvider::~TensorrtExecutionProvider() { @@ -1410,6 +1506,45 @@ common::Status TensorrtExecutionProvider::Compile(const std::vectorsetFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Sparse weights are allowed"; + } + + // enable builder heuristics + if (build_heuristics_enable_) { + trt_config->setFlag(nvinfer1::BuilderFlag::kENABLE_TACTIC_HEURISTIC ); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Builder heuristics are enabled"; + } +#if NV_TENSORRT_MINOR > 5 && 
NV_TENSORRT_MAJOR >= 8 + // switch optimizaion level + if (builder_optimization_level_ != 2) { + trt_config->setBuilderOptimizationLevel(builder_optimization_level_); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Builder optimization level is set to " << builder_optimization_level_; + } + + // limit auxiliary streams + if (auxiliary_streams_ >= 0) { + trt_config->setMaxAuxStreams(auxiliary_streams_); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Auxiliary streams are se to " << auxiliary_streams_; + } +#else + if (builder_optimization_level_ != 2) { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Builder optimization level can only be used on TRT 8.6 onwards!"; + } + if (auxiliary_streams_ >= 0) { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Auxiliary streams can only be set on TRT 8.6 onwards!"; + } +#endif + // limit used tactic sources + if (!tactic_sources_.empty()) { + nvinfer1::TacticSources tactics = trt_config->getTacticSources(); + tactics |= GetTacticSourceFromString(tactic_sources_); + trt_config->setTacticSources(tactics); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Tactic sources are limited using " << tactic_sources_; + } + // Build TRT engine here if the graph doesn't have dynamic shape input. 
Otherwise engine will // be built at runtime std::unique_ptr trt_engine; @@ -1584,6 +1719,11 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector p = std::make_unique(); + // translate tactic sources string to nvinfer1::TacticSources + nvinfer1::TacticSources tactics = 0; + if (!tactic_sources_.empty()) { + tactics = GetTacticSourceFromString(tactic_sources_); + } *p = {context->allocate_func, context->release_func, context->allocator_handle, &parsers_[context->node_name], &engines_[context->node_name], &contexts_[context->node_name], &builders_[context->node_name], &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name], @@ -1591,7 +1731,8 @@ common::Status TensorrtExecutionProvider::Compile(const std::vectorsetDLACore(trt_state->dla_core); } + // enable sparse weights + if (trt_state->sparsity_enable) { + trt_config->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Sparse weights are allowed"; + } + + // enable builder heuristics + if (trt_state->build_heuristics_enable) { + trt_config->setFlag(nvinfer1::BuilderFlag::kENABLE_TACTIC_HEURISTIC ); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Builder heuristics are enabled"; + } +#if NV_TENSORRT_MINOR > 5 && NV_TENSORRT_MAJOR >= 8 + // switch optimizaion level + if (trt_state->builder_optimization_level != 2) { + trt_config->setBuilderOptimizationLevel(trt_state->builder_optimization_level); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Builder optimization level is set to " << builder_optimization_level_; + } + + // limit auxiliary streams + if (trt_state->auxiliary_streams >= 0) { + trt_config->setMaxAuxStreams(trt_state->auxiliary_streams); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Auxiliary streams are se to " << trt_state->auxiliary_streams; + } +#else + if (trt_state->builder_optimization_level != 2) { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Builder optimization level can only be used on TRT 8.6 onwards!"; + } + if 
(trt_state->auxiliary_streams >= 0) { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Auxiliary streams can only be set on TRT 8.6 onwards!"; + } +#endif + // limit used tactic sources + if (trt_state->filter_tactic_sources) { + nvinfer1::TacticSources tactics = trt_config->getTacticSources(); + tactics |= trt_state->tactic_sources; + trt_config->setTacticSources(tactics); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Tactic sources are limited using bitmask " << tactics; + } + // Load timing cache from file. Create a fresh cache if the file doesn't exist std::unique_ptr timing_cache = nullptr; if (trt_state->timing_cache_enable) { diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index cb87b31e01..26ecb1f5cf 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -33,6 +33,11 @@ static const std::string kLayerNormFP32Fallback = "ORT_TENSORRT_LAYER_NORM_FP32_ static const std::string kTimingCacheEnable = "ORT_TENSORRT_TIMING_CACHE_ENABLE"; static const std::string kForceTimingCache = "ORT_TENSORRT_FORCE_TIMING_CACHE_ENABLE"; static const std::string kDetailedBuildLog = "ORT_TENSORRT_DETAILED_BUILD_LOG_ENABLE"; +static const std::string kBuildHeuristics = "ORT_TENSORRT_BUILD_HEURISTICS_ENABLE"; +static const std::string kSparsityEnable = "ORT_TENSORRT_SPARSITY_ENABLE"; +static const std::string kBuilderOptimizationLevel = "ORT_TENSORRT_BUILDER_OPTIMIZATION_LEVEL"; +static const std::string kAuxiliaryStreams = "ORT_TENSORRT_AUXILIARY_STREAMS"; +static const std::string kTacticSources = "ORT_TENSORRT_TACTIC_SOURCES"; // Old env variable for backward compatibility static const std::string kEngineCachePath = "ORT_TENSORRT_ENGINE_CACHE_PATH"; } // namespace tensorrt_env_vars @@ -120,6 +125,12 @@ struct TensorrtFuncState { bool timing_cache_enable = true; bool force_timing_cache = false; 
bool detailed_build_log = false; + bool build_heuristics_enable = false; + bool sparsity_enable = false; + int builder_optimization_level = 2; + int auxiliary_streams = -1; + bool filter_tactic_sources = false; + nvinfer1::TacticSources tactic_sources; }; // Logical device representation. @@ -169,6 +180,11 @@ class TensorrtExecutionProvider : public IExecutionProvider { bool int8_use_native_tensorrt_calibration_table_ = false; bool dump_subgraphs_ = false; bool engine_cache_enable_ = false; + bool build_heuristics_enable_ = false; + bool sparsity_enable_ = false; + int builder_optimization_level_ = 2; + int auxiliary_streams_ = -1; + std::string tactic_sources_; std::string cache_path_, engine_decryption_lib_path_; std::unique_ptr runtime_ = nullptr; OrtMutex tensorrt_mu_; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc index a4be09efcc..c9f3c3ffa7 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc @@ -2,6 +2,7 @@ // Licensed under the MIT License. 
#include "core/providers/tensorrt/tensorrt_execution_provider_info.h" +#include "core/providers/tensorrt/tensorrt_provider_options.h" #include "core/common/make_string.h" #include "core/common/parse_string.h" @@ -33,6 +34,11 @@ constexpr const char* kLayerNormFP32Fallback = "trt_layer_norm_fp32_fallback"; constexpr const char* kTimingCacheEnable = "trt_timing_cache_enable"; constexpr const char* kForceTimingCacheMatch = "trt_force_timing_cache_match"; constexpr const char* kDetailedBuildLog = "trt_detailed_build_log"; +constexpr const char* kBuildHeuristics = "trt_build_heuristics_enable"; +constexpr const char* kSparsityEnable = "trt_sparsity_enable"; +constexpr const char* kBuilderOptimizationLevel = "trt_builder_optimization_level"; +constexpr const char* kAuxiliaryStreams = "trt_auxiliary_streams"; +constexpr const char* kTacticSources = "trt_tactic_sources"; } // namespace provider_option_names } // namespace tensorrt @@ -72,6 +78,11 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions .AddAssignmentToReference(tensorrt::provider_option_names::kTimingCacheEnable, info.timing_cache_enable) .AddAssignmentToReference(tensorrt::provider_option_names::kForceTimingCacheMatch, info.force_timing_cache) .AddAssignmentToReference(tensorrt::provider_option_names::kDetailedBuildLog, info.detailed_build_log) + .AddAssignmentToReference(tensorrt::provider_option_names::kBuildHeuristics, info.build_heuristics_enable) + .AddAssignmentToReference(tensorrt::provider_option_names::kSparsityEnable, info.sparsity_enable) + .AddAssignmentToReference(tensorrt::provider_option_names::kBuilderOptimizationLevel, info.builder_optimization_level) + .AddAssignmentToReference(tensorrt::provider_option_names::kAuxiliaryStreams, info.auxiliary_streams) + .AddAssignmentToReference(tensorrt::provider_option_names::kTacticSources, info.tactic_sources) .Parse(options)); // add new provider option here. 
return info; @@ -102,15 +113,21 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const TensorrtE {tensorrt::provider_option_names::kTimingCacheEnable, MakeStringWithClassicLocale(info.timing_cache_enable)}, {tensorrt::provider_option_names::kForceTimingCacheMatch, MakeStringWithClassicLocale(info.force_timing_cache)}, {tensorrt::provider_option_names::kDetailedBuildLog, MakeStringWithClassicLocale(info.detailed_build_log)}, + {tensorrt::provider_option_names::kBuildHeuristics, MakeStringWithClassicLocale(info.build_heuristics_enable)}, + {tensorrt::provider_option_names::kSparsityEnable, MakeStringWithClassicLocale(info.sparsity_enable)}, + {tensorrt::provider_option_names::kBuilderOptimizationLevel, MakeStringWithClassicLocale(info.builder_optimization_level)}, + {tensorrt::provider_option_names::kAuxiliaryStreams, MakeStringWithClassicLocale(info.auxiliary_streams)}, + {tensorrt::provider_option_names::kTacticSources, MakeStringWithClassicLocale(info.tactic_sources)}, }; return options; } -ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const OrtTensorRTProviderOptions& info) { +ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const OrtTensorRTProviderOptionsV2& info) { auto empty_if_null = [](const char* s) { return s != nullptr ? 
std::string{s} : std::string{}; }; const std::string kInt8CalibTable_ = empty_if_null(info.trt_int8_calibration_table_name); const std::string kCachePath_ = empty_if_null(info.trt_engine_cache_path); + const std::string kTacticSources_ = empty_if_null(info.trt_tactic_sources); const std::string kDecryptionLibPath_ = empty_if_null(info.trt_engine_decryption_lib_path); const ProviderOptions options{ @@ -130,6 +147,16 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const OrtTensor {tensorrt::provider_option_names::kDecryptionEnable, MakeStringWithClassicLocale(info.trt_engine_decryption_enable)}, {tensorrt::provider_option_names::kDecryptionLibPath, kDecryptionLibPath_}, {tensorrt::provider_option_names::kForceSequentialEngineBuild, MakeStringWithClassicLocale(info.trt_force_sequential_engine_build)}, + {tensorrt::provider_option_names::kContextMemorySharingEnable, MakeStringWithClassicLocale(info.trt_context_memory_sharing_enable)}, + {tensorrt::provider_option_names::kLayerNormFP32Fallback, MakeStringWithClassicLocale(info.trt_layer_norm_fp32_fallback)}, + {tensorrt::provider_option_names::kTimingCacheEnable, MakeStringWithClassicLocale(info.trt_timing_cache_enable)}, + {tensorrt::provider_option_names::kForceTimingCacheMatch, MakeStringWithClassicLocale(info.trt_force_timing_cache)}, + {tensorrt::provider_option_names::kDetailedBuildLog, MakeStringWithClassicLocale(info.trt_detailed_build_log)}, + {tensorrt::provider_option_names::kBuildHeuristics, MakeStringWithClassicLocale(info.trt_build_heuristics_enable)}, + {tensorrt::provider_option_names::kSparsityEnable, MakeStringWithClassicLocale(info.trt_sparsity_enable)}, + {tensorrt::provider_option_names::kBuilderOptimizationLevel, MakeStringWithClassicLocale(info.trt_builder_optimization_level)}, + {tensorrt::provider_option_names::kAuxiliaryStreams, MakeStringWithClassicLocale(info.trt_auxiliary_streams)}, + {tensorrt::provider_option_names::kTacticSources, kTacticSources_}, }; return options; } 
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h index 1a2e5e01af..262fc0854f 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h @@ -36,9 +36,14 @@ struct TensorrtExecutionProviderInfo { bool timing_cache_enable{false}; bool force_timing_cache{false}; bool detailed_build_log{false}; + bool build_heuristics_enable{false}; + bool sparsity_enable{false}; + int builder_optimization_level{2}; + int auxiliary_streams{-1}; + std::string tactic_sources{""}; static TensorrtExecutionProviderInfo FromProviderOptions(const ProviderOptions& options); static ProviderOptions ToProviderOptions(const TensorrtExecutionProviderInfo& info); - static ProviderOptions ToProviderOptions(const OrtTensorRTProviderOptions& info); + static ProviderOptions ToProviderOptions(const OrtTensorRTProviderOptionsV2& info); }; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc index 2a8a481008..913f6a1287 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc @@ -73,6 +73,11 @@ struct Tensorrt_Provider : Provider { info.timing_cache_enable = options.trt_timing_cache_enable != 0; info.force_timing_cache = options.trt_force_timing_cache != 0; info.detailed_build_log = options.trt_detailed_build_log != 0; + info.build_heuristics_enable = options.trt_build_heuristics_enable != 0; + info.sparsity_enable = options.trt_sparsity_enable; + info.builder_optimization_level = options.trt_builder_optimization_level; + info.auxiliary_streams = options.trt_auxiliary_streams; + info.tactic_sources = options.trt_tactic_sources == nullptr ? 
"" : options.trt_tactic_sources; return std::make_shared(info); } @@ -143,10 +148,28 @@ struct Tensorrt_Provider : Provider { trt_options.trt_timing_cache_enable = internal_options.timing_cache_enable; trt_options.trt_force_timing_cache = internal_options.force_timing_cache; trt_options.trt_detailed_build_log = internal_options.detailed_build_log; + trt_options.trt_build_heuristics_enable = internal_options.build_heuristics_enable; + trt_options.trt_sparsity_enable = internal_options.sparsity_enable; + trt_options.trt_builder_optimization_level = internal_options.builder_optimization_level; + trt_options.trt_auxiliary_streams = internal_options.auxiliary_streams; + str_size = internal_options.tactic_sources.size(); + if (str_size == 0) { + trt_options.trt_tactic_sources = nullptr; + } else { + dest = new char[str_size + 1]; +#ifdef _MSC_VER + strncpy_s(dest, str_size + 1, internal_options.tactic_sources.c_str(), str_size); +#else + strncpy(dest, internal_options.tactic_sources.c_str(), str_size); +#endif + dest[str_size] = '\0'; + trt_options.trt_tactic_sources = (const char*)dest; + } + } ProviderOptions GetProviderOptions(const void* provider_options) override { - auto& options = *reinterpret_cast(provider_options); + auto& options = *reinterpret_cast(provider_options); return onnxruntime::TensorrtExecutionProviderInfo::ToProviderOptions(options); } diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index faa25c39f5..c0a4420436 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -1280,6 +1280,11 @@ OrtTensorRTProviderOptionsV2 OrtTensorRTProviderOptionsToOrtTensorRTProviderOpti trt_options_converted.trt_detailed_build_log = 0; trt_options_converted.trt_context_memory_sharing_enable = 0; trt_options_converted.trt_layer_norm_fp32_fallback = 0; + trt_options_converted.trt_build_heuristics_enable = 0; + 
trt_options_converted.trt_sparsity_enable = 0; + trt_options_converted.trt_builder_optimization_level = 2; + trt_options_converted.trt_auxiliary_streams = -1; + trt_options_converted.trt_tactic_sources = ""; return trt_options_converted; } @@ -1413,11 +1418,11 @@ INcclService& INcclService::GetInstance() { } // namespace rocm #endif -void UpdateProviderInfo_Tensorrt(OrtTensorRTProviderOptions* provider_options, const ProviderOptions& options) { +void UpdateProviderInfo_Tensorrt(OrtTensorRTProviderOptionsV2* provider_options, const ProviderOptions& options) { s_library_tensorrt.Get().UpdateProviderOptions(reinterpret_cast(provider_options), options); } -ProviderOptions GetProviderInfo_Tensorrt(const OrtTensorRTProviderOptions* provider_options) { +ProviderOptions GetProviderInfo_Tensorrt(const OrtTensorRTProviderOptionsV2* provider_options) { return s_library_tensorrt.Get().GetProviderOptions(reinterpret_cast(provider_options)); } @@ -1632,7 +1637,7 @@ ORT_API_STATUS_IMPL(OrtApis::UpdateTensorRTProviderOptions, provider_options_map[provider_options_keys[i]] = provider_options_values[i]; } - onnxruntime::UpdateProviderInfo_Tensorrt(reinterpret_cast(tensorrt_options), + onnxruntime::UpdateProviderInfo_Tensorrt(tensorrt_options, reinterpret_cast(provider_options_map)); return nullptr; #else @@ -1649,7 +1654,7 @@ ORT_API_STATUS_IMPL(OrtApis::GetTensorRTProviderOptionsAsString, _In_ const OrtT _Outptr_ char** ptr) { API_IMPL_BEGIN #ifdef USE_TENSORRT - onnxruntime::ProviderOptions options = onnxruntime::GetProviderInfo_Tensorrt(reinterpret_cast(tensorrt_options)); + onnxruntime::ProviderOptions options = onnxruntime::GetProviderInfo_Tensorrt(tensorrt_options); onnxruntime::ProviderOptions::iterator it = options.begin(); std::string options_str = ""; diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 9f1e098fd7..f5d2e02719 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ 
b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -371,7 +371,12 @@ std::unique_ptr CreateExecutionProviderInstance( 0, 0, 0, - 0}; + 0, + 0, + 0, + 2, + -1, + nullptr}; for (auto option : it->second) { if (option.first == "device_id") { if (!option.second.empty()) { diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 5778ca5d37..d0d88dc013 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -124,6 +124,11 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device bool trt_timing_cache_enable = false; bool trt_force_timing_cache = false; bool trt_detailed_build_log = false; + bool trt_build_heuristics_enable = false; + bool trt_sparsity_enable = false; + int trt_builder_optimization_level = 2; + int trt_auxiliary_streams = -1; + std::string trt_tactic_sources = ""; #ifdef _MSC_VER std::string ov_string = ToUTF8String(performance_test_config.run_config.ep_runtime_config_string); @@ -295,6 +300,40 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device } else { ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_detailed_build_log' should be a boolean i.e. true or false. Default value is false.\n"); } + } else if (key == "trt_build_heuristics_enable") { + if (value == "true" || value == "True") { + trt_build_heuristics_enable = true; + } else if (value == "false" || value == "False") { + trt_build_heuristics_enable = false; + } else { + ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_build_heuristics_enable' should be a boolean i.e. true or false. 
Default value is false.\n"); + } + } else if (key == "trt_sparsity_enable") { + if (value == "true" || value == "True") { + trt_sparsity_enable = true; + } else if (value == "false" || value == "False") { + trt_sparsity_enable = false; + } else { + ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_sparsity_enable' should be a boolean i.e. true or false. Default value is false.\n"); + } + } else if (key == "trt_builder_optimization_level") { + if (!value.empty()) { + trt_builder_optimization_level = std::stoi(value); + } else { + ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_builder_optimization_level' should be a number and default to 2.\n"); + } + } else if (key == "trt_auxiliary_streams") { + if (!value.empty()) { + trt_auxiliary_streams = std::stoi(value); + } else { + ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_auxiliary_streams' should be a number.\n"); + } + } else if (key == "trt_tactic_sources") { + if (!value.empty()) { + trt_tactic_sources = value; + } else { + ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_tactic_sources' should be a non-emtpy string.\n"); + } } else { ORT_THROW("[ERROR] [TensorRT] wrong key type entered. Choose from the following runtime key options that are available for TensorRT. 
['device_id', 'trt_max_partition_iterations', 'trt_min_subgraph_size', 'trt_max_workspace_size', 'trt_fp16_enable', 'trt_int8_enable', 'trt_int8_calibration_table_name', 'trt_int8_use_native_calibration_table', 'trt_dla_enable', 'trt_dla_core', 'trt_dump_subgraphs', 'trt_engine_cache_enable', 'trt_engine_cache_path', 'trt_engine_decryption_enable', 'trt_engine_decryption_lib_path', 'trt_force_sequential_engine_build', 'trt_context_memory_sharing_enable', 'trt_layer_norm_fp32_fallback'] \n"); } @@ -323,6 +362,11 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device tensorrt_options.trt_timing_cache_enable = trt_timing_cache_enable; tensorrt_options.trt_force_timing_cache = trt_force_timing_cache; tensorrt_options.trt_detailed_build_log = trt_detailed_build_log; + tensorrt_options.trt_build_heuristics_enable = trt_build_heuristics_enable; + tensorrt_options.trt_sparsity_enable = trt_sparsity_enable; + tensorrt_options.trt_builder_optimization_level = trt_builder_optimization_level; + tensorrt_options.trt_auxiliary_streams = trt_auxiliary_streams; + tensorrt_options.trt_tactic_sources = trt_tactic_sources.c_str(); session_options.AppendExecutionProvider_TensorRT_V2(tensorrt_options); OrtCUDAProviderOptions cuda_options; @@ -473,7 +517,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device } else if (key == "rpc_control_latency") { qnn_options[key] = value; } else { - ORT_THROW(R"(Wrong key type entered. Choose from options: + ORT_THROW(R"(Wrong key type entered. 
Choose from options: ['backend_path', 'profiling_level', 'rpc_control_latency'])"); } } diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index 658339d603..60f5870f09 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -685,7 +685,8 @@ TEST_P(ModelTest, Run) { if (test_case_name.find(ORT_TSTR("FLOAT16")) != std::string::npos) { OrtTensorRTProviderOptionsV2 params{0, 0, nullptr, 1000, 1, 1 << 30, 1, // enable fp16 - 0, nullptr, 0, 0, 0, 0, 0, nullptr, 0, nullptr, 0, 0, 0, 0, 0, 0}; + 0, nullptr, 0, 0, 0, 0, 0, nullptr, 0, nullptr, 0, 0, 0, 0, 0, 0, 0, 0, + 2, -1, nullptr}; ortso.AppendExecutionProvider_TensorRT_V2(params); } else { OrtTensorRTProviderOptionsV2* ep_option = nullptr; diff --git a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc index 2d38bf7b4b..4c7538401f 100644 --- a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc +++ b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc @@ -156,7 +156,12 @@ void RunWithOneSessionSingleThreadInference(std::string model_name, std::string 0, 0, 0, - 0}; + 0, + 0, + 0, + 2, + -1, + nullptr}; params.trt_engine_cache_enable = 1; std::unique_ptr execution_provider = TensorrtExecutionProviderWithOptions(&params); @@ -230,7 +235,12 @@ void RunWithOneSessionMultiThreadsInference(std::string model_name, std::string 0, 0, 0, - 0}; + 0, + 0, + 0, + 2, + -1, + nullptr}; params.trt_engine_cache_enable = 1; std::unique_ptr execution_provider = TensorrtExecutionProviderWithOptions(&params); @@ -397,7 +407,12 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) { 0, 0, 0, - 0}; + 0, + 0, + 0, + 2, + -1, + nullptr}; if (cache_type.compare("engine") == 0) { diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index c709f37674..c5c005eb95 100644 ---
a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -2496,9 +2496,16 @@ TEST(CApiTest, ConfigureCudaArenaAndDemonstrateMemoryArenaShrinkage) { #endif #ifdef USE_TENSORRT +class CApiTensorRTTest : public testing::Test, public ::testing::WithParamInterface {}; // This test uses CreateTensorRTProviderOptions/UpdateTensorRTProviderOptions APIs to configure and create a TensorRT Execution Provider -TEST(CApiTest, TestConfigureTensorRTProviderOptions) { +TEST_P(CApiTensorRTTest, TestConfigureTensorRTProviderOptions) { + std::string param = GetParam(); + size_t pos = param.find("="); + std::string option_name = param.substr(0, pos); + std::string option_value = param.substr(pos + 1); + ASSERT_NE(pos, std::string::npos); + const auto& api = Ort::GetApi(); OrtTensorRTProviderOptionsV2* trt_options; OrtAllocator* allocator; @@ -2508,16 +2515,19 @@ TEST(CApiTest, TestConfigureTensorRTProviderOptions) { const char* engine_cache_path = "./trt_engine_folder"; - std::vector keys{"device_id", "trt_fp16_enable", "trt_int8_enable", "trt_engine_cache_enable", "trt_engine_cache_path"}; + std::vector keys{"device_id", "trt_fp16_enable", "trt_int8_enable", "trt_engine_cache_enable", + "trt_engine_cache_path", option_name.c_str()}; - std::vector values{"0", "1", "0", "1", engine_cache_path}; + std::vector values{"0", "1", "0", "1", + engine_cache_path, option_value.c_str()}; - ASSERT_TRUE(api.UpdateTensorRTProviderOptions(rel_trt_options.get(), keys.data(), values.data(), 5) == nullptr); + ASSERT_TRUE(api.UpdateTensorRTProviderOptions(rel_trt_options.get(), keys.data(), values.data(), keys.size()) == nullptr); ASSERT_TRUE(api.GetAllocatorWithDefaultOptions(&allocator) == nullptr); ASSERT_TRUE(api.GetTensorRTProviderOptionsAsString(rel_trt_options.get(), allocator, &trt_options_str) == nullptr); std::string s(trt_options_str); ASSERT_TRUE(s.find(engine_cache_path) != std::string::npos); + ASSERT_TRUE(s.find(param.c_str()) != 
std::string::npos); ASSERT_TRUE(api.AllocatorFree(allocator, (void*)trt_options_str) == nullptr); Ort::SessionOptions session_options; @@ -2552,6 +2562,12 @@ TEST(CApiTest, TestConfigureTensorRTProviderOptions) { struct stat buffer; ASSERT_TRUE(stat(engine_cache_path, &buffer) == 0); } + +/* + * The CApiTensorRTTest suite can be used to test TRT options; each parameter is an "option=value" string + */ +INSTANTIATE_TEST_SUITE_P(CApiTensorRTTest, CApiTensorRTTest, + ::testing::Values("trt_build_heuristics_enable=1", "trt_sparsity_enable=1", "trt_builder_optimization_level=0", "trt_tactic_sources=-CUDNN,+CUBLAS", "trt_auxiliary_streams=2")); #endif #ifdef USE_CUDA