[TensorRT EP] No workspace size limit to TRT memory pool (#21643)

We saw some models fail to run due to OOM; this can be fixed by increasing
trt_max_workspace_size.
This PR removes the default workspace size limit (the limit becomes the max device memory), which is
aligned with trtexec.
This commit is contained in:
Chi Lo 2024-08-09 17:30:51 -07:00 committed by GitHub
parent eeef0c8aca
commit 2abebb2a47
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 10 additions and 11 deletions

View file

@ -19,7 +19,7 @@ struct OrtTensorRTProviderOptionsV2 {
// can be updated using: UpdateTensorRTProviderOptionsWithValue
int trt_max_partition_iterations{1000}; // maximum iterations for TensorRT parser to get capability
int trt_min_subgraph_size{1}; // minimum size of TensorRT subgraphs
size_t trt_max_workspace_size{1 << 30}; // maximum workspace size for TensorRT.
size_t trt_max_workspace_size{0}; // maximum workspace size for TensorRT. Default is 0 means max device memory size
int trt_fp16_enable{0}; // enable TensorRT FP16 precision. Default 0 = false, nonzero = true
int trt_int8_enable{0}; // enable TensorRT INT8 precision. Default 0 = false, nonzero = true
const char* trt_int8_calibration_table_name{nullptr}; // TensorRT INT8 calibration table name.

View file

@ -1583,10 +1583,6 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_min_subgraph_size must be a positive integer value. Set it to 1";
min_subgraph_size_ = 1;
}
if (max_workspace_size_ <= 0) {
LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_max_workspace_size must be a positive integer value. Set it to 1073741824 (1GB)";
max_workspace_size_ = 1 << 30;
}
if (dla_core_ < 0) {
LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_dla_core must be a non-negative integer value. Set it to 0";
dla_core_ = 0;
@ -2756,7 +2752,9 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
auto trt_config = std::unique_ptr<nvinfer1::IBuilderConfig>(trt_builder->createBuilderConfig());
auto trt_parser = tensorrt_ptr::unique_pointer<nvonnxparser::IParser>(nvonnxparser::createParser(*trt_network, trt_logger));
trt_parser->parse(string_buf.data(), string_buf.size(), model_path_);
trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_);
if (max_workspace_size_ > 0) {
trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_);
}
// Force Pow + Reduce ops in layer norm to run in FP32 to avoid overflow
if (fp16_enable_ && layer_norm_fp32_fallback_) {
@ -3363,7 +3361,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
&parsers_[context->node_name], &engines_[context->node_name], &contexts_[context->node_name],
&networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name],
input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_,
dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision,
dla_enable_, dla_core_, trt_node_name_with_precision,
engine_cache_enable_, cache_path_, runtime_.get(), profiles_[context->node_name],
context_memory_sharing_enable_, &max_ctx_mem_size_, dynamic_range_map, engine_decryption_enable_,
engine_decryption_, engine_encryption_, timing_cache_enable_, global_cache_path_, force_timing_cache_match_,
@ -3538,7 +3536,9 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
trt_state->context->reset();
trt_state->engine->reset();
auto trt_config = std::unique_ptr<nvinfer1::IBuilderConfig>(trt_builder->createBuilderConfig());
trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, *(trt_state->max_workspace_size_ptr));
if (max_workspace_size_ > 0) {
trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_);
}
for (auto trt_profile : trt_profiles) {
trt_config->addOptimizationProfile(trt_profile);
}

View file

@ -175,7 +175,6 @@ struct TensorrtFuncState {
bool int8_calibration_cache_available = false;
bool dla_enable = false;
int dla_core = 0;
size_t* max_workspace_size_ptr = nullptr;
std::string trt_node_name_with_precision;
bool engine_cache_enable = false;
std::string engine_cache_path;
@ -290,7 +289,7 @@ class TensorrtExecutionProvider : public IExecutionProvider {
cudaStream_t stream_ = nullptr;
int max_partition_iterations_ = 1000;
size_t min_subgraph_size_ = 1;
size_t max_workspace_size_ = 1 << 30; // 1GB
size_t max_workspace_size_ = 0;
bool fp16_enable_ = false;
bool int8_enable_ = false;
bool dla_enable_ = false;

View file

@ -22,7 +22,7 @@ struct TensorrtExecutionProviderInfo {
bool has_trt_options{false};
int max_partition_iterations{1000};
int min_subgraph_size{1};
size_t max_workspace_size{1 << 30};
size_t max_workspace_size{0};
bool fp16_enable{false};
bool int8_enable{false};
std::string int8_calibration_table_name{""};