mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-18 21:21:17 +00:00
[TensorRT EP] No workspace size limit to TRT memory pool (#21643)
We saw some models fail to run due to OOM, which can be fixed by increasing trt_max_workspace_size. This PR removes the default workspace size limit (the limit becomes the maximum device memory), which aligns with the behavior of trtexec.
This commit is contained in:
parent
eeef0c8aca
commit
2abebb2a47
4 changed files with 10 additions and 11 deletions
|
|
@ -19,7 +19,7 @@ struct OrtTensorRTProviderOptionsV2 {
|
|||
// can be updated using: UpdateTensorRTProviderOptionsWithValue
|
||||
int trt_max_partition_iterations{1000}; // maximum iterations for TensorRT parser to get capability
|
||||
int trt_min_subgraph_size{1}; // minimum size of TensorRT subgraphs
|
||||
size_t trt_max_workspace_size{1 << 30}; // maximum workspace size for TensorRT.
|
||||
size_t trt_max_workspace_size{0}; // maximum workspace size for TensorRT. Default is 0 means max device memory size
|
||||
int trt_fp16_enable{0}; // enable TensorRT FP16 precision. Default 0 = false, nonzero = true
|
||||
int trt_int8_enable{0}; // enable TensorRT INT8 precision. Default 0 = false, nonzero = true
|
||||
const char* trt_int8_calibration_table_name{nullptr}; // TensorRT INT8 calibration table name.
|
||||
|
|
|
|||
|
|
@ -1583,10 +1583,6 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
|
|||
LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_min_subgraph_size must be a positive integer value. Set it to 1";
|
||||
min_subgraph_size_ = 1;
|
||||
}
|
||||
if (max_workspace_size_ <= 0) {
|
||||
LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_max_workspace_size must be a positive integer value. Set it to 1073741824 (1GB)";
|
||||
max_workspace_size_ = 1 << 30;
|
||||
}
|
||||
if (dla_core_ < 0) {
|
||||
LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_dla_core must be a non-negative integer value. Set it to 0";
|
||||
dla_core_ = 0;
|
||||
|
|
@ -2756,7 +2752,9 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
|
|||
auto trt_config = std::unique_ptr<nvinfer1::IBuilderConfig>(trt_builder->createBuilderConfig());
|
||||
auto trt_parser = tensorrt_ptr::unique_pointer<nvonnxparser::IParser>(nvonnxparser::createParser(*trt_network, trt_logger));
|
||||
trt_parser->parse(string_buf.data(), string_buf.size(), model_path_);
|
||||
trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_);
|
||||
if (max_workspace_size_ > 0) {
|
||||
trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_);
|
||||
}
|
||||
|
||||
// Force Pow + Reduce ops in layer norm to run in FP32 to avoid overflow
|
||||
if (fp16_enable_ && layer_norm_fp32_fallback_) {
|
||||
|
|
@ -3363,7 +3361,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
|
|||
&parsers_[context->node_name], &engines_[context->node_name], &contexts_[context->node_name],
|
||||
&networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name],
|
||||
input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_,
|
||||
dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision,
|
||||
dla_enable_, dla_core_, trt_node_name_with_precision,
|
||||
engine_cache_enable_, cache_path_, runtime_.get(), profiles_[context->node_name],
|
||||
context_memory_sharing_enable_, &max_ctx_mem_size_, dynamic_range_map, engine_decryption_enable_,
|
||||
engine_decryption_, engine_encryption_, timing_cache_enable_, global_cache_path_, force_timing_cache_match_,
|
||||
|
|
@ -3538,7 +3536,9 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
|
|||
trt_state->context->reset();
|
||||
trt_state->engine->reset();
|
||||
auto trt_config = std::unique_ptr<nvinfer1::IBuilderConfig>(trt_builder->createBuilderConfig());
|
||||
trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, *(trt_state->max_workspace_size_ptr));
|
||||
if (max_workspace_size_ > 0) {
|
||||
trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_);
|
||||
}
|
||||
for (auto trt_profile : trt_profiles) {
|
||||
trt_config->addOptimizationProfile(trt_profile);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -175,7 +175,6 @@ struct TensorrtFuncState {
|
|||
bool int8_calibration_cache_available = false;
|
||||
bool dla_enable = false;
|
||||
int dla_core = 0;
|
||||
size_t* max_workspace_size_ptr = nullptr;
|
||||
std::string trt_node_name_with_precision;
|
||||
bool engine_cache_enable = false;
|
||||
std::string engine_cache_path;
|
||||
|
|
@ -290,7 +289,7 @@ class TensorrtExecutionProvider : public IExecutionProvider {
|
|||
cudaStream_t stream_ = nullptr;
|
||||
int max_partition_iterations_ = 1000;
|
||||
size_t min_subgraph_size_ = 1;
|
||||
size_t max_workspace_size_ = 1 << 30; // 1GB
|
||||
size_t max_workspace_size_ = 0;
|
||||
bool fp16_enable_ = false;
|
||||
bool int8_enable_ = false;
|
||||
bool dla_enable_ = false;
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ struct TensorrtExecutionProviderInfo {
|
|||
bool has_trt_options{false};
|
||||
int max_partition_iterations{1000};
|
||||
int min_subgraph_size{1};
|
||||
size_t max_workspace_size{1 << 30};
|
||||
size_t max_workspace_size{0};
|
||||
bool fp16_enable{false};
|
||||
bool int8_enable{false};
|
||||
std::string int8_calibration_table_name{""};
|
||||
|
|
|
|||
Loading…
Reference in a new issue