[TensorRT EP] No workspace size limit to TRT memory pool (#21643)

We saw some models fail to run due to OOM; this can be fixed by increasing trt_max_workspace_size. This PR imposes no size limit by default (up to the max device memory), which is aligned with trtexec.
2026-07-03 03:58:54 +00:00 · 2024-08-09 17:30:51 -07:00 · 2024-08-09 17:30:51 -07:00 · 2abebb2a47
commit 2abebb2a47
parent eeef0c8aca
4 changed files with 10 additions and 11 deletions
--- a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h
+++ b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h
@ -19,7 +19,7 @@ struct OrtTensorRTProviderOptionsV2 {
                                                         // can be updated using: UpdateTensorRTProviderOptionsWithValue
  int trt_max_partition_iterations{1000};                // maximum iterations for TensorRT parser to get capability
  int trt_min_subgraph_size{1};                          // minimum size of TensorRT subgraphs
-  size_t trt_max_workspace_size{1 << 30};                // maximum workspace size for TensorRT.
+  size_t trt_max_workspace_size{0};                      // maximum workspace size for TensorRT. Default 0 means no limit (up to max device memory size)
  int trt_fp16_enable{0};                                // enable TensorRT FP16 precision. Default 0 = false, nonzero = true
  int trt_int8_enable{0};                                // enable TensorRT INT8 precision. Default 0 = false, nonzero = true
  const char* trt_int8_calibration_table_name{nullptr};  // TensorRT INT8 calibration table name.
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@ -1583,10 +1583,6 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
    LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_min_subgraph_size must be a positive integer value. Set it to 1";
    min_subgraph_size_ = 1;
  }
-  if (max_workspace_size_ <= 0) {
-    LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_max_workspace_size must be a positive integer value. Set it to 1073741824 (1GB)";
-    max_workspace_size_ = 1 << 30;
-  }
  if (dla_core_ < 0) {
    LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_dla_core must be a non-negative integer value. Set it to 0";
    dla_core_ = 0;
@ -2756,7 +2752,9 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
  auto trt_config = std::unique_ptr<nvinfer1::IBuilderConfig>(trt_builder->createBuilderConfig());
  auto trt_parser = tensorrt_ptr::unique_pointer<nvonnxparser::IParser>(nvonnxparser::createParser(*trt_network, trt_logger));
  trt_parser->parse(string_buf.data(), string_buf.size(), model_path_);
-  trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_);
+  if (max_workspace_size_ > 0) {
+    trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_);
+  }

  // Force Pow + Reduce ops in layer norm to run in FP32 to avoid overflow
  if (fp16_enable_ && layer_norm_fp32_fallback_) {
@ -3363,7 +3361,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
          &parsers_[context->node_name], &engines_[context->node_name], &contexts_[context->node_name],
          &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name],
          input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_,
-          dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision,
+          dla_enable_, dla_core_, trt_node_name_with_precision,
          engine_cache_enable_, cache_path_, runtime_.get(), profiles_[context->node_name],
          context_memory_sharing_enable_, &max_ctx_mem_size_, dynamic_range_map, engine_decryption_enable_,
          engine_decryption_, engine_encryption_, timing_cache_enable_, global_cache_path_, force_timing_cache_match_,
@ -3538,7 +3536,9 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
      trt_state->context->reset();
      trt_state->engine->reset();
      auto trt_config = std::unique_ptr<nvinfer1::IBuilderConfig>(trt_builder->createBuilderConfig());
-      trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, *(trt_state->max_workspace_size_ptr));
+      if (max_workspace_size_ > 0) {
+        trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_);
+      }
      for (auto trt_profile : trt_profiles) {
        trt_config->addOptimizationProfile(trt_profile);
      }
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@ -175,7 +175,6 @@ struct TensorrtFuncState {
  bool int8_calibration_cache_available = false;
  bool dla_enable = false;
  int dla_core = 0;
-  size_t* max_workspace_size_ptr = nullptr;
  std::string trt_node_name_with_precision;
  bool engine_cache_enable = false;
  std::string engine_cache_path;
@ -290,7 +289,7 @@ class TensorrtExecutionProvider : public IExecutionProvider {
  cudaStream_t stream_ = nullptr;
  int max_partition_iterations_ = 1000;
  size_t min_subgraph_size_ = 1;
-  size_t max_workspace_size_ = 1 << 30;  // 1GB
+  size_t max_workspace_size_ = 0;
  bool fp16_enable_ = false;
  bool int8_enable_ = false;
  bool dla_enable_ = false;
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h
@ -22,7 +22,7 @@ struct TensorrtExecutionProviderInfo {
  bool has_trt_options{false};
  int max_partition_iterations{1000};
  int min_subgraph_size{1};
-  size_t max_workspace_size{1 << 30};
+  size_t max_workspace_size{0};
  bool fp16_enable{false};
  bool int8_enable{false};
  std::string int8_calibration_table_name{""};