diff --git a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h index 816eaaf9bc..ec9be80a63 100644 --- a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h +++ b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h @@ -19,7 +19,7 @@ struct OrtTensorRTProviderOptionsV2 { // can be updated using: UpdateTensorRTProviderOptionsWithValue int trt_max_partition_iterations{1000}; // maximum iterations for TensorRT parser to get capability int trt_min_subgraph_size{1}; // minimum size of TensorRT subgraphs - size_t trt_max_workspace_size{1 << 30}; // maximum workspace size for TensorRT. + size_t trt_max_workspace_size{0}; // maximum workspace size for TensorRT. Default is 0 means max device memory size int trt_fp16_enable{0}; // enable TensorRT FP16 precision. Default 0 = false, nonzero = true int trt_int8_enable{0}; // enable TensorRT INT8 precision. Default 0 = false, nonzero = true const char* trt_int8_calibration_table_name{nullptr}; // TensorRT INT8 calibration table name. diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 0f32b58314..a7daa98902 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -1583,10 +1583,6 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_min_subgraph_size must be a positive integer value. Set it to 1"; min_subgraph_size_ = 1; } - if (max_workspace_size_ <= 0) { - LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_max_workspace_size must be a positive integer value. Set it to 1073741824 (1GB)"; - max_workspace_size_ = 1 << 30; - } if (dla_core_ < 0) { LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_dla_core must be a non-negative integer value. Set it to 0"; dla_core_ = 0; @@ -2756,7 +2752,9 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView auto trt_config = std::unique_ptr(trt_builder->createBuilderConfig()); auto trt_parser = tensorrt_ptr::unique_pointer(nvonnxparser::createParser(*trt_network, trt_logger)); trt_parser->parse(string_buf.data(), string_buf.size(), model_path_); - trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); + if (max_workspace_size_ > 0) { + trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); + } // Force Pow + Reduce ops in layer norm to run in FP32 to avoid overflow if (fp16_enable_ && layer_norm_fp32_fallback_) { @@ -3363,7 +3361,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView &parsers_[context->node_name], &engines_[context->node_name], &contexts_[context->node_name], &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name], input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_, - dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, + dla_enable_, dla_core_, trt_node_name_with_precision, engine_cache_enable_, cache_path_, runtime_.get(), profiles_[context->node_name], context_memory_sharing_enable_, &max_ctx_mem_size_, dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_, timing_cache_enable_, global_cache_path_, force_timing_cache_match_, @@ -3538,7 +3536,9 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView trt_state->context->reset(); trt_state->engine->reset(); auto trt_config = std::unique_ptr(trt_builder->createBuilderConfig()); - trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, *(trt_state->max_workspace_size_ptr)); + if (max_workspace_size_ > 0) { + trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); + } for (auto trt_profile : trt_profiles) { trt_config->addOptimizationProfile(trt_profile); } diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index 3f20314438..97c9367b0b 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -175,7 +175,6 @@ struct TensorrtFuncState { bool int8_calibration_cache_available = false; bool dla_enable = false; int dla_core = 0; - size_t* max_workspace_size_ptr = nullptr; std::string trt_node_name_with_precision; bool engine_cache_enable = false; std::string engine_cache_path; @@ -290,7 +289,7 @@ class TensorrtExecutionProvider : public IExecutionProvider { cudaStream_t stream_ = nullptr; int max_partition_iterations_ = 1000; size_t min_subgraph_size_ = 1; - size_t max_workspace_size_ = 1 << 30; // 1GB + size_t max_workspace_size_ = 0; bool fp16_enable_ = false; bool int8_enable_ = false; bool dla_enable_ = false; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h index 50b934fd5f..fa1bbd6d3d 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h @@ -22,7 +22,7 @@ struct TensorrtExecutionProviderInfo { bool has_trt_options{false}; int max_partition_iterations{1000}; int min_subgraph_size{1}; - size_t max_workspace_size{1 << 30}; + size_t max_workspace_size{0}; bool fp16_enable{false}; bool int8_enable{false}; std::string int8_calibration_table_name{""};