From a6972c8782f1477b8ef557e2c54ac0075beacdb8 Mon Sep 17 00:00:00 2001
From: stevenlix <38092805+stevenlix@users.noreply.github.com>
Date: Mon, 17 May 2021 23:07:27 -0700
Subject: [PATCH] Fix issues in TensorRT provider options (#7738)

* add legacy env variable support in pybind

* formatting code
---
 .../tensorrt/tensorrt_execution_provider.cc   | 65 ++++++++++++-------
 .../python/onnxruntime_pybind_state.cc        | 44 +++++++------
 .../test/perftest/command_args_parser.cc      |  2 -
 .../test/python/onnxruntime_test_python.py    | 37 +----------
 4 files changed, 66 insertions(+), 82 deletions(-)
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index ff05b6bea3..32d77142f5 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -145,7 +145,7 @@ bool FindCycleHelper(int i, const std::list<int>* adjacency_map, bool visited[],
 * Read calibration table for INT8 quantization
 * Two kind of calibration tables are supported,
 * 1. ORT generated calibration table
-* The table is pre-serialized by flatbuffers. 
+* The table is pre-serialized by flatbuffers.
 * Each entry in the table is a key-value pair,
 * key: tensor name, value: maximum absolute value in floating point
 * For example,
@@ -404,7 +404,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
       int8_calibration_cache_name_ = info.int8_calibration_table_name;
       int8_use_native_tensorrt_calibration_table_ = info.int8_use_native_calibration_table;
     }
-    if (fp16_enable_ || int8_enable_) { // DLA can only be enabled with FP16 or INT8
+    if (fp16_enable_ || int8_enable_) {  // DLA can only be enabled with FP16 or INT8
       dla_enable_ = info.dla_enable;
       dla_core_ = info.dla_core;
     }
@@ -456,13 +456,13 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
       }
     }
 
-    if (fp16_enable_ || int8_enable_) { // DLA can only be enabled with FP16 or INT8
+    if (fp16_enable_ || int8_enable_) {  // DLA can only be enabled with FP16 or INT8
       const std::string dla_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDLAEnable);
       if (!dla_enable_env.empty()) {
         dla_enable_ = (std::stoi(dla_enable_env) == 0 ? false : true);
       }
-	  
-	  if (dla_enable_) {
+
+      if (dla_enable_) {
         const std::string dla_core_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDLACore);
         if (!dla_core_env.empty()) {
           dla_core_ = std::stoi(dla_core_env);
@@ -488,12 +488,12 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
         LOGS_DEFAULT(WARNING) << "[TensorRT EP] ORT_TENSORRT_ENGINE_CACHE_PATH is deprecated! Please use ORT_TENSORRT_CACHE_PATH to specify engine cache path";
       }
     }
-    
+
     const std::string engine_decryption_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDecryptionEnable);
     if (!engine_decryption_enable_env.empty()) {
       engine_decryption_enable_ = (std::stoi(engine_decryption_enable_env) == 0 ? false : true);
     }
-    
+
     if (engine_decryption_enable_) {
       engine_decryption_lib_path_ = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDecryptionLibPath);
     }
@@ -510,15 +510,15 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
     max_partition_iterations_ = 1000;
   }
   if (min_subgraph_size_ <= 0) {
-	LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_min_subgraph_size must be a positive integer value. Set it to 1";
+    LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_min_subgraph_size must be a positive integer value. Set it to 1";
     min_subgraph_size_ = 1;
-  }	
+  }
   if (max_workspace_size_ <= 0) {
-	LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_max_workspace_size must be a positive integer value. Set it to 1073741824 (1GB)";
+    LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_max_workspace_size must be a positive integer value. Set it to 1073741824 (1GB)";
     max_workspace_size_ = 1 << 30;
   }
   if (dla_core_ < 0) {
-	LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_dla_core must be a non-negative integer value. Set it to 0";
+    LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_dla_core must be a non-negative integer value. Set it to 0";
     dla_core_ = 0;
   }
 
@@ -534,11 +534,28 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
   if (engine_decryption_enable_) {
     LIBTYPE handle = OPENLIB(engine_decryption_lib_path_.c_str());
     if (handle == nullptr) {
-      ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
-                      "TensorRT EP could not open shared library from " + engine_decryption_lib_path_);
+      ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
+                                         "TensorRT EP could not open shared library from " + engine_decryption_lib_path_));
     }
     engine_decryption_ = (int (*)(const char*, char*, size_t*))LIBFUNC(handle, "decrypt");
   }
+  LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] TensorRT provider options: "
+                        << "device_id: " << device_id_
+                        << ", trt_max_partition_iterations: " << max_partition_iterations_
+                        << ", trt_min_subgraph_size: " << min_subgraph_size_
+                        << ", trt_max_workspace_size: " << max_workspace_size_
+                        << ", trt_fp16_enable: " << fp16_enable_
+                        << ", trt_int8_enable: " << int8_enable_
+                        << ", trt_int8_calibration_cache_name: " << int8_calibration_cache_name_
+                        << ", trt_int8_use_native_tensorrt_calibration_table: " << int8_use_native_tensorrt_calibration_table_
+                        << ", trt_dla_enable: " << dla_enable_
+                        << ", trt_dla_core: " << dla_core_
+                        << ", trt_dump_subgraphs: " << dump_subgraphs_
+                        << ", trt_engine_cache_enable: " << engine_cache_enable_
+                        << ", trt_cache_path: " << cache_path_
+                        << ", trt_engine_decryption_enable: " << engine_decryption_enable_
+                        << ", trt_engine_decryption_lib_path: " << engine_decryption_lib_path_
+                        << ", trt_force_sequential_engine_build: " << force_sequential_engine_build_;
 }
 
 TensorrtExecutionProvider::~TensorrtExecutionProvider() {
@@ -862,7 +879,7 @@ SubGraphCollection_t TensorrtExecutionProvider::GetSupportedList(SubGraphCollect
         model_proto->SerializeToString(string_buf);
 
         if (dump_subgraphs_) {
-          // Dump TensorRT subgraph for debugging if enabled via ORT_TENSORRT_DUMP_SUBGRAPHS env variable.
+          // Dump TensorRT subgraph for debugging
           std::fstream dump("TensorrtExecutionProvider_TRT_Subgraph.onnx", std::ios::out | std::ios::trunc | std::ios::binary);
           model_proto->SerializeToOstream(dump);
         }
@@ -1098,7 +1115,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
     model_proto->SerializeToString(string_buf);
 
     if (dump_subgraphs_) {
-      // Dump the TensorRT subgraph if enabled via ORT_TENSORRT_DUMP_SUBGRAPHS env variable.
+      // Dump TensorRT subgraphs
       std::fstream dump(fused_node->Name() + ".onnx", std::ios::out | std::ios::trunc | std::ios::binary);
       model_proto->SerializeToOstream(dump);
     }
@@ -1183,11 +1200,11 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
 
     // Set DLA
     if (fp16_enable_ || int8_enable_) {
-      if (dla_enable_ && dla_core_ >= 0) {//DLA can only run with FP16 and INT8
+      if (dla_enable_ && dla_core_ >= 0) {  //DLA can only run with FP16 and INT8
         int number_of_dla_core = trt_builder->getNbDLACores();
         if (number_of_dla_core == 0) {
           LOGS_DEFAULT(WARNING) << "[TensorRT EP] Try to use DLA core, but platform doesn't have any DLA core";
-		  dla_enable_ = false;
+          dla_enable_ = false;
         } else {
           if (dla_core_ >= number_of_dla_core) {
             LOGS_DEFAULT(WARNING) << "[TensorRT EP] Try to use DLA core #" << dla_core_ << ", but it exceeds platform's maximum DLA core number " << number_of_dla_core << ". Use DLA core 0 instead.";
@@ -1198,7 +1215,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
           trt_config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);
           trt_config->setDLACore(dla_core_);
           trt_node_name_with_precision += "_dlacore" + std::to_string(dla_core_);
-		}
+        }
       }
     }
 
@@ -1318,7 +1335,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
       *p = {context->allocate_func, context->release_func, context->allocator_handle, &parsers_[context->node_name],
             &engines_[context->node_name], &contexts_[context->node_name], &builders_[context->node_name],
             &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name],
-            input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, dla_enable_, 
+            input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, dla_enable_,
             dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_, runtime_.get(), nullptr,
             allocator_, dynamic_range_map, engine_decryption_enable_, engine_decryption_};
       *state = p.release();
@@ -1579,10 +1596,10 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
 
         // Set DLA (DLA can only run with FP16 or INT8)
         if ((trt_state->fp16_enable || trt_state->int8_enable) && trt_state->dla_enable) {
-            LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] use DLA core " << trt_state->dla_core;
-            trt_config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
-            trt_config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);
-            trt_config->setDLACore(trt_state->dla_core);
+          LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] use DLA core " << trt_state->dla_core;
+          trt_config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
+          trt_config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);
+          trt_config->setDLACore(trt_state->dla_core);
         }
 
         // Build engine
@@ -1883,4 +1900,4 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
   }
   return Status::OK();
 }
-}  // namespace onnxruntime
+}  // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc
index a9c422bd78..129d3e1a55 100644
--- a/onnxruntime/python/onnxruntime_pybind_state.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state.cc
@@ -208,6 +208,7 @@ const OrtDevice::DeviceType OrtDevice::GPU;
 namespace onnxruntime {
 
 std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Tensorrt(const OrtTensorRTProviderOptions* params);
+std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Tensorrt(int device_id);
 std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_MIGraphX(int device_id);
 std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Dnnl(int use_arena);
 std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_OpenVINO(const OrtOpenVINOProviderOptions* params);
@@ -575,29 +576,28 @@ static void RegisterExecutionProviders(InferenceSession* sess, const std::vector
                                           sess->GetSessionOptions().enable_cpu_mem_arena));
     } else if (type == kTensorrtExecutionProvider) {
 #ifdef USE_TENSORRT
-      OrtTensorRTProviderOptions params{
-          0,
-          0,
-          nullptr,
-          1000,
-          1,
-          1 << 30,
-          0,
-          0,
-          nullptr,
-          0,
-          0,
-          0,
-          0,
-          0,
-          nullptr,
-          0,
-          nullptr,
-          0};
-
       std::string calibration_table, cache_path, lib_path;
       auto it = provider_options_map.find(type);
       if (it != provider_options_map.end()) {
+        OrtTensorRTProviderOptions params{
+            0,
+            0,
+            nullptr,
+            1000,
+            1,
+            1 << 30,
+            0,
+            0,
+            nullptr,
+            0,
+            0,
+            0,
+            0,
+            0,
+            nullptr,
+            0,
+            nullptr,
+            0};
         for (auto option : it->second) {
           if (option.first == "device_id") {
             if (!option.second.empty()) {
@@ -718,8 +718,10 @@ static void RegisterExecutionProviders(InferenceSession* sess, const std::vector
             ORT_THROW("Invalid TensorRT EP option: ", option.first);
           }
         }
+        RegisterExecutionProvider(sess, *onnxruntime::CreateExecutionProviderFactory_Tensorrt(&params));
+      } else {
+        RegisterExecutionProvider(sess, *onnxruntime::CreateExecutionProviderFactory_Tensorrt(cuda_device_id));
       }
-      RegisterExecutionProvider(sess, *onnxruntime::CreateExecutionProviderFactory_Tensorrt(&params));
 #endif
     } else if (type == kMIGraphXExecutionProvider) {
 #ifdef USE_MIGRAPHX
diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc
index 18835ca2d9..7ad02d7b32 100644
--- a/onnxruntime/test/perftest/command_args_parser.cc
+++ b/onnxruntime/test/perftest/command_args_parser.cc
@@ -76,8 +76,6 @@ namespace perftest {
       "\t    [TensorRT only] [trt_dump_subgraphs]: Dump TRT subgraph to onnx model.\n"
       "\t    [TensorRT only] [trt_engine_cache_enable]: Enable engine caching.\n"
       "\t    [TensorRT only] [trt_engine_cache_path]: Specify engine cache path.\n"
-      "\t    [TensorRT only] [trt_engine_decryption_enable]: (experimental feature) Enable engine decryption.\n"
-      "\t    [TensorRT only] [trt_engine_decryption_lib_path]: (experimental feature) Specify engine decryption library path.\n"
       "\t    [TensorRT only] [trt_force_sequential_engine_build]: Force TensorRT engines to be built sequentially.\n"
       "\t [Usage]: -e <provider_name> -i '<key1>|<value1> <key2>|<value2>'\n\n"
       "\t [Example] [For TensorRT EP] -e tensorrt -i 'trt_fp16_enable|true trt_int8_enable|true trt_int8_calibration_table_name|calibration.flatbuffers trt_int8_use_native_calibration_table|false trt_force_sequential_engine_build|false'\n"
diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py
index cdb543ef84..a9f45f82fa 100644
--- a/onnxruntime/test/python/onnxruntime_test_python.py
+++ b/onnxruntime/test/python/onnxruntime_test_python.py
@@ -72,53 +72,28 @@ class TestInferenceSession(unittest.TestCase):
             self.assertIn('trt_max_partition_iterations', option)
             self.assertIn('trt_min_subgraph_size', option)
             self.assertIn('trt_max_workspace_size', option)
-            self.assertIn('trt_fp16_enable', option)
-            self.assertIn('trt_int8_enable', option)
-            self.assertIn('trt_int8_calibration_table_name', option)
-            self.assertIn('trt_int8_use_native_calibration_table', option)
-            self.assertIn('trt_dla_enable', option)      
-            self.assertIn('trt_dla_core', option)
             self.assertIn('trt_dump_subgraphs', option)
             self.assertIn('trt_engine_cache_enable', option)
             self.assertIn('trt_engine_cache_path', option)
-            self.assertIn('trt_engine_decryption_enable', option)
-            self.assertIn('trt_engine_decryption_lib_path', option)
             self.assertIn('trt_force_sequential_engine_build', option)
 
             max_partition_iterations = option['trt_max_partition_iterations']
             new_max_partition_iterations = int(max_partition_iterations) + 1
             min_subgraph_size = option['trt_min_subgraph_size']
-            new_min_subgraph_size = int(max_partition_iterations) + 1
+            new_min_subgraph_size = int(min_subgraph_size) + 1
             ori_max_workspace_size = option['trt_max_workspace_size']
             new_max_workspace_size = int(ori_max_workspace_size) // 2
-            dla_core = option['trt_dla_core']
-            new_dla_core = int(dla_core) + 1
 
             option = {}
             option['trt_max_partition_iterations'] = new_max_partition_iterations
             option['trt_min_subgraph_size'] = new_min_subgraph_size
             option['trt_max_workspace_size'] = new_max_workspace_size
-            fp16_enable = "true"
-            option['trt_fp16_enable'] = fp16_enable
-            int8_enable = "false"
-            option['trt_int8_enable'] = int8_enable
-            calib_table_name = '/home/onnxruntime/table.flatbuffers'
-            option['trt_int8_calibration_table_name'] = calib_table_name
-            int8_use_native_calibration_table = "true"
-            option['trt_int8_use_native_calibration_table'] = int8_use_native_calibration_table
-            dla_enable = "true"
-            option['trt_dla_enable'] = dla_enable
-            option['trt_dla_core'] = new_dla_core
             dump_subgraphs = "true"
             option['trt_dump_subgraphs'] = dump_subgraphs
             engine_cache_enable = "true"
             option['trt_engine_cache_enable'] = engine_cache_enable
-            engine_cache_path = '/home/onnxruntime/engine_cache'
+            engine_cache_path = './engine_cache'
             option['trt_engine_cache_path'] = engine_cache_path
-            engine_decryption_enable = "true"
-            option['trt_engine_decryption_enable'] = engine_decryption_enable
-            engine_decryption_lib_path = '/home/onnxruntime/decryption_lib'
-            option['trt_engine_decryption_lib_path'] = engine_decryption_lib_path
             force_sequential_engine_build = "true"
             option['trt_force_sequential_engine_build'] = force_sequential_engine_build
             sess.set_providers(['TensorrtExecutionProvider'], [option])
@@ -128,17 +103,9 @@ class TestInferenceSession(unittest.TestCase):
             self.assertEqual(option['trt_max_partition_iterations'], str(new_max_partition_iterations))
             self.assertEqual(option['trt_min_subgraph_size'], str(new_min_subgraph_size))
             self.assertEqual(option['trt_max_workspace_size'], str(new_max_workspace_size))
-            self.assertEqual(option['trt_int8_calibration_table_name'], str(calib_table_name))
-            self.assertEqual(option['trt_fp16_enable'], '1')
-            self.assertEqual(option['trt_int8_enable'], '0')
-            self.assertEqual(option['trt_int8_use_native_calibration_table'], '1')
-            self.assertEqual(option['trt_dla_enable'], '1')
-            self.assertEqual(option['trt_dla_core'], str(new_dla_core))
             self.assertEqual(option['trt_dump_subgraphs'], '1')
             self.assertEqual(option['trt_engine_cache_enable'], '1')
             self.assertEqual(option['trt_engine_cache_path'], str(engine_cache_path))
-            self.assertEqual(option['trt_engine_decryption_enable'], '1')
-            self.assertEqual(option['engine_decryption_lib_path'], str(engine_decryption_lib_path))
             self.assertEqual(option['trt_force_sequential_engine_build'], '1')
 
             # We currently disable following test code since that not all test machines/GPUs have nvidia int8 capability