From a6972c8782f1477b8ef557e2c54ac0075beacdb8 Mon Sep 17 00:00:00 2001 From: stevenlix <38092805+stevenlix@users.noreply.github.com> Date: Mon, 17 May 2021 23:07:27 -0700 Subject: [PATCH] Fix issues in TensorRT provider options (#7738) * add legacy env variable support in pybind * formatting code --- .../tensorrt/tensorrt_execution_provider.cc | 65 ++++++++++++------- .../python/onnxruntime_pybind_state.cc | 44 +++++++------ .../test/perftest/command_args_parser.cc | 2 - .../test/python/onnxruntime_test_python.py | 37 +---------- 4 files changed, 66 insertions(+), 82 deletions(-) diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index ff05b6bea3..32d77142f5 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -145,7 +145,7 @@ bool FindCycleHelper(int i, const std::list* adjacency_map, bool visited[], * Read calibration table for INT8 quantization * Two kind of calibration tables are supported, * 1. ORT generated calibration table -* The table is pre-serialized by flatbuffers. +* The table is pre-serialized by flatbuffers. 
* Each entry in the table is a key-value pair, * key: tensor name, value: maximum absolute value in floating point * For example, @@ -404,7 +404,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv int8_calibration_cache_name_ = info.int8_calibration_table_name; int8_use_native_tensorrt_calibration_table_ = info.int8_use_native_calibration_table; } - if (fp16_enable_ || int8_enable_) { // DLA can only be enabled with FP16 or INT8 + if (fp16_enable_ || int8_enable_) { // DLA can only be enabled with FP16 or INT8 dla_enable_ = info.dla_enable; dla_core_ = info.dla_core; } @@ -456,13 +456,13 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv } } - if (fp16_enable_ || int8_enable_) { // DLA can only be enabled with FP16 or INT8 + if (fp16_enable_ || int8_enable_) { // DLA can only be enabled with FP16 or INT8 const std::string dla_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDLAEnable); if (!dla_enable_env.empty()) { dla_enable_ = (std::stoi(dla_enable_env) == 0 ? false : true); } - - if (dla_enable_) { + + if (dla_enable_) { const std::string dla_core_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDLACore); if (!dla_core_env.empty()) { dla_core_ = std::stoi(dla_core_env); @@ -488,12 +488,12 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv LOGS_DEFAULT(WARNING) << "[TensorRT EP] ORT_TENSORRT_ENGINE_CACHE_PATH is deprecated! Please use ORT_TENSORRT_CACHE_PATH to specify engine cache path"; } } - + const std::string engine_decryption_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDecryptionEnable); if (!engine_decryption_enable_env.empty()) { engine_decryption_enable_ = (std::stoi(engine_decryption_enable_env) == 0 ? 
false : true); } - + if (engine_decryption_enable_) { engine_decryption_lib_path_ = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDecryptionLibPath); } @@ -510,15 +510,15 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv max_partition_iterations_ = 1000; } if (min_subgraph_size_ <= 0) { - LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_min_subgraph_size must be a positive integer value. Set it to 1"; + LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_min_subgraph_size must be a positive integer value. Set it to 1"; min_subgraph_size_ = 1; - } + } if (max_workspace_size_ <= 0) { - LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_max_workspace_size must be a positive integer value. Set it to 1073741824 (1GB)"; + LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_max_workspace_size must be a positive integer value. Set it to 1073741824 (1GB)"; max_workspace_size_ = 1 << 30; } if (dla_core_ < 0) { - LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_dla_core must be a non-negative integer value. Set it to 0"; + LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_dla_core must be a non-negative integer value. 
Set it to 0"; dla_core_ = 0; } @@ -534,11 +534,28 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv if (engine_decryption_enable_) { LIBTYPE handle = OPENLIB(engine_decryption_lib_path_.c_str()); if (handle == nullptr) { - ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not open shared library from " + engine_decryption_lib_path_); + ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not open shared library from " + engine_decryption_lib_path_)); } engine_decryption_ = (int (*)(const char*, char*, size_t*))LIBFUNC(handle, "decrypt"); } + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] TensorRT provider options: " + << "device_id: " << device_id_ + << ", trt_max_partition_iterations: " << max_partition_iterations_ + << ", trt_min_subgraph_size: " << min_subgraph_size_ + << ", trt_max_workspace_size: " << max_workspace_size_ + << ", trt_fp16_enable: " << fp16_enable_ + << ", trt_int8_enable: " << int8_enable_ + << ", trt_int8_calibration_cache_name: " << int8_calibration_cache_name_ + << ", trt_int8_use_native_tensorrt_calibration_table: " << int8_use_native_tensorrt_calibration_table_ + << ", trt_dla_enable: " << dla_enable_ + << ", trt_dla_core: " << dla_core_ + << ", trt_dump_subgraphs: " << dump_subgraphs_ + << ", trt_engine_cache_enable: " << engine_cache_enable_ + << ", trt_cache_path: " << cache_path_ + << ", trt_engine_decryption_enable: " << engine_decryption_enable_ + << ", trt_engine_decryption_lib_path: " << engine_decryption_lib_path_ + << ", trt_force_sequential_engine_build: " << force_sequential_engine_build_; } TensorrtExecutionProvider::~TensorrtExecutionProvider() { @@ -862,7 +879,7 @@ SubGraphCollection_t TensorrtExecutionProvider::GetSupportedList(SubGraphCollect model_proto->SerializeToString(string_buf); if (dump_subgraphs_) { - // Dump TensorRT subgraph for debugging if enabled via ORT_TENSORRT_DUMP_SUBGRAPHS env variable. 
+ // Dump TensorRT subgraph for debugging std::fstream dump("TensorrtExecutionProvider_TRT_Subgraph.onnx", std::ios::out | std::ios::trunc | std::ios::binary); model_proto->SerializeToOstream(dump); } @@ -1098,7 +1115,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse model_proto->SerializeToString(string_buf); if (dump_subgraphs_) { - // Dump the TensorRT subgraph if enabled via ORT_TENSORRT_DUMP_SUBGRAPHS env variable. + // Dump TensorRT subgraphs std::fstream dump(fused_node->Name() + ".onnx", std::ios::out | std::ios::trunc | std::ios::binary); model_proto->SerializeToOstream(dump); } @@ -1183,11 +1200,11 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse // Set DLA if (fp16_enable_ || int8_enable_) { - if (dla_enable_ && dla_core_ >= 0) {//DLA can only run with FP16 and INT8 + if (dla_enable_ && dla_core_ >= 0) { //DLA can only run with FP16 and INT8 int number_of_dla_core = trt_builder->getNbDLACores(); if (number_of_dla_core == 0) { LOGS_DEFAULT(WARNING) << "[TensorRT EP] Try to use DLA core, but platform doesn't have any DLA core"; - dla_enable_ = false; + dla_enable_ = false; } else { if (dla_core_ >= number_of_dla_core) { LOGS_DEFAULT(WARNING) << "[TensorRT EP] Try to use DLA core #" << dla_core_ << ", but it exceeds platform's maximum DLA core number " << number_of_dla_core << ". 
Use DLA core 0 instead."; @@ -1198,7 +1215,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse trt_config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA); trt_config->setDLACore(dla_core_); trt_node_name_with_precision += "_dlacore" + std::to_string(dla_core_); - } + } } } @@ -1318,7 +1335,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse *p = {context->allocate_func, context->release_func, context->allocator_handle, &parsers_[context->node_name], &engines_[context->node_name], &contexts_[context->node_name], &builders_[context->node_name], &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name], - input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, dla_enable_, + input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_, runtime_.get(), nullptr, allocator_, dynamic_range_map, engine_decryption_enable_, engine_decryption_}; *state = p.release(); @@ -1579,10 +1596,10 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse // Set DLA (DLA can only run with FP16 or INT8) if ((trt_state->fp16_enable || trt_state->int8_enable) && trt_state->dla_enable) { - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] use DLA core " << trt_state->dla_core; - trt_config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); - trt_config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA); - trt_config->setDLACore(trt_state->dla_core); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] use DLA core " << trt_state->dla_core; + trt_config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); + trt_config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA); + trt_config->setDLACore(trt_state->dla_core); } // Build engine @@ -1883,4 +1900,4 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse } return Status::OK(); } 
-} // namespace onnxruntime +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index a9c422bd78..129d3e1a55 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -208,6 +208,7 @@ const OrtDevice::DeviceType OrtDevice::GPU; namespace onnxruntime { std::shared_ptr CreateExecutionProviderFactory_Tensorrt(const OrtTensorRTProviderOptions* params); +std::shared_ptr CreateExecutionProviderFactory_Tensorrt(int device_id); std::shared_ptr CreateExecutionProviderFactory_MIGraphX(int device_id); std::shared_ptr CreateExecutionProviderFactory_Dnnl(int use_arena); std::shared_ptr CreateExecutionProviderFactory_OpenVINO(const OrtOpenVINOProviderOptions* params); @@ -575,29 +576,28 @@ static void RegisterExecutionProviders(InferenceSession* sess, const std::vector sess->GetSessionOptions().enable_cpu_mem_arena)); } else if (type == kTensorrtExecutionProvider) { #ifdef USE_TENSORRT - OrtTensorRTProviderOptions params{ - 0, - 0, - nullptr, - 1000, - 1, - 1 << 30, - 0, - 0, - nullptr, - 0, - 0, - 0, - 0, - 0, - nullptr, - 0, - nullptr, - 0}; std::string calibration_table, cache_path, lib_path; auto it = provider_options_map.find(type); if (it != provider_options_map.end()) { + OrtTensorRTProviderOptions params{ + 0, + 0, + nullptr, + 1000, + 1, + 1 << 30, + 0, + 0, + nullptr, + 0, + 0, + 0, + 0, + 0, + nullptr, + 0, + nullptr, + 0}; for (auto option : it->second) { if (option.first == "device_id") { if (!option.second.empty()) { @@ -718,8 +718,10 @@ static void RegisterExecutionProviders(InferenceSession* sess, const std::vector ORT_THROW("Invalid TensorRT EP option: ", option.first); } } + RegisterExecutionProvider(sess, *onnxruntime::CreateExecutionProviderFactory_Tensorrt(&params)); + } else { + RegisterExecutionProvider(sess, *onnxruntime::CreateExecutionProviderFactory_Tensorrt(cuda_device_id)); } - 
RegisterExecutionProvider(sess, *onnxruntime::CreateExecutionProviderFactory_Tensorrt(&params)); #endif } else if (type == kMIGraphXExecutionProvider) { #ifdef USE_MIGRAPHX diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index 18835ca2d9..7ad02d7b32 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -76,8 +76,6 @@ namespace perftest { "\t [TensorRT only] [trt_dump_subgraphs]: Dump TRT subgraph to onnx model.\n" "\t [TensorRT only] [trt_engine_cache_enable]: Enable engine caching.\n" "\t [TensorRT only] [trt_engine_cache_path]: Specify engine cache path.\n" - "\t [TensorRT only] [trt_engine_decryption_enable]: (experimental feature) Enable engine decryption.\n" - "\t [TensorRT only] [trt_engine_decryption_lib_path]: (experimental feature) Specify engine decryption library path.\n" "\t [TensorRT only] [trt_force_sequential_engine_build]: Force TensorRT engines to be built sequentially.\n" "\t [Usage]: -e -i '| |'\n\n" "\t [Example] [For TensorRT EP] -e tensorrt -i 'trt_fp16_enable|true trt_int8_enable|true trt_int8_calibration_table_name|calibration.flatbuffers trt_int8_use_native_calibration_table|false trt_force_sequential_engine_build|false'\n" diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index cdb543ef84..a9f45f82fa 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -72,53 +72,28 @@ class TestInferenceSession(unittest.TestCase): self.assertIn('trt_max_partition_iterations', option) self.assertIn('trt_min_subgraph_size', option) self.assertIn('trt_max_workspace_size', option) - self.assertIn('trt_fp16_enable', option) - self.assertIn('trt_int8_enable', option) - self.assertIn('trt_int8_calibration_table_name', option) - self.assertIn('trt_int8_use_native_calibration_table', option) - 
self.assertIn('trt_dla_enable', option) - self.assertIn('trt_dla_core', option) self.assertIn('trt_dump_subgraphs', option) self.assertIn('trt_engine_cache_enable', option) self.assertIn('trt_engine_cache_path', option) - self.assertIn('trt_engine_decryption_enable', option) - self.assertIn('trt_engine_decryption_lib_path', option) self.assertIn('trt_force_sequential_engine_build', option) max_partition_iterations = option['trt_max_partition_iterations'] new_max_partition_iterations = int(max_partition_iterations) + 1 min_subgraph_size = option['trt_min_subgraph_size'] - new_min_subgraph_size = int(max_partition_iterations) + 1 + new_min_subgraph_size = int(min_subgraph_size) + 1 ori_max_workspace_size = option['trt_max_workspace_size'] new_max_workspace_size = int(ori_max_workspace_size) // 2 - dla_core = option['trt_dla_core'] - new_dla_core = int(dla_core) + 1 option = {} option['trt_max_partition_iterations'] = new_max_partition_iterations option['trt_min_subgraph_size'] = new_min_subgraph_size option['trt_max_workspace_size'] = new_max_workspace_size - fp16_enable = "true" - option['trt_fp16_enable'] = fp16_enable - int8_enable = "false" - option['trt_int8_enable'] = int8_enable - calib_table_name = '/home/onnxruntime/table.flatbuffers' - option['trt_int8_calibration_table_name'] = calib_table_name - int8_use_native_calibration_table = "true" - option['trt_int8_use_native_calibration_table'] = int8_use_native_calibration_table - dla_enable = "true" - option['trt_dla_enable'] = dla_enable - option['trt_dla_core'] = new_dla_core dump_subgraphs = "true" option['trt_dump_subgraphs'] = dump_subgraphs engine_cache_enable = "true" option['trt_engine_cache_enable'] = engine_cache_enable - engine_cache_path = '/home/onnxruntime/engine_cache' + engine_cache_path = './engine_cache' option['trt_engine_cache_path'] = engine_cache_path - engine_decryption_enable = "true" - option['trt_engine_decryption_enable'] = engine_decryption_enable - engine_decryption_lib_path = 
'/home/onnxruntime/decryption_lib' - option['trt_engine_decryption_lib_path'] = engine_decryption_lib_path force_sequential_engine_build = "true" option['trt_force_sequential_engine_build'] = force_sequential_engine_build sess.set_providers(['TensorrtExecutionProvider'], [option]) @@ -128,17 +103,9 @@ class TestInferenceSession(unittest.TestCase): self.assertEqual(option['trt_max_partition_iterations'], str(new_max_partition_iterations)) self.assertEqual(option['trt_min_subgraph_size'], str(new_min_subgraph_size)) self.assertEqual(option['trt_max_workspace_size'], str(new_max_workspace_size)) - self.assertEqual(option['trt_int8_calibration_table_name'], str(calib_table_name)) - self.assertEqual(option['trt_fp16_enable'], '1') - self.assertEqual(option['trt_int8_enable'], '0') - self.assertEqual(option['trt_int8_use_native_calibration_table'], '1') - self.assertEqual(option['trt_dla_enable'], '1') - self.assertEqual(option['trt_dla_core'], str(new_dla_core)) self.assertEqual(option['trt_dump_subgraphs'], '1') self.assertEqual(option['trt_engine_cache_enable'], '1') self.assertEqual(option['trt_engine_cache_path'], str(engine_cache_path)) - self.assertEqual(option['trt_engine_decryption_enable'], '1') - self.assertEqual(option['engine_decryption_lib_path'], str(engine_decryption_lib_path)) self.assertEqual(option['trt_force_sequential_engine_build'], '1') # We currently disable following test code since that not all test machines/GPUs have nvidia int8 capability