Fix issues in TensorRT provider options (#7738)

* add legacy env variable support in pybind

* formatting code
This commit is contained in:
stevenlix 2021-05-17 23:07:27 -07:00 committed by GitHub
parent e9057d2e49
commit a6972c8782
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 66 additions and 82 deletions

View file

@ -145,7 +145,7 @@ bool FindCycleHelper(int i, const std::list<int>* adjacency_map, bool visited[],
* Read calibration table for INT8 quantization
* Two kinds of calibration tables are supported,
* 1. ORT generated calibration table
* The table is pre-serialized by flatbuffers.
* The table is pre-serialized by flatbuffers.
* Each entry in the table is a key-value pair,
* key: tensor name, value: maximum absolute value in floating point
* For example,
@ -404,7 +404,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
int8_calibration_cache_name_ = info.int8_calibration_table_name;
int8_use_native_tensorrt_calibration_table_ = info.int8_use_native_calibration_table;
}
if (fp16_enable_ || int8_enable_) { // DLA can only be enabled with FP16 or INT8
if (fp16_enable_ || int8_enable_) { // DLA can only be enabled with FP16 or INT8
dla_enable_ = info.dla_enable;
dla_core_ = info.dla_core;
}
@ -456,13 +456,13 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
}
}
if (fp16_enable_ || int8_enable_) { // DLA can only be enabled with FP16 or INT8
if (fp16_enable_ || int8_enable_) { // DLA can only be enabled with FP16 or INT8
const std::string dla_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDLAEnable);
if (!dla_enable_env.empty()) {
dla_enable_ = (std::stoi(dla_enable_env) == 0 ? false : true);
}
if (dla_enable_) {
if (dla_enable_) {
const std::string dla_core_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDLACore);
if (!dla_core_env.empty()) {
dla_core_ = std::stoi(dla_core_env);
@ -488,12 +488,12 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
LOGS_DEFAULT(WARNING) << "[TensorRT EP] ORT_TENSORRT_ENGINE_CACHE_PATH is deprecated! Please use ORT_TENSORRT_CACHE_PATH to specify engine cache path";
}
}
const std::string engine_decryption_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDecryptionEnable);
if (!engine_decryption_enable_env.empty()) {
engine_decryption_enable_ = (std::stoi(engine_decryption_enable_env) == 0 ? false : true);
}
if (engine_decryption_enable_) {
engine_decryption_lib_path_ = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDecryptionLibPath);
}
@ -510,15 +510,15 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
max_partition_iterations_ = 1000;
}
if (min_subgraph_size_ <= 0) {
LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_min_subgraph_size must be a positive integer value. Set it to 1";
LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_min_subgraph_size must be a positive integer value. Set it to 1";
min_subgraph_size_ = 1;
}
}
if (max_workspace_size_ <= 0) {
LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_max_workspace_size must be a positive integer value. Set it to 1073741824 (1GB)";
LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_max_workspace_size must be a positive integer value. Set it to 1073741824 (1GB)";
max_workspace_size_ = 1 << 30;
}
if (dla_core_ < 0) {
LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_dla_core must be a non-negative integer value. Set it to 0";
LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_dla_core must be a non-negative integer value. Set it to 0";
dla_core_ = 0;
}
@ -534,11 +534,28 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
if (engine_decryption_enable_) {
LIBTYPE handle = OPENLIB(engine_decryption_lib_path_.c_str());
if (handle == nullptr) {
ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not open shared library from " + engine_decryption_lib_path_);
ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not open shared library from " + engine_decryption_lib_path_));
}
engine_decryption_ = (int (*)(const char*, char*, size_t*))LIBFUNC(handle, "decrypt");
}
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] TensorRT provider options: "
<< "device_id: " << device_id_
<< ", trt_max_partition_iterations: " << max_partition_iterations_
<< ", trt_min_subgraph_size: " << min_subgraph_size_
<< ", trt_max_workspace_size: " << max_workspace_size_
<< ", trt_fp16_enable: " << fp16_enable_
<< ", trt_int8_enable: " << int8_enable_
<< ", trt_int8_calibration_cache_name: " << int8_calibration_cache_name_
<< ", trt_int8_use_native_tensorrt_calibration_table: " << int8_use_native_tensorrt_calibration_table_
<< ", trt_dla_enable: " << dla_enable_
<< ", trt_dla_core: " << dla_core_
<< ", trt_dump_subgraphs: " << dump_subgraphs_
<< ", trt_engine_cache_enable: " << engine_cache_enable_
<< ", trt_cache_path: " << cache_path_
<< ", trt_engine_decryption_enable: " << engine_decryption_enable_
<< ", trt_engine_decryption_lib_path: " << engine_decryption_lib_path_
<< ", trt_force_sequential_engine_build: " << force_sequential_engine_build_;
}
TensorrtExecutionProvider::~TensorrtExecutionProvider() {
@ -862,7 +879,7 @@ SubGraphCollection_t TensorrtExecutionProvider::GetSupportedList(SubGraphCollect
model_proto->SerializeToString(string_buf);
if (dump_subgraphs_) {
// Dump TensorRT subgraph for debugging if enabled via ORT_TENSORRT_DUMP_SUBGRAPHS env variable.
// Dump TensorRT subgraph for debugging
std::fstream dump("TensorrtExecutionProvider_TRT_Subgraph.onnx", std::ios::out | std::ios::trunc | std::ios::binary);
model_proto->SerializeToOstream(dump);
}
@ -1098,7 +1115,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
model_proto->SerializeToString(string_buf);
if (dump_subgraphs_) {
// Dump the TensorRT subgraph if enabled via ORT_TENSORRT_DUMP_SUBGRAPHS env variable.
// Dump TensorRT subgraphs
std::fstream dump(fused_node->Name() + ".onnx", std::ios::out | std::ios::trunc | std::ios::binary);
model_proto->SerializeToOstream(dump);
}
@ -1183,11 +1200,11 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
// Set DLA
if (fp16_enable_ || int8_enable_) {
if (dla_enable_ && dla_core_ >= 0) {//DLA can only run with FP16 and INT8
if (dla_enable_ && dla_core_ >= 0) { //DLA can only run with FP16 and INT8
int number_of_dla_core = trt_builder->getNbDLACores();
if (number_of_dla_core == 0) {
LOGS_DEFAULT(WARNING) << "[TensorRT EP] Try to use DLA core, but platform doesn't have any DLA core";
dla_enable_ = false;
dla_enable_ = false;
} else {
if (dla_core_ >= number_of_dla_core) {
LOGS_DEFAULT(WARNING) << "[TensorRT EP] Try to use DLA core #" << dla_core_ << ", but it exceeds platform's maximum DLA core number " << number_of_dla_core << ". Use DLA core 0 instead.";
@ -1198,7 +1215,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
trt_config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);
trt_config->setDLACore(dla_core_);
trt_node_name_with_precision += "_dlacore" + std::to_string(dla_core_);
}
}
}
}
@ -1318,7 +1335,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
*p = {context->allocate_func, context->release_func, context->allocator_handle, &parsers_[context->node_name],
&engines_[context->node_name], &contexts_[context->node_name], &builders_[context->node_name],
&networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name],
input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, dla_enable_,
input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, dla_enable_,
dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_, runtime_.get(), nullptr,
allocator_, dynamic_range_map, engine_decryption_enable_, engine_decryption_};
*state = p.release();
@ -1579,10 +1596,10 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
// Set DLA (DLA can only run with FP16 or INT8)
if ((trt_state->fp16_enable || trt_state->int8_enable) && trt_state->dla_enable) {
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] use DLA core " << trt_state->dla_core;
trt_config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
trt_config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);
trt_config->setDLACore(trt_state->dla_core);
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] use DLA core " << trt_state->dla_core;
trt_config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
trt_config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);
trt_config->setDLACore(trt_state->dla_core);
}
// Build engine
@ -1883,4 +1900,4 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
}
return Status::OK();
}
} // namespace onnxruntime
} // namespace onnxruntime

View file

@ -208,6 +208,7 @@ const OrtDevice::DeviceType OrtDevice::GPU;
namespace onnxruntime {
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Tensorrt(const OrtTensorRTProviderOptions* params);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Tensorrt(int device_id);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_MIGraphX(int device_id);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Dnnl(int use_arena);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_OpenVINO(const OrtOpenVINOProviderOptions* params);
@ -575,29 +576,28 @@ static void RegisterExecutionProviders(InferenceSession* sess, const std::vector
sess->GetSessionOptions().enable_cpu_mem_arena));
} else if (type == kTensorrtExecutionProvider) {
#ifdef USE_TENSORRT
OrtTensorRTProviderOptions params{
0,
0,
nullptr,
1000,
1,
1 << 30,
0,
0,
nullptr,
0,
0,
0,
0,
0,
nullptr,
0,
nullptr,
0};
std::string calibration_table, cache_path, lib_path;
auto it = provider_options_map.find(type);
if (it != provider_options_map.end()) {
OrtTensorRTProviderOptions params{
0,
0,
nullptr,
1000,
1,
1 << 30,
0,
0,
nullptr,
0,
0,
0,
0,
0,
nullptr,
0,
nullptr,
0};
for (auto option : it->second) {
if (option.first == "device_id") {
if (!option.second.empty()) {
@ -718,8 +718,10 @@ static void RegisterExecutionProviders(InferenceSession* sess, const std::vector
ORT_THROW("Invalid TensorRT EP option: ", option.first);
}
}
RegisterExecutionProvider(sess, *onnxruntime::CreateExecutionProviderFactory_Tensorrt(&params));
} else {
RegisterExecutionProvider(sess, *onnxruntime::CreateExecutionProviderFactory_Tensorrt(cuda_device_id));
}
RegisterExecutionProvider(sess, *onnxruntime::CreateExecutionProviderFactory_Tensorrt(&params));
#endif
} else if (type == kMIGraphXExecutionProvider) {
#ifdef USE_MIGRAPHX

View file

@ -76,8 +76,6 @@ namespace perftest {
"\t [TensorRT only] [trt_dump_subgraphs]: Dump TRT subgraph to onnx model.\n"
"\t [TensorRT only] [trt_engine_cache_enable]: Enable engine caching.\n"
"\t [TensorRT only] [trt_engine_cache_path]: Specify engine cache path.\n"
"\t [TensorRT only] [trt_engine_decryption_enable]: (experimental feature) Enable engine decryption.\n"
"\t [TensorRT only] [trt_engine_decryption_lib_path]: (experimental feature) Specify engine decryption library path.\n"
"\t [TensorRT only] [trt_force_sequential_engine_build]: Force TensorRT engines to be built sequentially.\n"
"\t [Usage]: -e <provider_name> -i '<key1>|<value1> <key2>|<value2>'\n\n"
"\t [Example] [For TensorRT EP] -e tensorrt -i 'trt_fp16_enable|true trt_int8_enable|true trt_int8_calibration_table_name|calibration.flatbuffers trt_int8_use_native_calibration_table|false trt_force_sequential_engine_build|false'\n"

View file

@ -72,53 +72,28 @@ class TestInferenceSession(unittest.TestCase):
self.assertIn('trt_max_partition_iterations', option)
self.assertIn('trt_min_subgraph_size', option)
self.assertIn('trt_max_workspace_size', option)
self.assertIn('trt_fp16_enable', option)
self.assertIn('trt_int8_enable', option)
self.assertIn('trt_int8_calibration_table_name', option)
self.assertIn('trt_int8_use_native_calibration_table', option)
self.assertIn('trt_dla_enable', option)
self.assertIn('trt_dla_core', option)
self.assertIn('trt_dump_subgraphs', option)
self.assertIn('trt_engine_cache_enable', option)
self.assertIn('trt_engine_cache_path', option)
self.assertIn('trt_engine_decryption_enable', option)
self.assertIn('trt_engine_decryption_lib_path', option)
self.assertIn('trt_force_sequential_engine_build', option)
max_partition_iterations = option['trt_max_partition_iterations']
new_max_partition_iterations = int(max_partition_iterations) + 1
min_subgraph_size = option['trt_min_subgraph_size']
new_min_subgraph_size = int(max_partition_iterations) + 1
new_min_subgraph_size = int(min_subgraph_size) + 1
ori_max_workspace_size = option['trt_max_workspace_size']
new_max_workspace_size = int(ori_max_workspace_size) // 2
dla_core = option['trt_dla_core']
new_dla_core = int(dla_core) + 1
option = {}
option['trt_max_partition_iterations'] = new_max_partition_iterations
option['trt_min_subgraph_size'] = new_min_subgraph_size
option['trt_max_workspace_size'] = new_max_workspace_size
fp16_enable = "true"
option['trt_fp16_enable'] = fp16_enable
int8_enable = "false"
option['trt_int8_enable'] = int8_enable
calib_table_name = '/home/onnxruntime/table.flatbuffers'
option['trt_int8_calibration_table_name'] = calib_table_name
int8_use_native_calibration_table = "true"
option['trt_int8_use_native_calibration_table'] = int8_use_native_calibration_table
dla_enable = "true"
option['trt_dla_enable'] = dla_enable
option['trt_dla_core'] = new_dla_core
dump_subgraphs = "true"
option['trt_dump_subgraphs'] = dump_subgraphs
engine_cache_enable = "true"
option['trt_engine_cache_enable'] = engine_cache_enable
engine_cache_path = '/home/onnxruntime/engine_cache'
engine_cache_path = './engine_cache'
option['trt_engine_cache_path'] = engine_cache_path
engine_decryption_enable = "true"
option['trt_engine_decryption_enable'] = engine_decryption_enable
engine_decryption_lib_path = '/home/onnxruntime/decryption_lib'
option['trt_engine_decryption_lib_path'] = engine_decryption_lib_path
force_sequential_engine_build = "true"
option['trt_force_sequential_engine_build'] = force_sequential_engine_build
sess.set_providers(['TensorrtExecutionProvider'], [option])
@ -128,17 +103,9 @@ class TestInferenceSession(unittest.TestCase):
self.assertEqual(option['trt_max_partition_iterations'], str(new_max_partition_iterations))
self.assertEqual(option['trt_min_subgraph_size'], str(new_min_subgraph_size))
self.assertEqual(option['trt_max_workspace_size'], str(new_max_workspace_size))
self.assertEqual(option['trt_int8_calibration_table_name'], str(calib_table_name))
self.assertEqual(option['trt_fp16_enable'], '1')
self.assertEqual(option['trt_int8_enable'], '0')
self.assertEqual(option['trt_int8_use_native_calibration_table'], '1')
self.assertEqual(option['trt_dla_enable'], '1')
self.assertEqual(option['trt_dla_core'], str(new_dla_core))
self.assertEqual(option['trt_dump_subgraphs'], '1')
self.assertEqual(option['trt_engine_cache_enable'], '1')
self.assertEqual(option['trt_engine_cache_path'], str(engine_cache_path))
self.assertEqual(option['trt_engine_decryption_enable'], '1')
self.assertEqual(option['engine_decryption_lib_path'], str(engine_decryption_lib_path))
self.assertEqual(option['trt_force_sequential_engine_build'], '1')
# We currently disable the following test code because not all test machines/GPUs have NVIDIA int8 capability