mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-29 03:30:52 +00:00
Fix issues in TensorRT provider options (#7738)
* add legacy env variable support in pybind * formatting code
This commit is contained in:
parent
e9057d2e49
commit
a6972c8782
4 changed files with 66 additions and 82 deletions
|
|
@ -145,7 +145,7 @@ bool FindCycleHelper(int i, const std::list<int>* adjacency_map, bool visited[],
|
|||
* Read calibration table for INT8 quantization
|
||||
* Two kinds of calibration tables are supported,
|
||||
* 1. ORT generated calibration table
|
||||
* The table is pre-serialized by flatbuffers.
|
||||
* The table is pre-serialized by flatbuffers.
|
||||
* Each entry in the table is a key-value pair,
|
||||
* key: tensor name, value: maximum absolute value in floating point
|
||||
* For example,
|
||||
|
|
@ -404,7 +404,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
|
|||
int8_calibration_cache_name_ = info.int8_calibration_table_name;
|
||||
int8_use_native_tensorrt_calibration_table_ = info.int8_use_native_calibration_table;
|
||||
}
|
||||
if (fp16_enable_ || int8_enable_) { // DLA can only be enabled with FP16 or INT8
|
||||
if (fp16_enable_ || int8_enable_) { // DLA can only be enabled with FP16 or INT8
|
||||
dla_enable_ = info.dla_enable;
|
||||
dla_core_ = info.dla_core;
|
||||
}
|
||||
|
|
@ -456,13 +456,13 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
|
|||
}
|
||||
}
|
||||
|
||||
if (fp16_enable_ || int8_enable_) { // DLA can only be enabled with FP16 or INT8
|
||||
if (fp16_enable_ || int8_enable_) { // DLA can only be enabled with FP16 or INT8
|
||||
const std::string dla_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDLAEnable);
|
||||
if (!dla_enable_env.empty()) {
|
||||
dla_enable_ = (std::stoi(dla_enable_env) == 0 ? false : true);
|
||||
}
|
||||
|
||||
if (dla_enable_) {
|
||||
|
||||
if (dla_enable_) {
|
||||
const std::string dla_core_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDLACore);
|
||||
if (!dla_core_env.empty()) {
|
||||
dla_core_ = std::stoi(dla_core_env);
|
||||
|
|
@ -488,12 +488,12 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
|
|||
LOGS_DEFAULT(WARNING) << "[TensorRT EP] ORT_TENSORRT_ENGINE_CACHE_PATH is deprecated! Please use ORT_TENSORRT_CACHE_PATH to specify engine cache path";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
const std::string engine_decryption_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDecryptionEnable);
|
||||
if (!engine_decryption_enable_env.empty()) {
|
||||
engine_decryption_enable_ = (std::stoi(engine_decryption_enable_env) == 0 ? false : true);
|
||||
}
|
||||
|
||||
|
||||
if (engine_decryption_enable_) {
|
||||
engine_decryption_lib_path_ = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDecryptionLibPath);
|
||||
}
|
||||
|
|
@ -510,15 +510,15 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
|
|||
max_partition_iterations_ = 1000;
|
||||
}
|
||||
if (min_subgraph_size_ <= 0) {
|
||||
LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_min_subgraph_size must be a positive integer value. Set it to 1";
|
||||
LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_min_subgraph_size must be a positive integer value. Set it to 1";
|
||||
min_subgraph_size_ = 1;
|
||||
}
|
||||
}
|
||||
if (max_workspace_size_ <= 0) {
|
||||
LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_max_workspace_size must be a positive integer value. Set it to 1073741824 (1GB)";
|
||||
LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_max_workspace_size must be a positive integer value. Set it to 1073741824 (1GB)";
|
||||
max_workspace_size_ = 1 << 30;
|
||||
}
|
||||
if (dla_core_ < 0) {
|
||||
LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_dla_core must be a non-negative integer value. Set it to 0";
|
||||
LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_dla_core must be a non-negative integer value. Set it to 0";
|
||||
dla_core_ = 0;
|
||||
}
|
||||
|
||||
|
|
@ -534,11 +534,28 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
|
|||
if (engine_decryption_enable_) {
|
||||
LIBTYPE handle = OPENLIB(engine_decryption_lib_path_.c_str());
|
||||
if (handle == nullptr) {
|
||||
ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
|
||||
"TensorRT EP could not open shared library from " + engine_decryption_lib_path_);
|
||||
ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
|
||||
"TensorRT EP could not open shared library from " + engine_decryption_lib_path_));
|
||||
}
|
||||
engine_decryption_ = (int (*)(const char*, char*, size_t*))LIBFUNC(handle, "decrypt");
|
||||
}
|
||||
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] TensorRT provider options: "
|
||||
<< "device_id: " << device_id_
|
||||
<< ", trt_max_partition_iterations: " << max_partition_iterations_
|
||||
<< ", trt_min_subgraph_size: " << min_subgraph_size_
|
||||
<< ", trt_max_workspace_size: " << max_workspace_size_
|
||||
<< ", trt_fp16_enable: " << fp16_enable_
|
||||
<< ", trt_int8_enable: " << int8_enable_
|
||||
<< ", trt_int8_calibration_cache_name: " << int8_calibration_cache_name_
|
||||
<< ", trt_int8_use_native_tensorrt_calibration_table: " << int8_use_native_tensorrt_calibration_table_
|
||||
<< ", trt_dla_enable: " << dla_enable_
|
||||
<< ", trt_dla_core: " << dla_core_
|
||||
<< ", trt_dump_subgraphs: " << dump_subgraphs_
|
||||
<< ", trt_engine_cache_enable: " << engine_cache_enable_
|
||||
<< ", trt_cache_path: " << cache_path_
|
||||
<< ", trt_engine_decryption_enable: " << engine_decryption_enable_
|
||||
<< ", trt_engine_decryption_lib_path: " << engine_decryption_lib_path_
|
||||
<< ", trt_force_sequential_engine_build: " << force_sequential_engine_build_;
|
||||
}
|
||||
|
||||
TensorrtExecutionProvider::~TensorrtExecutionProvider() {
|
||||
|
|
@ -862,7 +879,7 @@ SubGraphCollection_t TensorrtExecutionProvider::GetSupportedList(SubGraphCollect
|
|||
model_proto->SerializeToString(string_buf);
|
||||
|
||||
if (dump_subgraphs_) {
|
||||
// Dump TensorRT subgraph for debugging if enabled via ORT_TENSORRT_DUMP_SUBGRAPHS env variable.
|
||||
// Dump TensorRT subgraph for debugging
|
||||
std::fstream dump("TensorrtExecutionProvider_TRT_Subgraph.onnx", std::ios::out | std::ios::trunc | std::ios::binary);
|
||||
model_proto->SerializeToOstream(dump);
|
||||
}
|
||||
|
|
@ -1098,7 +1115,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
|
|||
model_proto->SerializeToString(string_buf);
|
||||
|
||||
if (dump_subgraphs_) {
|
||||
// Dump the TensorRT subgraph if enabled via ORT_TENSORRT_DUMP_SUBGRAPHS env variable.
|
||||
// Dump TensorRT subgraphs
|
||||
std::fstream dump(fused_node->Name() + ".onnx", std::ios::out | std::ios::trunc | std::ios::binary);
|
||||
model_proto->SerializeToOstream(dump);
|
||||
}
|
||||
|
|
@ -1183,11 +1200,11 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
|
|||
|
||||
// Set DLA
|
||||
if (fp16_enable_ || int8_enable_) {
|
||||
if (dla_enable_ && dla_core_ >= 0) {//DLA can only run with FP16 and INT8
|
||||
if (dla_enable_ && dla_core_ >= 0) { //DLA can only run with FP16 and INT8
|
||||
int number_of_dla_core = trt_builder->getNbDLACores();
|
||||
if (number_of_dla_core == 0) {
|
||||
LOGS_DEFAULT(WARNING) << "[TensorRT EP] Try to use DLA core, but platform doesn't have any DLA core";
|
||||
dla_enable_ = false;
|
||||
dla_enable_ = false;
|
||||
} else {
|
||||
if (dla_core_ >= number_of_dla_core) {
|
||||
LOGS_DEFAULT(WARNING) << "[TensorRT EP] Try to use DLA core #" << dla_core_ << ", but it exceeds platform's maximum DLA core number " << number_of_dla_core << ". Use DLA core 0 instead.";
|
||||
|
|
@ -1198,7 +1215,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
|
|||
trt_config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);
|
||||
trt_config->setDLACore(dla_core_);
|
||||
trt_node_name_with_precision += "_dlacore" + std::to_string(dla_core_);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1318,7 +1335,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
|
|||
*p = {context->allocate_func, context->release_func, context->allocator_handle, &parsers_[context->node_name],
|
||||
&engines_[context->node_name], &contexts_[context->node_name], &builders_[context->node_name],
|
||||
&networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name],
|
||||
input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, dla_enable_,
|
||||
input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, dla_enable_,
|
||||
dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_, runtime_.get(), nullptr,
|
||||
allocator_, dynamic_range_map, engine_decryption_enable_, engine_decryption_};
|
||||
*state = p.release();
|
||||
|
|
@ -1579,10 +1596,10 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
|
|||
|
||||
// Set DLA (DLA can only run with FP16 or INT8)
|
||||
if ((trt_state->fp16_enable || trt_state->int8_enable) && trt_state->dla_enable) {
|
||||
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] use DLA core " << trt_state->dla_core;
|
||||
trt_config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
|
||||
trt_config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);
|
||||
trt_config->setDLACore(trt_state->dla_core);
|
||||
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] use DLA core " << trt_state->dla_core;
|
||||
trt_config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
|
||||
trt_config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);
|
||||
trt_config->setDLACore(trt_state->dla_core);
|
||||
}
|
||||
|
||||
// Build engine
|
||||
|
|
@ -1883,4 +1900,4 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
|
|||
}
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace onnxruntime
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -208,6 +208,7 @@ const OrtDevice::DeviceType OrtDevice::GPU;
|
|||
namespace onnxruntime {
|
||||
|
||||
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Tensorrt(const OrtTensorRTProviderOptions* params);
|
||||
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Tensorrt(int device_id);
|
||||
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_MIGraphX(int device_id);
|
||||
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Dnnl(int use_arena);
|
||||
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_OpenVINO(const OrtOpenVINOProviderOptions* params);
|
||||
|
|
@ -575,29 +576,28 @@ static void RegisterExecutionProviders(InferenceSession* sess, const std::vector
|
|||
sess->GetSessionOptions().enable_cpu_mem_arena));
|
||||
} else if (type == kTensorrtExecutionProvider) {
|
||||
#ifdef USE_TENSORRT
|
||||
OrtTensorRTProviderOptions params{
|
||||
0,
|
||||
0,
|
||||
nullptr,
|
||||
1000,
|
||||
1,
|
||||
1 << 30,
|
||||
0,
|
||||
0,
|
||||
nullptr,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
nullptr,
|
||||
0,
|
||||
nullptr,
|
||||
0};
|
||||
|
||||
std::string calibration_table, cache_path, lib_path;
|
||||
auto it = provider_options_map.find(type);
|
||||
if (it != provider_options_map.end()) {
|
||||
OrtTensorRTProviderOptions params{
|
||||
0,
|
||||
0,
|
||||
nullptr,
|
||||
1000,
|
||||
1,
|
||||
1 << 30,
|
||||
0,
|
||||
0,
|
||||
nullptr,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
nullptr,
|
||||
0,
|
||||
nullptr,
|
||||
0};
|
||||
for (auto option : it->second) {
|
||||
if (option.first == "device_id") {
|
||||
if (!option.second.empty()) {
|
||||
|
|
@ -718,8 +718,10 @@ static void RegisterExecutionProviders(InferenceSession* sess, const std::vector
|
|||
ORT_THROW("Invalid TensorRT EP option: ", option.first);
|
||||
}
|
||||
}
|
||||
RegisterExecutionProvider(sess, *onnxruntime::CreateExecutionProviderFactory_Tensorrt(¶ms));
|
||||
} else {
|
||||
RegisterExecutionProvider(sess, *onnxruntime::CreateExecutionProviderFactory_Tensorrt(cuda_device_id));
|
||||
}
|
||||
RegisterExecutionProvider(sess, *onnxruntime::CreateExecutionProviderFactory_Tensorrt(¶ms));
|
||||
#endif
|
||||
} else if (type == kMIGraphXExecutionProvider) {
|
||||
#ifdef USE_MIGRAPHX
|
||||
|
|
|
|||
|
|
@ -76,8 +76,6 @@ namespace perftest {
|
|||
"\t [TensorRT only] [trt_dump_subgraphs]: Dump TRT subgraph to onnx model.\n"
|
||||
"\t [TensorRT only] [trt_engine_cache_enable]: Enable engine caching.\n"
|
||||
"\t [TensorRT only] [trt_engine_cache_path]: Specify engine cache path.\n"
|
||||
"\t [TensorRT only] [trt_engine_decryption_enable]: (experimental feature) Enable engine decryption.\n"
|
||||
"\t [TensorRT only] [trt_engine_decryption_lib_path]: (experimental feature) Specify engine decryption library path.\n"
|
||||
"\t [TensorRT only] [trt_force_sequential_engine_build]: Force TensorRT engines to be built sequentially.\n"
|
||||
"\t [Usage]: -e <provider_name> -i '<key1>|<value1> <key2>|<value2>'\n\n"
|
||||
"\t [Example] [For TensorRT EP] -e tensorrt -i 'trt_fp16_enable|true trt_int8_enable|true trt_int8_calibration_table_name|calibration.flatbuffers trt_int8_use_native_calibration_table|false trt_force_sequential_engine_build|false'\n"
|
||||
|
|
|
|||
|
|
@ -72,53 +72,28 @@ class TestInferenceSession(unittest.TestCase):
|
|||
self.assertIn('trt_max_partition_iterations', option)
|
||||
self.assertIn('trt_min_subgraph_size', option)
|
||||
self.assertIn('trt_max_workspace_size', option)
|
||||
self.assertIn('trt_fp16_enable', option)
|
||||
self.assertIn('trt_int8_enable', option)
|
||||
self.assertIn('trt_int8_calibration_table_name', option)
|
||||
self.assertIn('trt_int8_use_native_calibration_table', option)
|
||||
self.assertIn('trt_dla_enable', option)
|
||||
self.assertIn('trt_dla_core', option)
|
||||
self.assertIn('trt_dump_subgraphs', option)
|
||||
self.assertIn('trt_engine_cache_enable', option)
|
||||
self.assertIn('trt_engine_cache_path', option)
|
||||
self.assertIn('trt_engine_decryption_enable', option)
|
||||
self.assertIn('trt_engine_decryption_lib_path', option)
|
||||
self.assertIn('trt_force_sequential_engine_build', option)
|
||||
|
||||
max_partition_iterations = option['trt_max_partition_iterations']
|
||||
new_max_partition_iterations = int(max_partition_iterations) + 1
|
||||
min_subgraph_size = option['trt_min_subgraph_size']
|
||||
new_min_subgraph_size = int(max_partition_iterations) + 1
|
||||
new_min_subgraph_size = int(min_subgraph_size) + 1
|
||||
ori_max_workspace_size = option['trt_max_workspace_size']
|
||||
new_max_workspace_size = int(ori_max_workspace_size) // 2
|
||||
dla_core = option['trt_dla_core']
|
||||
new_dla_core = int(dla_core) + 1
|
||||
|
||||
option = {}
|
||||
option['trt_max_partition_iterations'] = new_max_partition_iterations
|
||||
option['trt_min_subgraph_size'] = new_min_subgraph_size
|
||||
option['trt_max_workspace_size'] = new_max_workspace_size
|
||||
fp16_enable = "true"
|
||||
option['trt_fp16_enable'] = fp16_enable
|
||||
int8_enable = "false"
|
||||
option['trt_int8_enable'] = int8_enable
|
||||
calib_table_name = '/home/onnxruntime/table.flatbuffers'
|
||||
option['trt_int8_calibration_table_name'] = calib_table_name
|
||||
int8_use_native_calibration_table = "true"
|
||||
option['trt_int8_use_native_calibration_table'] = int8_use_native_calibration_table
|
||||
dla_enable = "true"
|
||||
option['trt_dla_enable'] = dla_enable
|
||||
option['trt_dla_core'] = new_dla_core
|
||||
dump_subgraphs = "true"
|
||||
option['trt_dump_subgraphs'] = dump_subgraphs
|
||||
engine_cache_enable = "true"
|
||||
option['trt_engine_cache_enable'] = engine_cache_enable
|
||||
engine_cache_path = '/home/onnxruntime/engine_cache'
|
||||
engine_cache_path = './engine_cache'
|
||||
option['trt_engine_cache_path'] = engine_cache_path
|
||||
engine_decryption_enable = "true"
|
||||
option['trt_engine_decryption_enable'] = engine_decryption_enable
|
||||
engine_decryption_lib_path = '/home/onnxruntime/decryption_lib'
|
||||
option['trt_engine_decryption_lib_path'] = engine_decryption_lib_path
|
||||
force_sequential_engine_build = "true"
|
||||
option['trt_force_sequential_engine_build'] = force_sequential_engine_build
|
||||
sess.set_providers(['TensorrtExecutionProvider'], [option])
|
||||
|
|
@ -128,17 +103,9 @@ class TestInferenceSession(unittest.TestCase):
|
|||
self.assertEqual(option['trt_max_partition_iterations'], str(new_max_partition_iterations))
|
||||
self.assertEqual(option['trt_min_subgraph_size'], str(new_min_subgraph_size))
|
||||
self.assertEqual(option['trt_max_workspace_size'], str(new_max_workspace_size))
|
||||
self.assertEqual(option['trt_int8_calibration_table_name'], str(calib_table_name))
|
||||
self.assertEqual(option['trt_fp16_enable'], '1')
|
||||
self.assertEqual(option['trt_int8_enable'], '0')
|
||||
self.assertEqual(option['trt_int8_use_native_calibration_table'], '1')
|
||||
self.assertEqual(option['trt_dla_enable'], '1')
|
||||
self.assertEqual(option['trt_dla_core'], str(new_dla_core))
|
||||
self.assertEqual(option['trt_dump_subgraphs'], '1')
|
||||
self.assertEqual(option['trt_engine_cache_enable'], '1')
|
||||
self.assertEqual(option['trt_engine_cache_path'], str(engine_cache_path))
|
||||
self.assertEqual(option['trt_engine_decryption_enable'], '1')
|
||||
self.assertEqual(option['engine_decryption_lib_path'], str(engine_decryption_lib_path))
|
||||
self.assertEqual(option['trt_force_sequential_engine_build'], '1')
|
||||
|
||||
# We currently disable following test code since that not all test machines/GPUs have nvidia int8 capability
|
||||
|
|
|
|||
Loading…
Reference in a new issue