Add DLA support to TensorRT EP (#7532)

* Add DLA to TensorRT EP, enable device_id options in pybind, fix cycle detection issue

* fix format

* remove unnecessary passing by pointer

* fix issue
This commit is contained in:
stevenlix 2021-05-07 10:31:42 -07:00 committed by GitHub
parent 9fc4116d51
commit 8ab0deceed
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 67 additions and 17 deletions

View file

@ -499,6 +499,20 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
}
engine_decryption_ = (int (*)(const char*, char*, size_t*))LIBFUNC(handle, "decrypt");
}
// Read the DLA settings from the environment. The DLA variables are only
// consulted when FP16 or INT8 precision has been enabled; otherwise
// dla_enable_ and dla_core_ are left untouched by this block.
if (fp16_enable_ || int8_enable_) { // DLA can only be enabled with FP16 or INT8
// kDLAEnable: an empty/unset value leaves dla_enable_ unchanged; otherwise
// the value is parsed as an integer and any non-zero value enables DLA.
// NOTE(review): std::stoi throws std::invalid_argument / std::out_of_range
// on a non-numeric or out-of-range value; no handling is visible here.
const std::string dla_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDLAEnable);
if (!dla_enable_env.empty()) {
dla_enable_ = (std::stoi(dla_enable_env) == 0 ? false : true);
}
// The core index is only read once DLA is enabled. The parsed value is
// stored as-is; no range check is performed in this block.
if (dla_enable_) {
const std::string dla_core_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDLACore);
if (!dla_core_env.empty()) {
dla_core_ = std::stoi(dla_core_env);
}
}
}
}
TensorrtExecutionProvider::~TensorrtExecutionProvider() {
@ -948,13 +962,9 @@ void TensorrtExecutionProvider::RemoveTensorRTGraphCycles(SubGraphCollection_t&
for (int i = 0; i < static_cast<int>(cycles.size()); ++i) {
auto loc = index_to_node_map.find(cycles[i]);
if (loc != index_to_node_map.end() && loc->second.find("TRTKernel") != std::string::npos) {
std::size_t found = loc->second.rfind("_");
if (found != std::string::npos) {
int trt_node_index = std::stoi(loc->second.substr(found + 1));
supported_nodes_vector.erase(supported_nodes_vector.begin() + trt_node_index);
trt_cycle = true;
break;
}
supported_nodes_vector.erase(supported_nodes_vector.begin() + cycles[i]);
trt_cycle = true;
break;
}
}
}
@ -1145,6 +1155,27 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] INT8 mode is enabled";
}
// Set DLA: configure the builder config to target a DLA core when DLA was
// requested and a reduced precision (FP16 or INT8) is enabled.
if (fp16_enable_ || int8_enable_) {
// A negative dla_core_ fails this check, so DLA setup is skipped without a
// log message and dla_enable_ keeps its value.
if (dla_enable_ && dla_core_ >= 0) { // DLA can only run with FP16 or INT8
int number_of_dla_core = trt_builder->getNbDLACores();
if (number_of_dla_core == 0) {
// No DLA core reported by the builder: warn and turn DLA off on the provider.
LOGS_DEFAULT(WARNING) << "[TensorRT EP] Try to use DLA core, but platform doesn't have any DLA core";
dla_enable_ = false;
} else {
// Requested index is past the last available core: warn and use core 0.
if (dla_core_ >= number_of_dla_core) {
LOGS_DEFAULT(WARNING) << "[TensorRT EP] Try to use DLA core #" << dla_core_ << ", but it exceeds platform's maximum DLA core number " << number_of_dla_core << ". Use DLA core 0 instead.";
dla_core_ = 0;
}
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] use DLA core " << dla_core_;
// Per the TensorRT builder-flag documentation, kGPU_FALLBACK lets layers
// that cannot run on DLA fall back to the GPU.
trt_config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
trt_config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);
trt_config->setDLACore(dla_core_);
// The selected core becomes part of the node name.
// NOTE(review): presumably so engines built for different DLA cores get
// distinct cache names; confirm against the engine-cache code.
trt_node_name_with_precision += "_dlacore" + std::to_string(dla_core_);
}
}
}
// Build TRT engine here if the graph doesn't have dynamic shape input. Otherwise engine will
// be built at runtime
tensorrt_ptr::unique_pointer<nvinfer1::ICudaEngine> trt_engine;
@ -1261,8 +1292,8 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
*p = {context->allocate_func, context->release_func, context->allocator_handle, &parsers_[context->node_name],
&engines_[context->node_name], &contexts_[context->node_name], &builders_[context->node_name],
&networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name],
input_shape_ranges_[context->node_name], &tensorrt_mu_, &fp16_enable_, &int8_enable_, &max_workspace_size_,
trt_node_name_with_precision, engine_cache_enable_, cache_path_, runtime_.get(), nullptr,
input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, dla_enable_,
dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_, runtime_.get(), nullptr,
allocator_, dynamic_range_map, engine_decryption_enable_, engine_decryption_};
*state = p.release();
return 0;
@ -1504,7 +1535,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
trt_config->addOptimizationProfile(*trt_profile);
// Set INT8 Per Tensor Dynamic range
if (*(trt_state->int8_enable_ptr) && trt_builder->platformHasFastInt8()) {
if (trt_state->int8_enable && trt_builder->platformHasFastInt8()) {
trt_config->setInt8Calibrator(nullptr);
if (!SetDynamicRange(*trt_state->network->get(), trt_state->dynamic_range_map)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to set INT8 dynamic range.");
@ -1512,14 +1543,22 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
}
// Set precision
if (*(trt_state->fp16_enable_ptr) && *(trt_state->int8_enable_ptr)) {
if (trt_state->fp16_enable && trt_state->int8_enable) {
trt_config->setFlags(1U << static_cast<uint32_t>(nvinfer1::BuilderFlag::kFP16) | 1U << static_cast<uint32_t>(nvinfer1::BuilderFlag::kINT8));
} else if (*(trt_state->fp16_enable_ptr)) {
} else if (trt_state->fp16_enable) {
trt_config->setFlag(nvinfer1::BuilderFlag::kFP16);
} else if (*(trt_state->int8_enable_ptr)) {
} else if (trt_state->int8_enable) {
trt_config->setFlag(nvinfer1::BuilderFlag::kINT8);
}
// Set DLA (DLA can only run with FP16 or INT8)
// Uses the values stored in trt_state rather than the provider members.
// NOTE(review): this span applies trt_state->dla_core directly; no
// getNbDLACores() availability or range check is visible here. Confirm the
// stored values are already validated before they reach this point.
if ((trt_state->fp16_enable || trt_state->int8_enable) && trt_state->dla_enable) {
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] use DLA core " << trt_state->dla_core;
// Per the TensorRT builder-flag documentation, kGPU_FALLBACK lets layers
// that cannot run on DLA fall back to the GPU.
trt_config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
trt_config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);
trt_config->setDLACore(trt_state->dla_core);
}
// Build engine
{
auto lock = GetEngineBuildLock();

View file

@ -25,6 +25,8 @@ static const std::string kForceSequentialEngineBuild= "ORT_TENSORRT_FORCE_SEQUEN
static const std::string kEngineCachePath = "ORT_TENSORRT_ENGINE_CACHE_PATH";
static const std::string kDecryptionEnable = "ORT_TENSORRT_ENGINE_DECRYPTION_ENABLE";
static const std::string kDecryptionLibPath = "ORT_TENSORRT_ENGINE_DECRYPTION_LIB_PATH";
// Name of the environment variable that turns DLA on; the provider parses its
// value as an integer and treats any non-zero value as "enabled".
static const std::string kDLAEnable = "ORT_TENSORRT_DLA_ENABLE";
// Name of the environment variable holding the DLA core index to use; it is
// parsed as an integer.
static const std::string kDLACore = "ORT_TENSORRT_DLA_CORE";
} // namespace tensorrt_env_vars
class TensorrtLogger : public nvinfer1::ILogger {
@ -95,14 +97,15 @@ struct TensorrtFuncState {
std::vector<std::unordered_map<std::string, int>> output_info;
std::unordered_map<std::string, std::unordered_map<int, std::pair<int64_t, int64_t>>> input_shape_ranges;
OrtMutex* tensorrt_mu_ptr = nullptr;
bool* fp16_enable_ptr = nullptr;
bool* int8_enable_ptr = nullptr;
bool fp16_enable;
bool int8_enable;
bool dla_enable;
int dla_core;
size_t* max_workspace_size_ptr = nullptr;
std::string trt_node_name_with_precision;
bool engine_cache_enable;
std::string engine_cache_path;
nvinfer1::IRuntime* runtime = nullptr;
nvinfer1::IOptimizationProfile* trt_profile = nullptr;
AllocatorPtr scratch_allocator;
std::unordered_map<std::string, float> dynamic_range_map;
@ -146,6 +149,8 @@ class TensorrtExecutionProvider : public IExecutionProvider {
size_t max_workspace_size_ = 1 << 30; // 1GB
bool fp16_enable_ = false;
bool int8_enable_ = false;
bool dla_enable_ = false;
int dla_core_ = 0;
bool force_sequential_engine_build_ = false;
std::string int8_calibration_cache_name_ = "INT8_calibration_table";
bool int8_use_native_tensorrt_calibration_table_ = false;

View file

@ -578,7 +578,13 @@ static void RegisterExecutionProviders(InferenceSession* sess, const std::vector
auto it = provider_options_map.find(type);
if (it != provider_options_map.end()) {
for (auto option : it->second) {
if (option.first == "has_trt_options") {
if (option.first == "device_id") {
if (!option.second.empty()) {
params.device_id = std::stoi(option.second);
} else {
ORT_THROW("[ERROR] [TensorRT] The value for the key 'device_id' should be a number i.e. '0'.\n");
}
} else if (option.first == "has_trt_options") {
if (option.second == "True" || option.second == "true") {
params.has_trt_options = true;
} else if (option.second == "False" || option.second == "false") {