From dfea92925c2cdb56e36d3f1d8a4f890e0e4b4d48 Mon Sep 17 00:00:00 2001 From: stevenlix <38092805+stevenlix@users.noreply.github.com> Date: Thu, 19 Nov 2020 17:10:49 -0800 Subject: [PATCH] Add calibration based INT8 quantization to TensorRT EP (#5842) * add int8 * support both native TRT cal table and ORT cal table * add more comments * Update env variable name and check platform availability for int8/fp16 --- .../TensorRT-ExecutionProvider.md | 38 ++- .../tensorrt/tensorrt_execution_provider.cc | 302 +++++++++++++++--- .../tensorrt/tensorrt_execution_provider.h | 19 +- 3 files changed, 297 insertions(+), 62 deletions(-) diff --git a/docs/execution_providers/TensorRT-ExecutionProvider.md b/docs/execution_providers/TensorRT-ExecutionProvider.md index e0e4e4a214..0b6a0f1cc8 100644 --- a/docs/execution_providers/TensorRT-ExecutionProvider.md +++ b/docs/execution_providers/TensorRT-ExecutionProvider.md @@ -53,28 +53,33 @@ When/if using [onnxruntime_perf_test](../../onnxruntime/test/perftest#onnxruntim ## Configuring environment variables There are several environment variables for TensorRT execution provider. -* ORT_TENSORRT_MAX_WORKSPACE_SIZE: maximum workspace size for TensorRT engine. +* ORT_TENSORRT_MAX_WORKSPACE_SIZE: maximum workspace size for TensorRT engine. Default value: 1073741824 (1GB). -* ORT_TENSORRT_MAX_PARTITION_ITERATIONS: maximum number of iterations allowed in model partitioning for TensorRT. If target model can't be successfully partitioned when the maximum number of iterations is reached, the whole model will fall back to other execution providers such as CUDA or CPU. +* ORT_TENSORRT_MAX_PARTITION_ITERATIONS: maximum number of iterations allowed in model partitioning for TensorRT. If target model can't be successfully partitioned when the maximum number of iterations is reached, the whole model will fall back to other execution providers such as CUDA or CPU. Default value: 1000. 
-* ORT_TENSORRT_MIN_SUBGRAPH_SIZE: minimum node size in a subgraph after partitioning. Subgraphs with smaller size will fall back to other execution providers. +* ORT_TENSORRT_MIN_SUBGRAPH_SIZE: minimum node size in a subgraph after partitioning. Subgraphs with smaller size will fall back to other execution providers. Default value: 1. -* ORT_TENSORRT_FP16_ENABLE: Enable FP16 mode in TensorRT +* ORT_TENSORRT_FP16_ENABLE: Enable FP16 mode in TensorRT. 1: enabled, 0: disabled. Default value: 0. -* ORT_TENSORRT_ENGINE_CACHE_ENABLE: Enable TensorRT engine caching. The purpose of using engine caching is to save engine build time in the cases that TensorRT may take long time to optimize and build engine. Engine will be cached after it's built at the first time so that next time when inference session is created the engine can be loaded directly from cache. In order to validate that the loaded engine is usable for current inference, engine profile is also cached and loaded along with engine. If current input shapes are in the range of the engine profile, that means the loaded engine can be safely used. Otherwise if input shapes are out of range, profile cache will be updated to cover the new shape and engine will be recreated based on the new profile (and also refreshed in the engine cache). Note each engine is created for specific settings such as precision (FP32/FP16/INT8 etc), workspace, profiles etc, and specific GPUs and it's not portable, so it's essential to make sure those settings are not changing, otherwise the engines need to be rebuilt and cached again. +* ORT_TENSORRT_INT8_ENABLE: Enable INT8 mode in TensorRT. 1: enabled, 0: disabled. Default value: 0. + +* ORT_TENSORRT_INT8_CALIBRATION_TABLE_NAME: Specify INT8 calibration table file name. By default the name is "INT8_calibration_table". + +* ORT_TENSORRT_INT8_USE_NATIVE_CALIBRATION_TABLE: Select what calibration table is used. 
If 1, native TensorRT generated calibration table is used; if 0, ONNXRUNTIME tool generated calibration table is used. Default value: 0. +**Note: Please copy the up-to-date calibration table file to ORT_TENSORRT_CACHE_PATH before inference. A calibration table is specific to a model and calibration data set. Whenever a new calibration table is generated, the old file in the path should be cleaned up or replaced.** + +* ORT_TENSORRT_ENGINE_CACHE_ENABLE: Enable TensorRT engine caching. The purpose of using engine caching is to save engine build time in the cases that TensorRT may take long time to optimize and build engine. Engine will be cached after it's built at the first time so that next time when inference session is created the engine can be loaded directly from cache. In order to validate that the loaded engine is usable for current inference, engine profile is also cached and loaded along with engine. If current input shapes are in the range of the engine profile, that means the loaded engine can be safely used. Otherwise if input shapes are out of range, profile cache will be updated to cover the new shape and engine will be recreated based on the new profile (and also refreshed in the engine cache). Note each engine is created for specific settings such as precision (FP32/FP16/INT8 etc), workspace, profiles etc, and specific GPUs and it's not portable, so it's essential to make sure those settings are not changing, otherwise the engines need to be rebuilt and cached again. 1: enabled, 0: disabled. Default value: 0. **Warning: Please clean up any old engine and profile cache files (.engine and .profile) if any of the following changes:** - Model changes (if there are any changes to the model topology, opset version etc.) - ORT version changes (i.e. moving from ORT version 1.4 to 1.5) - TensorRT version changes (i.e. moving from TensorRT 7.0 to 7.1) - Hardware changes. 
(Engine and profile files are not portable and optimized for specific Nvidia hardware) -* ORT_TENSORRT_ENGINE_CACHE_PATH: Specify path for TensorRT engine files if ORT_TENSORRT_ENGINE_CACHE_ENABLE is 1 +* ORT_TENSORRT_CACHE_PATH: Specify path for TensorRT engine and profile files if ORT_TENSORRT_ENGINE_CACHE_ENABLE is 1, or path for INT8 calibration table file if ORT_TENSORRT_INT8_ENABLE is 1. -* ORT_TENSORRT_DUMP_SUBGRAPHS: Dumps the subgraphs that are transformed into TRT engines in onnx format to the filesystem. This can help debugging subgraphs, e.g. by using `trtexec --onnx my_model.onnx` and check the outputs of the parser. +* ORT_TENSORRT_DUMP_SUBGRAPHS: Dumps the subgraphs that are transformed into TRT engines in onnx format to the filesystem. This can help debugging subgraphs, e.g. by using `trtexec --onnx my_model.onnx` and check the outputs of the parser. 1: enabled, 0: disabled. Default value: 0. -By default TensorRT execution provider builds an ICudaEngine with max workspace size = 1 GB, max partition iterations = 1000, min subgraph size = 1, FP16 mode is disabled and TensorRT engine caching is disabled. - -One can override these defaults by setting environment variables ORT_TENSORRT_MAX_WORKSPACE_SIZE, ORT_TENSORRT_MAX_PARTITION_ITERATIONS, ORT_TENSORRT_MIN_SUBGRAPH_SIZE, ORT_TENSORRT_FP16_ENABLE, ORT_TENSORRT_ENGINE_CACHE_ENABLE and ORT_TENSORRT_ENGINE_CACHE_PATH. +One can override default values by setting environment variables ORT_TENSORRT_MAX_WORKSPACE_SIZE, ORT_TENSORRT_MAX_PARTITION_ITERATIONS, ORT_TENSORRT_MIN_SUBGRAPH_SIZE, ORT_TENSORRT_FP16_ENABLE, ORT_TENSORRT_INT8_ENABLE, ORT_TENSORRT_INT8_CALIBRATION_TABLE_NAME, ORT_TENSORRT_INT8_USE_NATIVE_CALIBRATION_TABLE, ORT_TENSORRT_ENGINE_CACHE_ENABLE, ORT_TENSORRT_CACHE_PATH and ORT_TENSORRT_DUMP_SUBGRAPHS. e.g. 
on Linux ### override default max workspace size to 2GB @@ -89,10 +94,19 @@ export ORT_TENSORRT_MIN_SUBGRAPH_SIZE=5 ### Enable FP16 mode in TensorRT export ORT_TENSORRT_FP16_ENABLE=1 +### Enable INT8 mode in TensorRT +export ORT_TENSORRT_INT8_ENABLE=1 + +### Use native TensorRT calibration table +export ORT_TENSORRT_INT8_USE_NATIVE_CALIBRATION_TABLE=1 + ### Enable TensorRT engine caching export ORT_TENSORRT_ENGINE_CACHE_ENABLE=1 * Please Note warning above. This feature is experimental. Engine cache files must be invalidated if there are any changes to the model, ORT version, TensorRT version or if the underlying hardware changes. Engine files are not portable across devices. -### Specify TensorRT engine cache path -export ORT_TENSORRT_ENGINE_CACHE_PATH="/path/to/cache" +### Specify TensorRT cache path +export ORT_TENSORRT_CACHE_PATH="/path/to/cache" + +### Dump out subgraphs to run on TensorRT +export ORT_TENSORRT_DUMP_SUBGRAPHS=1 \ No newline at end of file diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index be0ddf009f..941586b4be 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -8,7 +8,6 @@ #define ORT_API_MANUAL_INIT #include "core/session/onnxruntime_cxx_api.h" #include "core/common/safeint.h" - #include "tensorrt_execution_provider.h" #include "core/providers/cuda/shared_inc/cuda_call.h" #include "core/providers/cuda/math/unary_elementwise_ops_impl.h" @@ -49,9 +48,20 @@ std::string GetVecHash(const std::string& vec) { return std::to_string(ret); } +float ConvertSinglePrecisionIEEE754ToFloat(unsigned long input) { + int s = (input >> 31) & 0x01; + int e = ((input & 0x7f800000) >> 23) - 127; + int p = -1; + double m = 0.0; + for (int i = 0; i < 23; ++i) { + m += ((input >> (23 - i - 1)) & 0x01) * pow(2.0, p--); + } + return (s ? 
-1 : 1) * pow(2.0, e) * (m + 1.0); +} + /* * Seralize engine profile -* The profile contains min/max shape ranges of every dynamic shape dimension for each input tensor +* The profile contains min/max shape ranges of dynamic shape dimensions of each input tensor * For example, assume tensor_a has two dynamic shape dimensions: dim_0 and dim_2, and tensor_b * has one dynamic shape dimension: dim_1. The data in profile will be, * key: tensor_a, value: dim_0 min_shape max_shape dim_2 min_shape max_shape @@ -107,6 +117,148 @@ std::unordered_map* adjacency_map, bool visited[], bool* st, std::vector& cycles) { + if (!visited[i]) { + visited[i] = true; + st[i] = true; + for (auto iter = adjacency_map[i].begin(); iter != adjacency_map[i].end(); ++iter) { + if (!visited[*iter] && FindCycleHelper(*iter, adjacency_map, visited, st, cycles)) { + cycles.push_back(*iter); + return true; + } else if (st[*iter]) { + cycles.push_back(*iter); + return true; + } + } + } + st[i] = false; + return false; +} + +/* +* Read calibration table for INT8 quantization +* Two kind of calibration tables are supported, +* 1. ORT generated calibration table +* The table is pre-serialized by flexbuffers. +* Each entry in the table is a key-value pair, +* key: tensor name, value: maximum absolute value in floating point +* For example, +* data_0 2.008338 +* ... +* 2. Native TensorRT generated calibration table +* Data format is defined by TensorRT as, +* tensor name : scale in 32-bit single precision IEEE754 format +* For example, +* TRT-7103-EntropyCalibration2 +* data_0: 4000889d +* ... 
+*/ +bool ReadDynamicRange(const std::string file_name, const bool is_trt_calibration_table, std::unordered_map& dynamic_range_map) { + std::ifstream infile(file_name, std::ios::binary | std::ios::in); + if (!infile) { + return false; + } + + if (is_trt_calibration_table) { + // Native TensorRT generated calibration table + std::string line; + char delim = ':'; + if (std::getline(infile, line)) { + std::istringstream first_line(line); + std::string version; + std::getline(first_line, version, delim); + std::size_t found = version.find("TRT-"); + if (found != std::string::npos) { + while (std::getline(infile, line)) { + std::istringstream in_line(line); + std::string str; + std::getline(in_line, str, delim); + std::string tensor_name = str; + std::getline(in_line, str, delim); + unsigned long scale_int = std::strtoul(str.c_str(), nullptr, 16); + float scale_float = ConvertSinglePrecisionIEEE754ToFloat(scale_int); + float dynamic_range = scale_float * 127.0; + dynamic_range_map[tensor_name] = dynamic_range; + } + } else { + throw std::runtime_error("This is not a TensorRT generated calibration table " + file_name); + } + } + } else { + // ORT generated calibration table + infile.seekg(0, std::ios::end); + int length = infile.tellg(); + infile.seekg(0, std::ios::beg); + std::unique_ptr data{new char[length]}; + infile.read((char*)data.get(), length); + infile.close(); + auto dynamic_range_entries = flexbuffers::GetRoot((const uint8_t*)data.get(), length).AsMap(); + auto keys = dynamic_range_entries.Keys(); + auto values = dynamic_range_entries.Values(); + for (size_t i = 0, end = keys.size(); i < end; ++i) { + dynamic_range_map[keys[i].AsString().c_str()] = values[i].AsFloat(); + } + } + return true; +} + +bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_map& dynamic_range_map) { + // Set dynamic range for input tensors + for (int i = 0; i < network.getNbInputs(); ++i) { + const std::string tensor_name = network.getInput(i)->getName(); + auto 
dynamic_range_iter = dynamic_range_map.find(tensor_name); + if (dynamic_range_iter != dynamic_range_map.end()) { + if (!network.getInput(i)->setDynamicRange(-dynamic_range_iter->second, dynamic_range_iter->second)) { + return false; + } + } + } + + // Set dynamic range for activations and weights + for (int i = 0; i < network.getNbLayers(); ++i) { + auto trt_layer = network.getLayer(i); + for (int j = 0, e = trt_layer->getNbOutputs(); j < e; ++j) { + const std::string tensor_name = trt_layer->getOutput(j)->getName(); + auto dynamic_range_iter = dynamic_range_map.find(tensor_name); + if (dynamic_range_iter != dynamic_range_map.end()) { + if (!trt_layer->getOutput(j)->setDynamicRange(-dynamic_range_iter->second, dynamic_range_iter->second)) { + return false; + } + } else if (trt_layer->getType() == nvinfer1::LayerType::kCONSTANT) { + nvinfer1::IConstantLayer* const_layer = static_cast(trt_layer); + auto trt_weights = const_layer->getWeights(); + double max_weight = std::numeric_limits::min(); + for (int64_t k = 0, end = trt_weights.count; k < end; ++k) { + double weight{}; + switch (trt_weights.type) { + case nvinfer1::DataType::kFLOAT: + weight = static_cast(trt_weights.values)[k]; + break; + case nvinfer1::DataType::kBOOL: + weight = static_cast(trt_weights.values)[k]; + break; + case nvinfer1::DataType::kINT8: + weight = static_cast(trt_weights.values)[k]; + break; + case nvinfer1::DataType::kHALF: + weight = static_cast(trt_weights.values)[k]; + break; + case nvinfer1::DataType::kINT32: + weight = static_cast(trt_weights.values)[k]; + break; + } + max_weight = std::max(max_weight, std::abs(weight)); + } + if (!trt_layer->getOutput(j)->setDynamicRange(-max_weight, max_weight)) { + return false; + } + } + } + } + return true; +} } // namespace namespace google { @@ -263,6 +415,23 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv fp16_enable_ = (std::stoi(fp16_enable_env) == 0 ? 
false : true); } + const std::string int8_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kINT8Enable); + if (!int8_enable_env.empty()) { + int8_enable_ = (std::stoi(int8_enable_env) == 0 ? false : true); + } + + if (int8_enable_) { + const std::string int8_calibration_cache_name_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kINT8CalibrationTableName); + if (!int8_calibration_cache_name_env.empty()) { + int8_calibration_cache_name_ = int8_calibration_cache_name_env; + } + + const std::string int8_use_native_tensorrt_calibration_table_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kINT8UseNativeTensorrtCalibrationTable); + if (!int8_use_native_tensorrt_calibration_table_env.empty()) { + int8_use_native_tensorrt_calibration_table_ = (std::stoi(int8_use_native_tensorrt_calibration_table_env) == 0 ? false : true); + } + } + const std::string dump_subgraphs_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDumpSubgraphs); if (!dump_subgraphs_env.empty()) { dump_subgraphs_ = (std::stoi(dump_subgraphs_env) == 0 ? false : true); @@ -273,11 +442,11 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv engine_cache_enable_ = (std::stoi(engine_cache_enable_env) == 0 ? 
false : true); } - if (engine_cache_enable_) { - engine_cache_path_ = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kEngineCachePath); - if (!engine_cache_path_.empty() && !fs::is_directory(engine_cache_path_)) { - if (!fs::create_directory(engine_cache_path_)) { - throw std::runtime_error("Failed to create directory " + engine_cache_path_); + if (engine_cache_enable_ || int8_enable_) { + cache_path_ = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kCachePath); + if (!cache_path_.empty() && !fs::is_directory(cache_path_)) { + if (!fs::create_directory(cache_path_)) { + throw std::runtime_error("Failed to create directory " + cache_path_); } } runtime_ = nvinfer1::createInferRuntime(GetTensorrtLogger()); @@ -326,25 +495,6 @@ void ToGraphProtoInternal(const onnxruntime::Provider_GraphViewer& graph, Provid } } -// Check if cycle exists in the graph after partitioning -bool FindCycleHelper(int i, const std::list* adjacency_map, bool visited[], bool* st, std::vector& cycles) { - if (!visited[i]) { - visited[i] = true; - st[i] = true; - for (auto iter = adjacency_map[i].begin(); iter != adjacency_map[i].end(); ++iter) { - if (!visited[*iter] && FindCycleHelper(*iter, adjacency_map, visited, st, cycles)) { - cycles.push_back(*iter); - return true; - } else if (st[*iter]) { - cycles.push_back(*iter); - return true; - } - } - } - st[i] = false; - return false; -} - std::unique_ptr TensorrtExecutionProvider::GetSubGraph(SubGraph_t graph_nodes_index, int& kernels_index, const onnxruntime::Provider_GraphViewer& graph) const { const std::vector& node_index = graph.GetNodesInTopologicalOrder(); std::unordered_set node_set; @@ -723,7 +873,8 @@ std::vector> TensorrtExecutionProvider::Provider_GetCapability(const onnxruntime::Provider_GraphViewer& graph, const std::vector& /*kernel_registries*/) const { // Get supported node list from TensorRT parser - std::vector nodes_vector(graph.NumberOfNodes()); + const int number_of_ort_nodes = graph.NumberOfNodes(); + std::vector 
nodes_vector(number_of_ort_nodes); std::iota(std::begin(nodes_vector), std::end(nodes_vector), 0); SubGraphCollection_t supported_nodes_vector, parser_nodes_vector = {{nodes_vector, false}}; bool early_termination = false; @@ -755,10 +906,12 @@ TensorrtExecutionProvider::Provider_GetCapability(const onnxruntime::Provider_Gr } const int number_of_subgraphs = supported_nodes_vector.size(); - if (number_of_subgraphs == 0) { - LOGS_DEFAULT(WARNING) << "No graph is running on TensorRT exeuction provider."; + if (number_of_trt_nodes == 0) { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] No graph will run on TensorRT exeuction provider"; + } else if (number_of_trt_nodes == number_of_ort_nodes) { + LOGS_DEFAULT(INFO) << "[TensorRT EP] Whole graph will run on TensorRT exeuction provider"; } else { - LOGS_DEFAULT(INFO) << "Number of subgraphs running on TensorRT exeuction provider: " << number_of_subgraphs; + LOGS_DEFAULT(INFO) << "[TensorRT EP] Graph is partitioned and number of subgraphs running on TensorRT exeuction provider is " << number_of_subgraphs; } return result; @@ -804,7 +957,6 @@ common::Status TensorrtExecutionProvider::Provider_Compile(const std::vectorSerializeToOstream(dump); } - // Create TensorRT engine TensorrtLogger& trt_logger = GetTensorrtLogger(); auto trt_builder = tensorrt_ptr::unique_pointer(nvinfer1::createInferBuilder(trt_logger)); const auto explicitBatch = 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); @@ -843,21 +995,54 @@ common::Status TensorrtExecutionProvider::Provider_Compile(const std::vectorplatformHasFastFp16()) { + fp16_enable_ = false; + LOGS_DEFAULT(WARNING) << "[TensorRT EP] ORT_TENSORRT_FP16_ENABLE is set, but platform doesn't support fast native fp16"; + } + } + + if (int8_enable_) { + if (!trt_builder->platformHasFastInt8()) { + int8_enable_ = false; + LOGS_DEFAULT(WARNING) << "[TensorRT EP] ORT_TENSORRT_INT8_ENABLE is set, but platform doesn't support fast native int8"; + } + } + + // Load INT8 
calibration table + std::unordered_map dynamic_range_map; + if (int8_enable_) { + const std::string calibration_cache_path = GetCachePath(cache_path_, int8_calibration_cache_name_); + if (!ReadDynamicRange(calibration_cache_path, int8_use_native_tensorrt_calibration_table_, dynamic_range_map)) { + throw std::runtime_error("Failed to read INT8 calibration table " + calibration_cache_path); + } + } + + // Set precision flags std::string trt_node_name_with_precision = fused_node->Name(); - if (fp16_enable_ && trt_builder->platformHasFastFp16()) { + if (fp16_enable_ && int8_enable_) { + trt_config->setFlags(1U << static_cast(nvinfer1::BuilderFlag::kFP16) | 1U << static_cast(nvinfer1::BuilderFlag::kINT8)); + trt_node_name_with_precision += "_fp16_int8"; + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] FP16 and INT8 mode is enabled"; + } else if (fp16_enable_) { trt_config->setFlag(nvinfer1::BuilderFlag::kFP16); trt_node_name_with_precision += "_fp16"; - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] FP16 mode is enabled."; + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] FP16 mode is enabled"; + } else if (int8_enable_) { + trt_config->setFlag(nvinfer1::BuilderFlag::kINT8); + trt_node_name_with_precision += "_int8"; + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] INT8 mode is enabled"; } int num_nodes = graph_body_viewer->NumberOfNodes(); trt_node_name_with_precision += "_" + GetVecHash(trt_node_name_with_precision + std::to_string(num_nodes)); - + // Build TRT engine here if the graph doesn't have dynamic shape input. 
Otherwise engine will // be built at runtime tensorrt_ptr::unique_pointer trt_engine; tensorrt_ptr::unique_pointer trt_context; if (!has_dynamic_shape) { - const std::string cache_path = GetCachePath(engine_cache_path_, trt_node_name_with_precision); + const std::string cache_path = GetCachePath(cache_path_, trt_node_name_with_precision); const std::string engine_cache_path = cache_path + ".engine"; std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in); if (engine_cache_enable_ && engine_file) { @@ -869,6 +1054,16 @@ common::Status TensorrtExecutionProvider::Provider_Compile(const std::vector(runtime_->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr)); LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path; } else { + // Set INT8 per tensor dynamic range + if (int8_enable_ && trt_builder->platformHasFastInt8()) { + trt_config->setInt8Calibrator(nullptr); + if (!SetDynamicRange(*trt_network, dynamic_range_map)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not set INT8 dynamic range for fused node: " + fused_node->Name()); + } + } + + // Build engine trt_engine = tensorrt_ptr::unique_pointer(trt_builder->buildEngineWithConfig(*trt_network, *trt_config)); if (trt_engine == nullptr) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, @@ -882,6 +1077,8 @@ common::Status TensorrtExecutionProvider::Provider_Compile(const std::vector(trt_engine->createExecutionContext()); if (trt_context == nullptr) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, @@ -899,7 +1096,7 @@ common::Status TensorrtExecutionProvider::Provider_Compile(const std::vectorgraph().output(); for (int i = 0; i < num_outputs; ++i) { const std::string& output_name = trt_network->getOutput(i)->getName(); @@ -930,9 +1127,9 @@ common::Status TensorrtExecutionProvider::Provider_Compile(const std::vectorallocate_func, context->release_func, context->allocator_handle, &parsers_[context->node_name], &engines_[context->node_name], 
&contexts_[context->node_name], &builders_[context->node_name], &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name], - input_shape_ranges_[context->node_name], &tensorrt_mu_, &fp16_enable_, &max_workspace_size_, - trt_node_name_with_precision, engine_cache_enable_, engine_cache_path_, runtime_, - allocator_}; + input_shape_ranges_[context->node_name], &tensorrt_mu_, &fp16_enable_, &int8_enable_, &max_workspace_size_, + trt_node_name_with_precision, engine_cache_enable_, cache_path_, runtime_, + allocator_, dynamic_range_map}; *state = p.release(); return 0; }; @@ -967,7 +1164,7 @@ common::Status TensorrtExecutionProvider::Provider_Compile(const std::vectorengine_cache_path, trt_state->trt_node_name_with_precision); const std::string engine_cache_path = cache_path + ".engine"; const std::string profile_cache_path = cache_path + ".profile"; - std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in); + std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in); std::ifstream profile_file(profile_cache_path, std::ios::binary | std::ios::in); if (engine_file && profile_file && (trt_state->engine_cache_enable && trt_engine == nullptr)) { // Deserialize profile @@ -1011,7 +1208,7 @@ common::Status TensorrtExecutionProvider::Provider_Compile(const std::vectorsecond; } - + const OrtValue* input_tensor = ort.KernelContext_GetInput(context, input_index); auto tensor_info = ort.GetTensorTypeAndShape(input_tensor); const auto& tensor_shapes = ort.GetTensorShape(tensor_info); @@ -1092,7 +1289,7 @@ common::Status TensorrtExecutionProvider::Provider_Compile(const std::vectorsetShapeValues(input_name.c_str(), nvinfer1::OptProfileSelector::kMIN, &shapes_min[0], shape_size); trt_profile->setShapeValues(input_name.c_str(), nvinfer1::OptProfileSelector::kOPT, &shapes_opt[0], shape_size); trt_profile->setShapeValues(input_name.c_str(), nvinfer1::OptProfileSelector::kMAX, &shapes_max[0], shape_size); - 
} else { // execution tensor + } else { // Execution tensor nvinfer1::Dims dims_min(dims), dims_opt(dims), dims_max(dims); for (int j = 0, end = nb_dims; j < end; ++j) { const auto& tensor_shape = tensor_shapes[j]; @@ -1140,10 +1337,25 @@ common::Status TensorrtExecutionProvider::Provider_Compile(const std::vector(trt_builder->createBuilderConfig()); trt_config->setMaxWorkspaceSize(*(trt_state->max_workspace_size_ptr)); trt_config->addOptimizationProfile(trt_profile); - if (*(trt_state->fp16_enable_ptr) && trt_builder->platformHasFastFp16()) { - trt_config->setFlag(nvinfer1::BuilderFlag::kFP16); + + // Set INT8 Per Tensor Dynamic range + if (*(trt_state->int8_enable_ptr) && trt_builder->platformHasFastInt8()) { + trt_config->setInt8Calibrator(nullptr); + if (!SetDynamicRange(*trt_state->network->get(), trt_state->dynamic_range_map)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to set INT8 dynamic range."); + } } + // Set precision + if (*(trt_state->fp16_enable_ptr) && *(trt_state->int8_enable_ptr)) { + trt_config->setFlags(1U << static_cast(nvinfer1::BuilderFlag::kFP16) | 1U << static_cast(nvinfer1::BuilderFlag::kINT8)); + } else if (*(trt_state->fp16_enable_ptr)) { + trt_config->setFlag(nvinfer1::BuilderFlag::kFP16); + } else if (*(trt_state->int8_enable_ptr)) { + trt_config->setFlag(nvinfer1::BuilderFlag::kINT8); + } + + // Build engine *(trt_state->engine) = tensorrt_ptr::unique_pointer( trt_builder->buildEngineWithConfig(*trt_state->network->get(), *trt_config)); if (trt_state->engine->get() == nullptr) { @@ -1154,6 +1366,7 @@ common::Status TensorrtExecutionProvider::Provider_Compile(const std::vectorserialize(); std::ofstream file(engine_cache_path, std::ios::binary | std::ios::out); @@ -1162,6 +1375,7 @@ common::Status TensorrtExecutionProvider::Provider_Compile(const std::vectorcontext) = tensorrt_ptr::unique_pointer( trt_state->engine->get()->createExecutionContext()); if (trt_state->context->get() == nullptr) { diff --git 
a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index ea771c7c5f..77a9ae0df3 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -14,9 +14,12 @@ static const std::string kMaxPartitionIterations = "ORT_TENSORRT_MAX_PARTITION_I static const std::string kMinSubgraphSize = "ORT_TENSORRT_MIN_SUBGRAPH_SIZE"; static const std::string kMaxWorkspaceSize = "ORT_TENSORRT_MAX_WORKSPACE_SIZE"; static const std::string kFP16Enable = "ORT_TENSORRT_FP16_ENABLE"; +static const std::string kINT8Enable = "ORT_TENSORRT_INT8_ENABLE"; +static const std::string kINT8CalibrationTableName = "ORT_TENSORRT_INT8_CALIBRATION_TABLE_NAME"; +static const std::string kINT8UseNativeTensorrtCalibrationTable = "ORT_TENSORRT_INT8_USE_NATIVE_CALIBRATION_TABLE"; static const std::string kDumpSubgraphs = "ORT_TENSORRT_DUMP_SUBGRAPHS"; static const std::string kEngineCacheEnable = "ORT_TENSORRT_ENGINE_CACHE_ENABLE"; -static const std::string kEngineCachePath = "ORT_TENSORRT_ENGINE_CACHE_PATH"; +static const std::string kCachePath = "ORT_TENSORRT_CACHE_PATH"; } // namespace tensorrt_env_vars class TensorrtLogger : public nvinfer1::ILogger { @@ -79,12 +82,14 @@ struct TensorrtFuncState { std::unordered_map>> input_shape_ranges; OrtMutex* tensorrt_mu_ptr = nullptr; bool* fp16_enable_ptr = nullptr; + bool* int8_enable_ptr = nullptr; size_t* max_workspace_size_ptr = nullptr; std::string trt_node_name_with_precision; bool engine_cache_enable; std::string engine_cache_path; nvinfer1::IRuntime* runtime = nullptr; AllocatorPtr scratch_allocator; + std::unordered_map dynamic_range_map; }; // Logical device representation. 
@@ -108,13 +113,16 @@ class TensorrtExecutionProvider : public Provider_IExecutionProvider { AllocatorPtr Provider_GetAllocator(int id, OrtMemType mem_type) const override; private: - size_t max_workspace_size_ = 1 << 30; // 1GB int max_partition_iterations_ = 1000; - int min_subgraph_size_ = 1; - bool fp16_enable_ = false; + int min_subgraph_size_ = 1; + size_t max_workspace_size_ = 1 << 30; // 1GB + bool fp16_enable_ = false; + bool int8_enable_ = false; + std::string int8_calibration_cache_name_ = "INT8_calibration_table"; + bool int8_use_native_tensorrt_calibration_table_ = false; bool dump_subgraphs_ = false; bool engine_cache_enable_ = false; - std::string engine_cache_path_; + std::string cache_path_; nvinfer1::IRuntime* runtime_ = nullptr; OrtMutex tensorrt_mu_; @@ -146,5 +154,4 @@ class TensorrtExecutionProvider : public Provider_IExecutionProvider { AllocatorPtr allocator_; }; - } // namespace onnxruntime \ No newline at end of file