From a8f74e3ec758c4d0a10e41dba87100eb7fe6a649 Mon Sep 17 00:00:00 2001 From: Chi Lo <54722500+chilo-ms@users.noreply.github.com> Date: Thu, 18 Apr 2024 14:03:04 -0700 Subject: [PATCH] [TensorRT EP] TensorRT 10 support (#20167) This PR has the change of supporting INT64 tensor type for TRT 10. This PR is also **compatible with TRT 8.6 and TRT 10** meaning user can build ORT TRT against TRT 8.6 or TRT 10. Due to the timeline for TRT 10 GA and ORT 1.18 release is very tight (We don't have enough time to get our CIs installed with TRT 10 GA libraries and run the build/tests), as well as Nvidia new Triton release (The timeline is also very close to the timeline of TRT 10 GA) wants to integrate TRT EP with TRT 10. Therefore, our approach is to make this PR into ORT 1.18 first, so everything is fully tested with TRT 8.6 CIs, and user can still manually build ORT 1.18 against TRT 10 like the Triton case. As for testing TRT 10, once TRT 10 GA is released, we will have another branch which includes change at this PR as well as whatever changes needed and update our CIs with TRT 10. --- .../tensorrt/tensorrt_execution_provider.cc | 204 +++++++++++------- .../tensorrt/tensorrt_execution_provider.h | 4 + 2 files changed, 127 insertions(+), 81 deletions(-) diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index ac9d9f7288..f33e9a968c 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -399,43 +399,17 @@ std::unique_lock TensorrtExecutionProvider::GetApiLock() const { /* * Get the shape of "shape tensor" input */ +template Status GetShapeOfShapeTensor(Ort::ConstValue& input_tensor, - std::vector& shape_values, - nvinfer1::ICudaEngine* trt_engine, - const char* input_name, + void* shape_values, + int shape_size, cudaStream_t stream) { - auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); - const auto tensor_shapes = tensor_info.GetShape(); - const auto tensor_type = tensor_info.GetElementType(); - nvinfer1::Dims dims = trt_engine->getTensorShape(input_name); - int nb_dims = dims.nbDims; - int shape_size = nb_dims == 0 ? 1 : static_cast(tensor_shapes[0]); // The shape of the "shape tensor" is either zero dimension (scalar) or 1-dimension - shape_values.resize(shape_size, 1); - - switch (tensor_type) { - case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: { - auto input = std::make_unique(shape_size); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input.get(), input_tensor.GetTensorData(), shape_size * sizeof(int32_t), cudaMemcpyDeviceToHost, stream)); - CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); - for (int j = 0; j < shape_size; ++j) { - shape_values[j] = input[j]; - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: { - auto input = std::make_unique(shape_size); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input.get(), input_tensor.GetTensorData(), shape_size * sizeof(int64_t), cudaMemcpyDeviceToHost, stream)); - CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); - for (int j = 0; j < shape_size; ++j) { - shape_values[j] = static_cast(input[j]); - } - break; - } - default: { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT shape tensor data type: " + std::to_string(tensor_type) + " not supported."); - } - } + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(shape_values, + input_tensor.GetTensorData(), + shape_size * sizeof(T), + cudaMemcpyDeviceToHost, + stream)); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); return Status::OK(); } @@ -556,13 +530,16 @@ bool ApplyProfileShapesFromProviderOptions(std::vector shape values" for the INT32 shape tensor input across this inference run + * @param shape_tensor_values_int64 holds "shape tensor -> shape values" for the INT64 shape tensor input across this inference run */ Status ApplyProfileShapesFromInputTensorValue(std::vector& trt_profiles, Ort::KernelContext ctx, nvinfer1::ITensor* input, ShapeRangesMap& shape_ranges, const std::unordered_map& input_indexes, - std::unordered_map>& tensor_shape_values, + std::unordered_map>& shape_tensor_values, + std::unordered_map>& shape_tensor_values_int64, cudaStream_t stream, bool* engine_update) { for (size_t i = 0; i < trt_profiles.size(); i++) { @@ -615,26 +592,35 @@ Status ApplyProfileShapesFromInputTensorValue(std::vectorisShapeTensor()) { // Get shape values for shape tensor input const auto tensor_type = tensor_info.GetElementType(); - int shape_size = nb_dims == 0 ? 1 : static_cast(tensor_shapes[0]); // The shape of the "shape tensor" is either zero dimension (scalar) or 1-dimension - tensor_shape_values[input_name].resize(shape_size); + // The shape of the "shape tensor" is either zero dimension (scalar) or 1-dimension + int shape_size = dims.nbDims == 0 ? 1 : static_cast(tensor_shapes[0]); + // For setting TRT optimization profile. (Note: the min/opt/max profile values are still int32 even though int64 is supported after TRT 10) + std::vector values(shape_size); + switch (tensor_type) { case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: { - auto input_shape = std::make_unique(shape_size); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input_shape.get(), input_tensor.GetTensorData(), - shape_size * sizeof(int32_t), cudaMemcpyDeviceToHost, stream)); - CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); + auto buffer = std::make_unique(shape_size); + auto status = GetShapeOfShapeTensor(input_tensor, buffer.get(), shape_size, stream); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + shape_tensor_values[input_name].resize(shape_size); for (int j = 0; j < shape_size; ++j) { - tensor_shape_values[input_name][j] = input_shape[j]; + shape_tensor_values[input_name][j] = buffer[j]; + values[j] = buffer[j]; } break; } case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: { - auto input_shape = std::make_unique(shape_size); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input_shape.get(), input_tensor.GetTensorData(), - shape_size * sizeof(int64_t), cudaMemcpyDeviceToHost, stream)); - CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); + auto buffer = std::make_unique(shape_size); + auto status = GetShapeOfShapeTensor(input_tensor, buffer.get(), shape_size, stream); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + shape_tensor_values_int64[input_name].resize(shape_size); for (int j = 0; j < shape_size; ++j) { - tensor_shape_values[input_name][j] = static_cast(input_shape[j]); + shape_tensor_values_int64[input_name][j] = buffer[j]; + values[j] = static_cast(buffer[j]); } break; } @@ -655,7 +641,7 @@ Status ApplyProfileShapesFromInputTensorValue(std::vector(shape_range[1]); shapes_opt[j] = static_cast(shape_range[2]); - const auto& tensor_shape_value = tensor_shape_values[input_name][j]; + const auto& tensor_shape_value = values[j]; // Update shape range lower bound if (tensor_shape_value < shape_range[0]) { shape_range[0] = tensor_shape_value; @@ -675,7 +661,7 @@ Status ApplyProfileShapesFromInputTensorValue(std::vector> profile_vector; std::vector shape_vector{tensor_shape_value, tensor_shape_value, tensor_shape_value}; profile_vector.push_back(shape_vector); // only one profile needed @@ -802,13 +788,16 @@ Status ApplyProfileShapesFromInputTensorValue(std::vector shape values" for the INT32 shape tensor input across this inference run + * @param shape_tensor_values_int64 holds "shape tensor -> shape values" for the INT64 shape tensor input across this inference run */ Status BindContextInput(Ort::KernelContext& ctx, nvinfer1::ICudaEngine* trt_engine, nvinfer1::IExecutionContext* trt_context, const char* input_name, size_t input_index, - std::vector& shape_values, // only for "shape tensor" + std::unordered_map>& shape_tensor_values, + std::unordered_map>& shape_tensor_values_int64, std::vector>& scratch_buffers, OrtAllocator* alloc, cudaStream_t stream) { @@ -829,19 +818,62 @@ Status BindContextInput(Ort::KernelContext& ctx, const auto elem_cnt = tensor_info.GetElementCount(); if (trt_engine->isShapeInferenceIO(input_name)) { - // Get the shape value of "shape tensor" - if (shape_values.empty()) { - auto status = GetShapeOfShapeTensor(input_tensor, shape_values, trt_engine, input_name, stream); - if (status != Status::OK()) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); - } - } - // Bind "shape tensor" input buffer - if (!trt_context->setTensorAddress(input_name, &shape_values[0])) { - std::string error_input_name = input_name; - ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP failed to call nvinfer1::IExecutionContext::setTensorAddress() for shape input '" + error_input_name + "'")); + + // The shape of the "shape tensor" is either zero dimension (scalar) or 1-dimension + int shape_size = trt_engine->getTensorShape(input_name).nbDims == 0 ? 1 : static_cast(tensor_shapes[0]); + switch (tensor_type) { + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: { + // get shape tensor value if not present + if (shape_tensor_values.find(input_name) == shape_tensor_values.end()) { + auto input = std::make_unique(shape_size); + auto status = GetShapeOfShapeTensor(input_tensor, input.get(), shape_size, stream); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + shape_tensor_values[input_name].resize(shape_size); + for (int i = 0; i < shape_size; ++i) { + shape_tensor_values[input_name][i] = input[i]; + } + } + + if (!trt_context->setTensorAddress(input_name, &shape_tensor_values[input_name][0])) { + std::string error_input_name = input_name; + std::string error_msg = + "TensorRT EP failed to call nvinfer1::IExecutionContext::setTensorAddress() for shape input '" + + error_input_name + "'"; + ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, error_msg)); + } + break; + } + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: { + // get shape tensor value if not present + if (shape_tensor_values_int64.find(input_name) == shape_tensor_values_int64.end()) { + auto input = std::make_unique(shape_size); + auto status = GetShapeOfShapeTensor(input_tensor, input.get(), shape_size, stream); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + shape_tensor_values_int64[input_name].resize(shape_size); + for (int i = 0; i < shape_size; ++i) { + shape_tensor_values_int64[input_name][i] = input[i]; + } + } + + if (!trt_context->setTensorAddress(input_name, &shape_tensor_values_int64[input_name][0])) { + std::string error_input_name = input_name; + std::string error_msg = + "TensorRT EP failed to call nvinfer1::IExecutionContext::setTensorAddress() for shape input '" + + error_input_name + "'"; + ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, error_msg)); + } + break; + } + default: { + std::string error_input_name = input_name; + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "The data type of shape tensor should be INT32 or INT64. Please check the data type of " + error_input_name); + } } } else { // Set shape for input tensor which is execution tensor @@ -869,8 +901,12 @@ Status BindContextInput(Ort::KernelContext& ctx, CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8, int8_t) CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t) CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t) - // Cast int64 input to int32 input because TensorRT doesn't support int64 +#if NV_TENSORRT_MAJOR >= 10 + CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t) +#else + // Cast int64 input to int32 input because TensorRT < 10 doesn't support int64 CASE_GET_CAST_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t, int32_t) +#endif // Cast double input to float because TensorRT doesn't support double CASE_GET_CAST_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, double, float) default: { @@ -957,8 +993,12 @@ Status BindContextOutput(Ort::KernelContext& ctx, CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8, int8_t) CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t) CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t) - // Allocate int32 CUDA memory for int64 output type because TensorRT doesn't support int64 +#if NV_TENSORRT_MAJOR >= 10 + CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t) +#else + // Allocate int32 CUDA memory for int64 output type because TensorRT < 10 doesn't support int64 CASE_GET_CAST_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t, int32_t) +#endif // Allocate float CUDA memory for double output type because TensorRT doesn't support double CASE_GET_CAST_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, double, float) default: { @@ -3043,7 +3083,12 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView const std::unordered_map& output_indexes = (trt_state->output_info)[0]; const std::unordered_map& output_types = (trt_state->output_info)[1]; auto fused_node_name = trt_state->fused_node_name; + // This map "shape_ranges" contains the shape range info for setting TRT optimization profiles. + // The info is used for both shape tensor and execution tensor: + // tensor name->(dimension->[min, max, opt]) auto& shape_ranges = trt_state->input_shape_ranges; + std::unordered_map> shape_tensor_values; // This map holds "shape tensor -> shape values" for the shape tensor input across this inference run + std::unordered_map> shape_tensor_values_int64; // same as above but for int64 shape tensor input auto& dds_output_allocator_map = this->dds_output_allocator_maps_[fused_node_name]; auto trt_builder = trt_state->builder; auto trt_engine = trt_state->engine->get(); @@ -3055,7 +3100,6 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView bool engine_update = false; bool context_update = false; std::unordered_set input_names; - std::unordered_map> tensor_shape_values; OrtDevice device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, narrow(device_id_)); OrtMemoryInfo mem_info("", OrtAllocatorType::OrtDeviceAllocator, device, device_id_); @@ -3153,7 +3197,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView // If there is any input tensor in shape_ranges, it means this input tensor has dynamic shape and its profile shape values have not yet resolved. // TRT EP will help determine the min/max/opt profile values based on current input tensor value. if (shape_ranges.find(input_name) != shape_ranges.end()) { - auto status = ApplyProfileShapesFromInputTensorValue(trt_profiles, ctx, input, shape_ranges, input_indexes, tensor_shape_values, stream, &engine_update); + auto status = ApplyProfileShapesFromInputTensorValue(trt_profiles, ctx, input, shape_ranges, input_indexes, shape_tensor_values, shape_tensor_values_int64, stream, &engine_update); if (status != Status::OK()) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to parse input tensor and generate optimization profiles."); } @@ -3366,13 +3410,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); const auto tensor_shapes = tensor_info.GetShape(); - // Only use for "shape tensor" input - std::vector shape_values; - if (tensor_shape_values.find(input_name) != tensor_shape_values.end()) { - shape_values = tensor_shape_values[input_name]; - } - - auto status = BindContextInput(ctx, trt_engine, trt_context, input_name, input_index, shape_values, scratch_buffers, alloc, stream); + auto status = BindContextInput(ctx, trt_engine, trt_context, input_name, input_index, shape_tensor_values, shape_tensor_values_int64, scratch_buffers, alloc, stream); if (status != Status::OK()) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); } @@ -3476,12 +3514,15 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView } } else { auto& output_tensor = output_tensors[i]; +#if NV_TENSORRT_MAJOR < 10 if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64) { auto output_tensor_ptr = output_tensor.GetTensorMutableData(); if (output_tensor_ptr != nullptr) { cuda::Impl_Cast(stream, reinterpret_cast(buffers[output_name]), output_tensor_ptr, output_dim_sizes[i]); } - } else if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) { + } +#endif + if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) { auto output_tensor_ptr = output_tensor.GetTensorMutableData(); if (output_tensor_ptr != nullptr) { cuda::Impl_Cast(stream, reinterpret_cast(buffers[output_name]), output_tensor_ptr, output_dim_sizes[i]); @@ -3622,8 +3663,9 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con auto trt_engine = trt_state->engine->get(); auto trt_context = trt_state->context->get(); auto max_context_mem_size_ptr = trt_state->max_context_mem_size_ptr; - // int num_inputs = static_cast(input_indexes.size()); int num_outputs = static_cast(output_indexes.size()); + std::unordered_map> shape_tensor_values; // This map holds "shape tensor -> shape values" for the shape tensor input across this inference run + std::unordered_map> shape_tensor_values_int64; // same as above but for int64 shape tensor input OrtDevice device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, narrow(device_id_)); OrtMemoryInfo mem_info("", OrtAllocatorType::OrtDeviceAllocator, device, device_id_); @@ -3662,10 +3704,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con input_index = iter->second; } - // Only use for "shape tensor" input - std::vector shape_values; - - Status status = BindContextInput(ctx, trt_engine, trt_context, input_name, input_index, shape_values, scratch_buffers, alloc, stream); + Status status = BindContextInput(ctx, trt_engine, trt_context, input_name, input_index, shape_tensor_values, shape_tensor_values_int64, scratch_buffers, alloc, stream); if (status != Status::OK()) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); } @@ -3769,12 +3808,15 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con } } else { auto& output_tensor = output_tensors[i]; +#if NV_TENSORRT_MAJOR < 10 if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64) { auto output_tensor_ptr = output_tensor.GetTensorMutableData(); if (output_tensor_ptr != nullptr) { cuda::Impl_Cast(stream, reinterpret_cast(buffers[output_name]), output_tensor_ptr, output_dim_sizes[i]); } - } else if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) { + } +#endif + if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) { auto output_tensor_ptr = output_tensor.GetTensorMutableData(); if (output_tensor_ptr != nullptr) { cuda::Impl_Cast(stream, reinterpret_cast(buffers[output_name]), output_tensor_ptr, output_dim_sizes[i]); diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index 7ee0527b0b..eabbbdea1c 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -140,6 +140,10 @@ class OutputAllocator : public nvinfer1::IOutputAllocator { std::vector output_shapes; }; +/* + * This map saves the dimension range of the shape of the shape tensor or execution tensor: + * tensor name -> ( dimension -> [min, max, opt] ) + */ using ShapeRangesMap = std::unordered_map>>>; // Information to construct kernel function state.