From 7be1d4aad3f984ebe2c4fb0f7db0b9ca67cc8964 Mon Sep 17 00:00:00 2001 From: Yifan Li <109183385+yf711@users.noreply.github.com> Date: Mon, 1 Jul 2024 22:55:20 -0700 Subject: [PATCH] [TensorRT EP] Update TRT10.0 deprecated api (#20989) ### Description Note: * This PR would remove C4996 suppression in tensorrt_execution_provider.cc only (according to Nvidia, places with nvinfer.h included need C4996 suppression, when /Zc:__cplusplus is enabled in ORT win build) * A follow-up PR will be raised to update deprecated TRT Plugin api usage. Here are deprecated apis to be updated in this PR: | deprecated api | Update | | ------------------------------------------------------------ | ------------------------------------------------------------ | | [kCUBLAS](https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/namespacenvinfer1.html#a9e1d81e5a8bfeb38b86e22a66d5f836a) | / | | [kCUBLAS_LT](https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/namespacenvinfer1.html#a9e1d81e5a8bfeb38b86e22a66d5f836a) | / | | [kCUDNN](https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/namespacenvinfer1.html#a9e1d81e5a8bfeb38b86e22a66d5f836a) | / | | [reallocateOutput](https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1v__1__0_1_1_i_output_allocator.html#acae6441d4029584cc1c6550917518691) | Superseded by [reallocateOutputAsync](https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1v__1__0_1_1_i_output_allocator.html#aa40eeb891c1dfe4c1bbf1eabe8c705ab) with cudaStream_t argument | | [createExecutionContextWithoutDeviceMemory](https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_cuda_engine.html#adc86bcc42b098204997396ef2b1093fb) | Superseded by [createExecutionContext()](https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_cuda_engine.html#a35de29aa6134165a5b14a537e6d99e82) with parameter.
Check [ExecutionContextAllocationStrategy::kUSER_MANAGED](https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/namespacenvinfer1.html#ac6251a050df629edfc0ce037fa366503) for more detail | ### Motivation and Context TRT deprecated api list: https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/deprecated.html --- .../tensorrt/tensorrt_execution_provider.cc | 58 +++++++++++++------ .../tensorrt/tensorrt_execution_provider.h | 5 +- 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 3ca0935b9e..be924d6a68 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -169,11 +169,20 @@ nvinfer1::TacticSources GetTacticSourceFromString(std::string& tactic_string) { nvinfer1::TacticSource source{}; t = toUpper(t); if (t == "CUBLAS") { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Tactic kCUBLAS is deprecated in TensorRT 10.0"; +#if NV_TENSORRT_MAJOR < 10 source = nvinfer1::TacticSource::kCUBLAS; +#endif } else if (t == "CUBLASLT" || t == "CUBLAS_LT") { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Tactic kCUBLAS_LT is deprecated in TensorRT 9.0"; +#if NV_TENSORRT_MAJOR < 9 source = nvinfer1::TacticSource::kCUBLAS_LT; +#endif } else if (t == "CUDNN") { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Tactic kCUDNN is deprecated in TensorRT 10.0"; +#if NV_TENSORRT_MAJOR < 10 source = nvinfer1::TacticSource::kCUDNN; +#endif } else if (t == "EDGE_MASK_CONVOLUTIONS") { source = nvinfer1::TacticSource::kEDGE_MASK_CONVOLUTIONS; } else if (t == "JIT_CONVOLUTIONS") { @@ -298,6 +307,25 @@ void CudaCall(cudnnStatus_t retCode, const char* exprString return g_host->CudaCall_true(retCode, exprString, libName, successCode, msg, file, line); } +#if NV_TENSORRT_MAJOR >= 10 +void* OutputAllocator::reallocateOutputAsync(char const* /*tensorName*/, void* /*currentMemory*/, uint64_t size, + uint64_t /*alignment*/, cudaStream_t /*stream*/) noexcept { + // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr + // even for empty tensors, so allocate a dummy byte. + size = std::max(size, static_cast(1)); + if (size > allocated_size) { + cudaFree(outputPtr); + outputPtr = nullptr; + allocated_size = 0; + if (cudaMalloc(&outputPtr, size) == cudaSuccess) { + allocated_size = size; + } + } + // if cudaMalloc fails, returns nullptr. + return outputPtr; +} +#else +// Only override this method when TensorRT <= 8.6 void* OutputAllocator::reallocateOutput(char const* /*tensorName*/, void* /*currentMemory*/, uint64_t size, uint64_t /*alignment*/) noexcept { // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr @@ -314,6 +342,7 @@ void* OutputAllocator::reallocateOutput(char const* /*tensorName*/, void* /*curr // if cudaMalloc fails, returns nullptr. return outputPtr; } +#endif void OutputAllocator::notifyShape(char const* /*tensorName*/, nvinfer1::Dims const& dims) noexcept { output_shapes.clear(); @@ -3152,14 +3181,10 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView if (mem_size > max_ctx_mem_size_) { max_ctx_mem_size_ = mem_size; } - -#if defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable : 4996) // nvinfer1::ICudaEngine::createExecutionContextWithoutDeviceMemory was deprecated -#endif +#if NV_TENSORRT_MAJOR < 10 trt_context = std::unique_ptr(trt_engine->createExecutionContextWithoutDeviceMemory()); -#if defined(_MSC_VER) -#pragma warning(pop) +#else + trt_context = std::unique_ptr(trt_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED)); #endif } else { trt_context = std::unique_ptr(trt_engine->createExecutionContext()); @@ -3606,14 +3631,12 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView if (context_update) { if (trt_state->context_memory_sharing_enable) { -#if defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable : 4996) // nvinfer1::ICudaEngine::createExecutionContextWithoutDeviceMemory was deprecated -#endif +#if NV_TENSORRT_MAJOR < 10 *(trt_state->context) = std::unique_ptr( trt_state->engine->get()->createExecutionContextWithoutDeviceMemory()); -#if defined(_MSC_VER) -#pragma warning(pop) +#else + *(trt_state->context) = std::unique_ptr( + trt_state->engine->get()->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED)); #endif } else { *(trt_state->context) = std::unique_ptr( @@ -3830,13 +3853,10 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con if (mem_size > max_ctx_mem_size_) { max_ctx_mem_size_ = mem_size; } -#if defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable : 4996) // nvinfer1::ICudaEngine::createExecutionContextWithoutDeviceMemory was deprecated -#endif +#if NV_TENSORRT_MAJOR < 10 trt_context = std::unique_ptr(trt_engine->createExecutionContextWithoutDeviceMemory()); -#if defined(_MSC_VER) -#pragma warning(pop) +#else + trt_context = std::unique_ptr(trt_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED)); #endif } else { trt_context = std::unique_ptr(trt_engine->createExecutionContext()); diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index f4dae57487..ec14057956 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -116,8 +116,11 @@ using unique_pointer = std::unique_ptr; // class OutputAllocator : public nvinfer1::IOutputAllocator { public: +#if NV_TENSORRT_MAJOR >= 10 + void* reallocateOutputAsync(char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment, cudaStream_t stream) noexcept override; +#else void* reallocateOutput(char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment) noexcept override; - +#endif void notifyShape(char const* tensorName, nvinfer1::Dims const& dims) noexcept override; void* getBuffer() {