From 8fa427b264e00fb9e8a2f2b03ca78e808f0d0609 Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Mon, 14 Sep 2020 22:48:00 -0700 Subject: [PATCH] Ryanunderhill/backout 5014 (#5167) * Revert 5014 --- cmake/onnxruntime_framework.cmake | 5 + cmake/onnxruntime_providers.cmake | 16 +- onnxruntime/core/framework/data_transfer.h | 3 - .../core/framework/provider_bridge_ort.cc | 71 +++++---- .../providers/dnnl/dnnl_execution_provider.cc | 8 +- .../providers/shared_library/provider_api.h | 11 ++ .../provider_bridge_provider.cc | 14 ++ .../shared_library/provider_interfaces.h | 47 +++--- onnxruntime/core/providers/tensorrt/cuda.cc | 146 ------------------ .../core/providers/tensorrt/cuda_allocator.cc | 70 --------- .../core/providers/tensorrt/cuda_allocator.h | 36 ----- .../core/providers/tensorrt/cuda_fence.cc | 68 -------- .../core/providers/tensorrt/cuda_fence.h | 25 --- .../providers/tensorrt/gpu_data_transfer.cc | 68 -------- .../providers/tensorrt/gpu_data_transfer.h | 35 ----- .../tensorrt/tensorrt_execution_provider.cc | 52 +++++-- .../tensorrt/tensorrt_execution_provider.h | 8 - 17 files changed, 142 insertions(+), 541 deletions(-) delete mode 100644 onnxruntime/core/providers/tensorrt/cuda.cc delete mode 100644 onnxruntime/core/providers/tensorrt/cuda_allocator.cc delete mode 100644 onnxruntime/core/providers/tensorrt/cuda_allocator.h delete mode 100644 onnxruntime/core/providers/tensorrt/cuda_fence.cc delete mode 100644 onnxruntime/core/providers/tensorrt/cuda_fence.h delete mode 100644 onnxruntime/core/providers/tensorrt/gpu_data_transfer.cc delete mode 100644 onnxruntime/core/providers/tensorrt/gpu_data_transfer.h diff --git a/cmake/onnxruntime_framework.cmake b/cmake/onnxruntime_framework.cmake index 0afd668d23..b97cc0ed48 100644 --- a/cmake/onnxruntime_framework.cmake +++ b/cmake/onnxruntime_framework.cmake @@ -24,7 +24,12 @@ add_library(onnxruntime_framework ${onnxruntime_framework_srcs}) if(onnxruntime_ENABLE_INSTRUMENT) target_compile_definitions(onnxruntime_framework PRIVATE ONNXRUNTIME_ENABLE_INSTRUMENT) endif() +if(onnxruntime_USE_TENSORRT) +# TODO: for now, core framework depends on CUDA. It should be moved to TensorRT EP +target_include_directories(onnxruntime_framework PRIVATE ${ONNXRUNTIME_ROOT} ${onnxruntime_CUDNN_HOME}/include PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) +else() target_include_directories(onnxruntime_framework PRIVATE ${ONNXRUNTIME_ROOT} PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) +endif() onnxruntime_add_include_to_target(onnxruntime_framework onnxruntime_common onnx onnx_proto protobuf::libprotobuf flatbuffers) set_target_properties(onnxruntime_framework PROPERTIES FOLDER "ONNXRuntime") # need onnx to build to create headers that this project includes diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index 7f661a1965..4c1a74d273 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -311,7 +311,6 @@ if (onnxruntime_USE_TENSORRT OR onnxruntime_USE_DNNL) if(APPLE) set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/shared/exported_symbols.lst") elseif(UNIX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations") set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds -Xlinker --gc-sections") elseif(WIN32) set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/shared/symbols.def") @@ -338,7 +337,6 @@ if (onnxruntime_USE_DNNL) add_dependencies(onnxruntime_providers_dnnl onnxruntime_providers_shared project_dnnl ${onnxruntime_EXTERNAL_DEPENDENCIES}) target_include_directories(onnxruntime_providers_dnnl PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${DNNL_INCLUDE_DIR}) # ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found - target_link_libraries(onnxruntime_providers_dnnl PRIVATE dnnl onnxruntime_providers_shared) install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/dnnl DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers) set_target_properties(onnxruntime_providers_dnnl PROPERTIES FOLDER "ONNXRuntime") @@ -376,6 +374,7 @@ if (onnxruntime_USE_TENSORRT) endif() set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -include algorithm") set(PROTOBUF_LIBRARY libprotobuf) + set(DISABLED_WARNINGS_FOR_TRT /wd4267 /wd4244 /wd4996 /wd4456) endif() if ( CMAKE_COMPILER_IS_GNUCC ) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter -Wno-missing-field-initializers") @@ -406,11 +405,6 @@ if (onnxruntime_USE_TENSORRT) file(GLOB_RECURSE onnxruntime_providers_tensorrt_cc_srcs CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/core/providers/tensorrt/*.h" "${ONNXRUNTIME_ROOT}/core/providers/tensorrt/*.cc" - "${ONNXRUNTIME_ROOT}/core/providers/cuda/cu_inc/common.cuh" - "${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops_impl.cuh" - "${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops.h" - "${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops_impl.cu" - "${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops_impl.h" "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h" "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc" ) @@ -424,15 +418,17 @@ if (onnxruntime_USE_TENSORRT) else() target_link_directories(onnxruntime_providers_tensorrt PRIVATE ${onnxruntime_CUDA_HOME}/lib64) endif() - target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} cudart onnxruntime_providers_shared protobuf::libprotobuf flatbuffers) + target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} cudart onnxruntime_providers_shared protobuf::libprotobuf) target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${onnxruntime_CUDNN_HOME}/include ${eigen_INCLUDE_DIRS} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found - install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/tensorrt DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers) - set_target_properties(onnxruntime_providers_tensorrt PROPERTIES LINKER_LANGUAGE CUDA) + set_target_properties(onnxruntime_providers_tensorrt PROPERTIES LINKER_LANGUAGE CXX) set_target_properties(onnxruntime_providers_tensorrt PROPERTIES FOLDER "ONNXRuntime") target_compile_definitions(onnxruntime_providers_tensorrt PRIVATE ONNXIFI_BUILD_LIBRARY=1) target_compile_options(onnxruntime_providers_tensorrt PRIVATE ${DISABLED_WARNINGS_FOR_TRT}) + if (WIN32) + target_compile_options(onnxruntime_providers_tensorrt INTERFACE /wd4267 /wd4244 /wd4996 /wd4456) + endif() if(APPLE) set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/tensorrt/exported_symbols.lst") diff --git a/onnxruntime/core/framework/data_transfer.h b/onnxruntime/core/framework/data_transfer.h index d2e4cf5f2a..e2093f0abf 100644 --- a/onnxruntime/core/framework/data_transfer.h +++ b/onnxruntime/core/framework/data_transfer.h @@ -26,9 +26,6 @@ class IDataTransfer { // batched copy. default implementation copies each entry sequentially, and returns on first failure. virtual common::Status CopyTensors(const std::vector& src_dst_pairs) const; - - // If this is really a Provider_IDataTransfer, this returns true. Used to convert back & forth with providers efficiently - virtual bool IsProviderInterface() const { return false; } }; class CPUDataTransfer : public IDataTransfer { diff --git a/onnxruntime/core/framework/provider_bridge_ort.cc b/onnxruntime/core/framework/provider_bridge_ort.cc index 114b32dbc9..425d24b658 100644 --- a/onnxruntime/core/framework/provider_bridge_ort.cc +++ b/onnxruntime/core/framework/provider_bridge_ort.cc @@ -15,6 +15,15 @@ #include "core/session/inference_session.h" #include "core/session/abi_session_options_impl.h" #include "core/session/ort_apis.h" + +#ifdef USE_TENSORRT +#include "core/providers/tensorrt/tensorrt_provider_factory.h" +#include "core/providers/cuda/cuda_allocator.h" +#include "core/providers/cuda/gpu_data_transfer.h" +#include "core/providers/cuda/math/unary_elementwise_ops_impl.h" +#include "core/providers/cuda/cuda_common.h" +#endif + #define PROVIDER_BRIDGE_ORT #include "core/providers/shared_library/provider_interfaces.h" #include "onnx/common/stl_backports.h" @@ -60,8 +69,6 @@ struct Provider_IAllocator_Impl : Provider_IAllocator { void* Alloc(size_t size) override { return p_->Alloc(size); } void Free(void* p) override { return p_->Free(p); } - FencePtr CreateFence(const Provider_SessionState* session_state) override { return p_->CreateFence(reinterpret_cast(session_state)); } - bool IsProviderInterface() const override { return false; } std::unique_ptr p_; @@ -74,22 +81,9 @@ struct ProviderAllocator : IAllocator { void* Alloc(size_t size) override { return p_->Alloc(size); } void Free(void* p) override { return p_->Free(p); } - FencePtr CreateFence(const SessionState* session_state) override { return p_->CreateFence(reinterpret_cast(session_state)); } - std::shared_ptr p_; }; -struct IDataTransfer_Wrapper : IDataTransfer { - IDataTransfer_Wrapper(std::unique_ptr p) : p_{std::move(p)} {} - - bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override { return p_->CanCopy(src_device, dst_device); } - common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const override { return p_->CopyTensor(*reinterpret_cast(&src), *reinterpret_cast(&dst), exec_queue_id); } - - bool IsProviderInterface() const override { return true; } - - std::unique_ptr p_; -}; - struct Provider_TensorShapeProto_Dimension_Iterator_Impl : Provider_TensorShapeProto_Dimension_Iterator { Provider_TensorShapeProto_Dimension_Iterator_Impl(google::protobuf::internal::RepeatedPtrIterator&& v) : v_{std::move(v)} {} @@ -210,11 +204,7 @@ struct Provider_IExecutionProvider_Router_Impl : Provider_IExecutionProvider_Rou } std::unique_ptr GetDataTransfer() const override { - auto internal = outer_->Provider_GetDataTransfer(); - if (internal) - return onnxruntime::make_unique(std::move(internal)); - else - return nullptr; + return std::unique_ptr(reinterpret_cast(outer_->Provider_GetDataTransfer().release())); } void Provider_InsertAllocator(Provider_AllocatorPtr allocator) override { @@ -237,8 +227,8 @@ struct ProviderHostImpl : ProviderHost { AllocatorCreationInfo info_real{ [&info](int value) -> std::unique_ptr { auto allocator = info.factory(value); - // If the allocator is a provider interface, we need to wrap it with ProviderAllocator to turn it into an IDeviceAllocator - // Otherwise it's really a Provider_IDeviceAllocator_Impl, so we can just unwrap it to get back to the IDeviceAllocator inside + // If the allocator is a provider interface, we need to wrap it with ProviderAllocator to turn it into an IAllocator + // Otherwise it's really a Provider_IAllocator_Impl, so we can just unwrap it to get back to the IAllocator inside if (allocator->IsProviderInterface()) return onnxruntime::make_unique(std::move(allocator)); else @@ -264,6 +254,31 @@ struct ProviderHostImpl : ProviderHost { return onnxruntime::make_unique(outer, type); }; +#ifdef USE_TENSORRT + std::unique_ptr CreateCUDAAllocator(int16_t device_id, const char* name) override { + return onnxruntime::make_unique(onnxruntime::make_unique(device_id, name)); + } + + std::unique_ptr CreateCUDAPinnedAllocator(int16_t device_id, const char* name) override { + return onnxruntime::make_unique(onnxruntime::make_unique(device_id, name)); + } + + std::unique_ptr CreateGPUDataTransfer() override { + return std::unique_ptr(reinterpret_cast(new GPUDataTransfer())); + } + + void cuda__Impl_Cast(const int64_t* input_data, int32_t* output_data, size_t count) override { + return cuda::Impl_Cast(input_data, output_data, count); + } + + void cuda__Impl_Cast(const int32_t* input_data, int64_t* output_data, size_t count) override { + return cuda::Impl_Cast(input_data, output_data, count); + } + + bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return CudaCall(cudaError(retCode), exprString, libName, cudaError(successCode), msg); } + bool CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return CudaCall(cudaError(retCode), exprString, libName, cudaError(successCode), msg); } +#endif + std::string GetEnvironmentVar(const std::string& var_name) override { return Env::Default().GetEnvironmentVar(var_name); } @@ -379,12 +394,9 @@ struct ProviderHostImpl : ProviderHost { // Provider_DataTransferManager Status Provider_DataTransferManager__CopyTensor(const Provider_DataTransferManager* p, const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) override { return reinterpret_cast(p)->CopyTensor(*reinterpret_cast(&src), *reinterpret_cast(&dst), exec_queue_id); } - const Provider_IDataTransfer* Provider_DataTransferManager__GetProviderDataTransfer(const Provider_DataTransferManager* p, const OrtDevice& src_device, const OrtDevice& dst_device) override { - auto* data_transfer = reinterpret_cast(p)->GetDataTransfer(src_device, dst_device); - if (data_transfer->IsProviderInterface()) - return reinterpret_cast(data_transfer)->p_.get(); - return nullptr; - } + + // Provider_IDataTransfer + void Provider_IDataTransfer__operator_delete(Provider_IDataTransfer* p) override { delete reinterpret_cast(p); } // Provider_IndexedSubGraph_MetaDef std::unique_ptr Provider_IndexedSubGraph_MetaDef__construct() override { return std::unique_ptr(reinterpret_cast(new IndexedSubGraph::MetaDef())); } @@ -585,9 +597,6 @@ struct ProviderHostImpl : ProviderHost { const Provider_DataTransferManager& Provider_OpKernelInfo__GetDataTransferManager(const Provider_OpKernelInfo* p) noexcept override { return *reinterpret_cast(&reinterpret_cast(p)->GetDataTransferManager()); } int Provider_OpKernelInfo__GetKernelDef_ExecQueueId(const Provider_OpKernelInfo* p) noexcept override { return reinterpret_cast(p)->GetKernelDef().ExecQueueId(); } - // Provider_SessionState - const Provider_DataTransferManager& Provider_SessionState__GetDataTransferManager(const Provider_SessionState* p) override { return *reinterpret_cast(&reinterpret_cast(p)->GetDataTransferMgr()); } - // Provider_Tensor float* Provider_Tensor__MutableData_float(Provider_Tensor* p) override { return reinterpret_cast(p)->MutableData(); } const float* Provider_Tensor__Data_float(const Provider_Tensor* p) override { return reinterpret_cast(p)->Data(); } diff --git a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc index e2d487a940..b03c05ad65 100644 --- a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc +++ b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc @@ -29,16 +29,16 @@ constexpr const char* DNNL_CPU = "DnnlCpu"; DNNLExecutionProvider::DNNLExecutionProvider(const DNNLExecutionProviderInfo& info) : Provider_IExecutionProvider{onnxruntime::kDnnlExecutionProvider} { Provider_AllocatorCreationInfo default_memory_info( - [](int) { + {[](int) { return onnxruntime::Provider_CreateCPUAllocator(OrtMemoryInfo(DNNL, OrtAllocatorType::OrtDeviceAllocator)); - }, + }}, 0, info.create_arena); Provider_AllocatorCreationInfo cpu_memory_info( - [](int) { + {[](int) { return onnxruntime::Provider_CreateCPUAllocator(OrtMemoryInfo(DNNL_CPU, OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), 0, OrtMemTypeCPUOutput)); - }, + }}, 0, info.create_arena); Provider_InsertAllocator(CreateAllocator(default_memory_info)); diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index bdde16976b..1f7d304386 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -88,6 +88,13 @@ constexpr const char* kMSDomain = "com.microsoft"; constexpr const char* kDnnlExecutionProvider = "DnnlExecutionProvider"; constexpr const char* kTensorrtExecutionProvider = "TensorrtExecutionProvider"; +enum CUDAStreamType : int { + kCudaStreamDefault = 0, + kCudaStreamCopyIn, + kCudaStreamCopyOut, + kTotalCudaStreams, +}; + class DataTypeImpl { public: virtual ~DataTypeImpl() = default; @@ -104,8 +111,12 @@ template using IAllocatorUniquePtr = std::unique_ptr>; std::unique_ptr Provider_CreateCPUAllocator(const OrtMemoryInfo& memory_info); +std::unique_ptr Provider_CreateCUDAAllocator(int16_t device_id, const char* name); +std::unique_ptr Provider_CreateCUDAPinnedAllocator(int16_t device_id, const char* name); Provider_AllocatorPtr CreateAllocator(const Provider_AllocatorCreationInfo& info); +std::unique_ptr Provider_CreateGPUDataTransfer(); + std::string GetEnvironmentVar(const std::string& var_name); class CPUIDInfo { diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index 326ea49263..81405f2af0 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -146,6 +146,20 @@ std::unique_ptr Provider_CreateCPUAllocator(const OrtMemory return g_host->CreateCPUAllocator(info); } +#ifdef USE_TENSORRT +std::unique_ptr Provider_CreateCUDAAllocator(int16_t device_id, const char* name) { + return g_host->CreateCUDAAllocator(device_id, name); +} + +std::unique_ptr Provider_CreateCUDAPinnedAllocator(int16_t device_id, const char* name) { + return g_host->CreateCUDAPinnedAllocator(device_id, name); +} + +std::unique_ptr Provider_CreateGPUDataTransfer() { + return g_host->CreateGPUDataTransfer(); +} +#endif + std::string GetEnvironmentVar(const std::string& var_name) { return g_host->GetEnvironmentVar(var_name); } diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 02a9fa82ef..62009f2cb9 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -5,7 +5,6 @@ // In the future the internal implementations could derive from these to remove the need for the wrapper implementations #include "core/framework/func_api.h" -#include "core/framework/fence.h" // For FencePtr #define PROVIDER_DISALLOW_ALL(TypeName) \ TypeName() = delete; \ @@ -63,7 +62,6 @@ struct Provider_NodeAttributes; struct Provider_OpKernel_Base; struct Provider_OpKernelContext; struct Provider_OpKernelInfo; -struct Provider_SessionState; struct Provider_Tensor; class TensorShape; @@ -119,8 +117,6 @@ struct Provider_IAllocator { virtual void Free(void* p) = 0; const OrtMemoryInfo& Info() const { return memory_info_; }; - virtual FencePtr CreateFence(const Provider_SessionState* /*session_state*/) { return nullptr; } - virtual bool IsProviderInterface() const { return true; } template @@ -259,17 +255,6 @@ struct Provider_IExecutionProvider { void operator=(const Provider_IExecutionProvider&) = delete; }; -struct Provider_IDataTransfer { - Provider_IDataTransfer() = default; - virtual ~Provider_IDataTransfer() {} - - virtual bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const = 0; - virtual common::Status CopyTensor(const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) const = 0; - - Provider_IDataTransfer(const Provider_IDataTransfer&) = delete; - void operator=(const Provider_IDataTransfer&) = delete; -}; - struct Provider { virtual std::shared_ptr CreateExecutionProviderFactory(int device_id) = 0; }; @@ -286,6 +271,18 @@ struct ProviderHost { virtual std::unique_ptr CreateCPUAllocator(const OrtMemoryInfo& memory_info) = 0; +#ifdef USE_TENSORRT + virtual std::unique_ptr CreateCUDAAllocator(int16_t device_id, const char* name) = 0; + virtual std::unique_ptr CreateCUDAPinnedAllocator(int16_t device_id, const char* name) = 0; + virtual std::unique_ptr CreateGPUDataTransfer() = 0; + + virtual void cuda__Impl_Cast(const int64_t* input_data, int32_t* output_data, size_t count) = 0; + virtual void cuda__Impl_Cast(const int32_t* input_data, int64_t* output_data, size_t count) = 0; + + virtual bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) = 0; + virtual bool CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) = 0; +#endif + virtual std::unique_ptr Create_IExecutionProvider_Router(Provider_IExecutionProvider* outer, const std::string& type) = 0; virtual std::string GetEnvironmentVar(const std::string& var_name) = 0; @@ -384,7 +381,9 @@ struct ProviderHost { // Provider_DataTransferManager virtual Status Provider_DataTransferManager__CopyTensor(const Provider_DataTransferManager* p, const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) = 0; - virtual const Provider_IDataTransfer* Provider_DataTransferManager__GetProviderDataTransfer(const Provider_DataTransferManager* p, const OrtDevice& src_device, const OrtDevice& dst_device) = 0; + + // Provider_IDataTransfer + virtual void Provider_IDataTransfer__operator_delete(Provider_IDataTransfer* p) = 0; // Provider_IndexedSubGraph_MetaDef virtual std::unique_ptr Provider_IndexedSubGraph_MetaDef__construct() = 0; @@ -541,9 +540,6 @@ struct ProviderHost { virtual const Provider_DataTransferManager& Provider_OpKernelInfo__GetDataTransferManager(const Provider_OpKernelInfo* p) noexcept = 0; virtual int Provider_OpKernelInfo__GetKernelDef_ExecQueueId(const Provider_OpKernelInfo* p) noexcept = 0; - // Provider_SessionState - virtual const Provider_DataTransferManager& Provider_SessionState__GetDataTransferManager(const Provider_SessionState* p) = 0; - // Provider_Tensor virtual float* Provider_Tensor__MutableData_float(Provider_Tensor* p) = 0; virtual const float* Provider_Tensor__Data_float(const Provider_Tensor* p) = 0; @@ -686,11 +682,18 @@ struct Provider_ComputeCapability { struct Provider_DataTransferManager { Status CopyTensor(const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) const { return g_host->Provider_DataTransferManager__CopyTensor(this, src, dst, exec_queue_id); } - const Provider_IDataTransfer* GetProviderDataTransfer(const OrtDevice& src_device, const OrtDevice& dst_device) const { return g_host->Provider_DataTransferManager__GetProviderDataTransfer(this, src_device, dst_device); } PROVIDER_DISALLOW_ALL(Provider_DataTransferManager) }; +struct Provider_IDataTransfer { + static void operator delete(void* p) { g_host->Provider_IDataTransfer__operator_delete(reinterpret_cast(p)); } + + Provider_IDataTransfer() = delete; + Provider_IDataTransfer(const Provider_IDataTransfer&) = delete; + void operator=(const Provider_IDataTransfer&) = delete; +}; + struct Provider_IndexedSubGraph_MetaDef { static std::unique_ptr Create() { return g_host->Provider_IndexedSubGraph_MetaDef__construct(); } static void operator delete(void* p) { g_host->Provider_IndexedSubGraph_MetaDef__operator_delete(reinterpret_cast(p)); } @@ -1015,10 +1018,6 @@ inline Status Provider_OpKernelInfo::GetAttr(const std::string& name, flo return GetAttr(name, value); } -struct Provider_SessionState { - const Provider_DataTransferManager& GetDataTransferManager() const { return g_host->Provider_SessionState__GetDataTransferManager(this); } -}; - struct Provider_Tensor { float* MutableData_float() { return g_host->Provider_Tensor__MutableData_float(this); } const float* Data_float() const { return g_host->Provider_Tensor__Data_float(this); } diff --git a/onnxruntime/core/providers/tensorrt/cuda.cc b/onnxruntime/core/providers/tensorrt/cuda.cc deleted file mode 100644 index 7854f8383c..0000000000 --- a/onnxruntime/core/providers/tensorrt/cuda.cc +++ /dev/null @@ -1,146 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/providers/shared_library/provider_api.h" -#include "core/providers/cuda/shared_inc/cuda_call.h" - -#ifdef _WIN32 -#else // POSIX -#include -#include -#endif - -namespace onnxruntime { - -using namespace common; - -template -const char* CudaErrString(ERRTYPE x) { - ORT_NOT_IMPLEMENTED(); -} - -#define CASE_ENUM_TO_STR(x) \ - case x: \ - return #x - -template <> -const char* CudaErrString(cudaError_t x) { - cudaDeviceSynchronize(); - return cudaGetErrorString(x); -} - -template <> -const char* CudaErrString(cublasStatus_t e) { - cudaDeviceSynchronize(); - - switch (e) { - CASE_ENUM_TO_STR(CUBLAS_STATUS_SUCCESS); - CASE_ENUM_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); - CASE_ENUM_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); - CASE_ENUM_TO_STR(CUBLAS_STATUS_INVALID_VALUE); - CASE_ENUM_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); - CASE_ENUM_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); - CASE_ENUM_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); - CASE_ENUM_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); - CASE_ENUM_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); - CASE_ENUM_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); - default: - return "(look for CUBLAS_STATUS_xxx in cublas_api.h)"; - } -} - -template <> -const char* CudaErrString(curandStatus) { - cudaDeviceSynchronize(); - return "(see curand.h & look for curandStatus or CURAND_STATUS_xxx)"; -} - -template <> -const char* CudaErrString(cudnnStatus_t e) { - cudaDeviceSynchronize(); - return cudnnGetErrorString(e); -} - -template <> -const char* CudaErrString(cufftResult e) { - cudaDeviceSynchronize(); - switch (e) { - CASE_ENUM_TO_STR(CUFFT_SUCCESS); - CASE_ENUM_TO_STR(CUFFT_ALLOC_FAILED); - CASE_ENUM_TO_STR(CUFFT_INVALID_VALUE); - CASE_ENUM_TO_STR(CUFFT_INTERNAL_ERROR); - CASE_ENUM_TO_STR(CUFFT_SETUP_FAILED); - CASE_ENUM_TO_STR(CUFFT_INVALID_SIZE); - default: - return "Unknown cufft error status"; - } -} - -#ifdef USE_NCCL -template <> -const char* CudaErrString(ncclResult_t e) { - cudaDeviceSynchronize(); - return ncclGetErrorString(e); -} -#endif - -template -bool CudaCall(ERRTYPE retCode, const char* exprString, const char* libName, ERRTYPE successCode, const char* msg) { - if (retCode != successCode) { - try { -#ifdef _WIN32 - auto del = [](char* p) { free(p); }; - std::unique_ptr hostname_ptr(nullptr, del); - size_t hostname_len = 0; - char* hostname = nullptr; - if (-1 == _dupenv_s(&hostname, &hostname_len, "COMPUTERNAME")) - hostname = "?"; - else - hostname_ptr.reset(hostname); -#else - char hostname[HOST_NAME_MAX]; - if (gethostname(hostname, HOST_NAME_MAX) != 0) - strcpy(hostname, "?"); -#endif - int currentCudaDevice; - cudaGetDevice(¤tCudaDevice); - cudaGetLastError(); // clear last CUDA error - static char str[1024]; - snprintf(str, 1024, "%s failure %d: %s ; GPU=%d ; hostname=%s ; expr=%s; %s", - libName, (int)retCode, CudaErrString(retCode), currentCudaDevice, - hostname, - exprString, msg); - if (THRW) { - // throw an exception with the error info - ORT_THROW(str); - } else { - LOGS_DEFAULT(ERROR) << str; - } - } catch (const std::exception& e) { // catch, log, and rethrow since CUDA code sometimes hangs in destruction, so we'd never get to see the error - if (THRW) { - ORT_THROW(e.what()); - } else { - LOGS_DEFAULT(ERROR) << e.what(); - } - } - return false; - } - return true; -} - -template bool CudaCall(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg); -template bool CudaCall(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg); -template bool CudaCall(cublasStatus_t retCode, const char* exprString, const char* libName, cublasStatus_t successCode, const char* msg); -template bool CudaCall(cublasStatus_t retCode, const char* exprString, const char* libName, cublasStatus_t successCode, const char* msg); -template bool CudaCall(cudnnStatus_t retCode, const char* exprString, const char* libName, cudnnStatus_t successCode, const char* msg); -template bool CudaCall(cudnnStatus_t retCode, const char* exprString, const char* libName, cudnnStatus_t successCode, const char* msg); -template bool CudaCall(curandStatus_t retCode, const char* exprString, const char* libName, curandStatus_t successCode, const char* msg); -template bool CudaCall(curandStatus_t retCode, const char* exprString, const char* libName, curandStatus_t successCode, const char* msg); -template bool CudaCall(cufftResult retCode, const char* exprString, const char* libName, cufftResult successCode, const char* msg); -template bool CudaCall(cufftResult retCode, const char* exprString, const char* libName, cufftResult successCode, const char* msg); - -#ifdef USE_NCCL -template bool CudaCall(ncclResult_t retCode, const char* exprString, const char* libName, ncclResult_t successCode, const char* msg); -#endif - -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tensorrt/cuda_allocator.cc b/onnxruntime/core/providers/tensorrt/cuda_allocator.cc deleted file mode 100644 index c891a49ace..0000000000 --- a/onnxruntime/core/providers/tensorrt/cuda_allocator.cc +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/providers/shared_library/provider_api.h" -#include "core/providers/cuda/shared_inc/cuda_call.h" - -#include "cuda_allocator.h" -#include "cuda_fence.h" -#include "gpu_data_transfer.h" - -namespace onnxruntime { - -static const GPUDataTransfer* GetGPUDataTransfer(const Provider_SessionState* session_state) { - OrtDevice gpu_device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0); - OrtDevice cpu_device; - return static_cast(session_state->GetDataTransferManager().GetProviderDataTransfer(gpu_device, cpu_device)); -} - -void CUDAAllocator::CheckDevice(bool throw_when_fail) const { -#ifndef NDEBUG - // check device to match at debug build - // if it's expected to change, call cudaSetDevice instead of the check - int current_device; - auto cuda_err = cudaGetDevice(¤t_device); - if (cuda_err == cudaSuccess) { - ORT_ENFORCE(current_device == Info().id); - } else if (throw_when_fail) { - CUDA_CALL_THROW(cuda_err); - } -#else - ORT_UNUSED_PARAMETER(throw_when_fail); -#endif -} - -void* CUDAAllocator::Alloc(size_t size) { - CheckDevice(true); - void* p = nullptr; - if (size > 0) { - //BFCArena was updated recently to handle the exception and adjust the request size - CUDA_CALL_THROW(cudaMalloc((void**)&p, size)); - } - return p; -} - -void CUDAAllocator::Free(void* p) { - CheckDevice(false); // ignore CUDA failure when free - cudaFree(p); // do not throw error since it's OK for cudaFree to fail during shutdown -} - -FencePtr CUDAAllocator::CreateFence(const Provider_SessionState* session_state) { - return std::make_shared(GetGPUDataTransfer(session_state)); -} - -void* CUDAPinnedAllocator::Alloc(size_t size) { - void* p = nullptr; - if (size > 0) { - CUDA_CALL_THROW(cudaMallocHost((void**)&p, size)); - } - return p; -} - -void CUDAPinnedAllocator::Free(void* p) { - CUDA_CALL_THROW(cudaFreeHost(p)); -} - -FencePtr CUDAPinnedAllocator::CreateFence(const Provider_SessionState* session_state) { - return std::make_shared(GetGPUDataTransfer(session_state)); -} - -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tensorrt/cuda_allocator.h b/onnxruntime/core/providers/tensorrt/cuda_allocator.h deleted file mode 100644 index 0eea033135..0000000000 --- a/onnxruntime/core/providers/tensorrt/cuda_allocator.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -namespace onnxruntime { - -class CUDAAllocator : public Provider_IAllocator { - public: - CUDAAllocator(OrtDevice::DeviceId device_id, const char* name) - : Provider_IAllocator( - OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator, - OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, device_id), - device_id, OrtMemTypeDefault)) {} - void* Alloc(size_t size) override; - void Free(void* p) override; - FencePtr CreateFence(const Provider_SessionState* session_state) override; - - private: - void CheckDevice(bool throw_when_fail) const; -}; - -//TODO: add a default constructor -class CUDAPinnedAllocator : public Provider_IAllocator { - public: - CUDAPinnedAllocator(OrtDevice::DeviceId device_id, const char* name) - : Provider_IAllocator( - OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator, - OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, device_id), - device_id, OrtMemTypeCPUOutput)) {} - - void* Alloc(size_t size) override; - void Free(void* p) override; - FencePtr CreateFence(const Provider_SessionState* session_state) override; -}; -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tensorrt/cuda_fence.cc b/onnxruntime/core/providers/tensorrt/cuda_fence.cc deleted file mode 100644 index 87eaf19289..0000000000 --- a/onnxruntime/core/providers/tensorrt/cuda_fence.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/providers/shared_library/provider_api.h" -#include "core/providers/cuda/shared_inc/cuda_call.h" - -#include "cuda_allocator.h" -#include "cuda_fence.h" -#include "gpu_data_transfer.h" - -namespace onnxruntime { - -constexpr const char* kCudaExecutionProvider = "CUDAExecutionProvider"; - -CUDAFence::CUDAFence(const GPUDataTransfer* data_transfer) : data_transfer_(data_transfer) { - // NOTE: cudaEventBlockingSync may leads to longer wait time because of thread yield/switching in kernel - // if lower CPU usage is more important than latency, we should use this flag to avoid spin-loop in WaitOnCPU - int event_flags = /*cudaEventBlockingSync |*/ cudaEventDisableTiming; - CUDA_CALL_THROW(cudaEventCreate(&read_event_, event_flags)); - CUDA_CALL_THROW(cudaEventCreate(&write_event_, event_flags)); -} - -CUDAFence::~CUDAFence() { - CUDA_CALL_THROW(cudaEventDestroy(read_event_)); - CUDA_CALL_THROW(cudaEventDestroy(write_event_)); -} - -void CUDAFence::BeforeUsingAsInput(onnxruntime::ProviderType provider_type, int async_queue_id) { - if (provider_type == onnxruntime::kCudaExecutionProvider) { - // sync in GPU, the call is non-blocking on CPU - CUDA_CALL_THROW(cudaStreamWaitEvent(data_transfer_->GetStream(async_queue_id), write_event_, 0)); - } else { - // sync on CPU for all other providers, this is blocking - CUDA_CALL_THROW(cudaEventSynchronize(write_event_)); - } -} - -void CUDAFence::BeforeUsingAsOutput(onnxruntime::ProviderType provider_type, int queue_id) { - if (provider_type == onnxruntime::kCudaExecutionProvider) { - // sync in GPU, the call is non-blocking on CPU - cudaStream_t stream = data_transfer_->GetStream(queue_id); - CUDA_CALL_THROW(cudaStreamWaitEvent(stream, read_event_, 0)); - CUDA_CALL_THROW(cudaStreamWaitEvent(stream, write_event_, 0)); - } else { - // sync on CPU for all other providers, this is blocking - CUDA_CALL_THROW(cudaEventSynchronize(read_event_)); - CUDA_CALL_THROW(cudaEventSynchronize(write_event_)); - } -} - -bool CUDAFence::CanRelease() { - return cudaEventQuery(read_event_) == cudaSuccess && - cudaEventQuery(write_event_) == cudaSuccess; -} - -void CUDAFence::AfterUsedAsInput(int queue_id) { - // update read fence - cudaStream_t stream = data_transfer_->GetStream(queue_id); - CUDA_CALL_THROW(cudaEventRecord(read_event_, stream)); -} - -void CUDAFence::AfterUsedAsOutput(int queue_id) { - // update write fence - cudaStream_t stream = data_transfer_->GetStream(queue_id); - CUDA_CALL_THROW(cudaEventRecord(write_event_, stream)); -} - -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tensorrt/cuda_fence.h b/onnxruntime/core/providers/tensorrt/cuda_fence.h deleted file mode 100644 index efe50902e0..0000000000 --- a/onnxruntime/core/providers/tensorrt/cuda_fence.h +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -namespace onnxruntime { -class GPUDataTransfer; - -class CUDAFence : public IFence { - public: - CUDAFence(const GPUDataTransfer* data_transfer); - virtual ~CUDAFence(); - virtual void BeforeUsingAsInput(onnxruntime::ProviderType provider_type, int queue_id) override; - virtual void BeforeUsingAsOutput(onnxruntime::ProviderType provider_type, int queue_id) override; - virtual void AfterUsedAsInput(int queue_id) override; - virtual void AfterUsedAsOutput(int queue_id) override; - virtual bool CanRelease() override; - - private: - cudaEvent_t read_event_; - cudaEvent_t write_event_; - const GPUDataTransfer* data_transfer_; -}; - -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tensorrt/gpu_data_transfer.cc b/onnxruntime/core/providers/tensorrt/gpu_data_transfer.cc deleted file mode 100644 index cdc810616d..0000000000 --- a/onnxruntime/core/providers/tensorrt/gpu_data_transfer.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/providers/shared_library/provider_api.h" -#include "core/providers/cuda/shared_inc/cuda_call.h" - -#include "gpu_data_transfer.h" - -#define CUDA_RETURN_IF_ERROR(expr) \ - ORT_RETURN_IF_ERROR(CUDA_CALL(expr) \ - ? common::Status::OK() \ - : ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "CUDA error executing ", #expr)) - -namespace onnxruntime { -GPUDataTransfer::GPUDataTransfer() { - // create streams, default is nullptr - streams_[kCudaStreamDefault] = nullptr; - CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyIn], cudaStreamNonBlocking)); - CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyOut], cudaStreamNonBlocking)); -} - -GPUDataTransfer::~GPUDataTransfer() { - CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyIn])); - CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyOut])); -} - -bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const { - return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::CUDA_PINNED || dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED; -} - -common::Status GPUDataTransfer::CopyTensor(const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) const { - size_t bytes = src.SizeInBytes(); - const void* src_data = src.DataRaw(); - void* dst_data = dst.MutableDataRaw(); - - auto& src_device = src.Location().device; - auto& dst_device = dst.Location().device; - - if (dst_device.Type() == OrtDevice::GPU) { - if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) { - // copy from pinned memory to GPU, this is non-blocking - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, streams_[exec_queue_id])); - } else if (src_device.Type() == OrtDevice::GPU) { - // copying between GPU, this is non-blocking - // Copy only if the two addresses are different. - if (dst_data != src_data) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice, streams_[kCudaStreamDefault])); - } - } else { - // copy from other CPU memory to GPU, this is blocking - CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyHostToDevice)); - } - } else if (src_device.Type() == OrtDevice::GPU) { - if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED) { - // copying from GPU to pinned memory, this is non-blocking - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, streams_[exec_queue_id])); - } else { - // copying from GPU to CPU memory, this is blocking - CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToHost)); - } - } else { - // copying between cpu memory - memcpy(dst_data, src_data, bytes); - } - - return Status::OK(); -} -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tensorrt/gpu_data_transfer.h b/onnxruntime/core/providers/tensorrt/gpu_data_transfer.h deleted file mode 100644 index 9b206664f4..0000000000 --- a/onnxruntime/core/providers/tensorrt/gpu_data_transfer.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -namespace onnxruntime { - -enum CUDAStreamType : int { - kCudaStreamDefault = 0, - kCudaStreamCopyIn, - kCudaStreamCopyOut, - kTotalCudaStreams, -}; - -class GPUDataTransfer : public Provider_IDataTransfer { - public: - GPUDataTransfer(); - ~GPUDataTransfer(); - - bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override; - - // Suppress MSVC warning about not fully overriding - using Provider_IDataTransfer::CopyTensor; - common::Status CopyTensor(const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) const override; - - cudaStream_t GetStream(int queue_id) const { - ORT_ENFORCE(queue_id >= 0 && queue_id < kTotalCudaStreams); - return streams_[queue_id]; - } - - private: - cudaStream_t streams_[kTotalCudaStreams]; -}; - -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 91582a7077..225a17703a 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -2,25 +2,25 @@ // Licensed under the MIT License. #include -#include #include -#include -#include #include -#include -#include #include "core/providers/shared_library/provider_api.h" #define ORT_API_MANUAL_INIT #include "core/session/onnxruntime_cxx_api.h" #include "core/common/safeint.h" #include "tensorrt_execution_provider.h" -#include "core/providers/cuda/math/unary_elementwise_ops_impl.h" +#include "core/providers/cuda/cuda_allocator.h" #include "core/providers/cuda/shared_inc/cuda_call.h" -#include "gpu_data_transfer.h" +#include "core/providers/cuda/math/unary_elementwise_ops_impl.h" +#include "cuda_runtime_api.h" #include "gsl/gsl" #include -#include "cuda_allocator.h" +#include +#include +#include +#include +#include #define CUDA_RETURN_IF_ERROR(expr) \ ORT_RETURN_IF_ERROR(CUDA_CALL(expr) \ @@ -69,6 +69,33 @@ struct ShutdownProtobuf { namespace onnxruntime { +namespace cuda { +template <> +void Impl_Cast( + const int64_t* input_data, int32_t* output_data, + size_t count) { + return g_host->cuda__Impl_Cast(input_data, output_data, count); +} + +template <> +void Impl_Cast( + const int32_t* input_data, int64_t* output_data, + size_t count) { + return g_host->cuda__Impl_Cast(input_data, output_data, count); +} + +} // namespace cuda + +template <> +bool CudaCall(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg) { + return g_host->CudaCall_false(retCode, exprString, libName, successCode, msg); +} + +template <> +bool CudaCall(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg) { + return g_host->CudaCall_true(retCode, exprString, libName, successCode, msg); +} + constexpr const char* TRT = "Tensorrt"; constexpr const char* TRT_PINNED = "TensorrtPinned"; @@ -131,7 +158,7 @@ KernelRegistryAndStatus GetTensorrtKernelRegistry() { } std::shared_ptr TensorrtExecutionProvider::Provider_GetKernelRegistry() const { - static KernelRegistryAndStatus k = GetTensorrtKernelRegistry(); + static KernelRegistryAndStatus k = onnxruntime::GetTensorrtKernelRegistry(); // throw if the registry failed to initialize ORT_THROW_IF_ERROR(k.st); return k.kernel_registry; @@ -148,12 +175,12 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv CUDA_CALL_THROW(cudaSetDevice(device_id_)); Provider_AllocatorCreationInfo default_memory_info( - [](int id) { return onnxruntime::make_unique(id, TRT); }, device_id_); + [](int id) { return Provider_CreateCUDAAllocator(id, TRT); }, device_id_); allocator_ = CreateAllocator(default_memory_info); Provider_InsertAllocator(allocator_); Provider_AllocatorCreationInfo pinned_allocator_info( - [](int) { return onnxruntime::make_unique(0, TRT_PINNED); }, device_id_); + [](int) { return Provider_CreateCUDAPinnedAllocator(0, TRT_PINNED); }, device_id_); Provider_InsertAllocator(CreateAllocator(pinned_allocator_info)); // Get environment variables @@ -209,7 +236,7 @@ Provider_AllocatorPtr TensorrtExecutionProvider::Provider_GetAllocator(int id, O } std::unique_ptr TensorrtExecutionProvider::Provider_GetDataTransfer() const { - return onnxruntime::make_unique(); + return onnxruntime::Provider_CreateGPUDataTransfer(); } // Convert GraphViewer graph to GraphProto @@ -1345,5 +1372,4 @@ common::Status TensorrtExecutionProvider::Provider_Compile(const std::vector' to '', possible loss of data -#pragma warning(disable : 4996) // 'gmtime': This function or variable may be unsafe. Consider using gmtime_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details. -#pragma warning(disable : 4456) // declaration of 'i' hides previous local declaration -#endif - #include #include "NvInfer.h" #include "NvOnnxParser.h"