Ryanunderhill/backout 5014 (#5167)

* Revert 5014
This commit is contained in:
Ryan Hill 2020-09-14 22:48:00 -07:00 committed by GitHub
parent 089789c135
commit 8fa427b264
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 142 additions and 541 deletions

View file

@ -24,7 +24,12 @@ add_library(onnxruntime_framework ${onnxruntime_framework_srcs})
if(onnxruntime_ENABLE_INSTRUMENT)
target_compile_definitions(onnxruntime_framework PRIVATE ONNXRUNTIME_ENABLE_INSTRUMENT)
endif()
if(onnxruntime_USE_TENSORRT)
# TODO: for now, core framework depends on CUDA. It should be moved to TensorRT EP
target_include_directories(onnxruntime_framework PRIVATE ${ONNXRUNTIME_ROOT} ${onnxruntime_CUDNN_HOME}/include PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
else()
target_include_directories(onnxruntime_framework PRIVATE ${ONNXRUNTIME_ROOT} PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
endif()
onnxruntime_add_include_to_target(onnxruntime_framework onnxruntime_common onnx onnx_proto protobuf::libprotobuf flatbuffers)
set_target_properties(onnxruntime_framework PROPERTIES FOLDER "ONNXRuntime")
# need onnx to build to create headers that this project includes

View file

@ -311,7 +311,6 @@ if (onnxruntime_USE_TENSORRT OR onnxruntime_USE_DNNL)
if(APPLE)
set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/shared/exported_symbols.lst")
elseif(UNIX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations")
set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds -Xlinker --gc-sections")
elseif(WIN32)
set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/shared/symbols.def")
@ -338,7 +337,6 @@ if (onnxruntime_USE_DNNL)
add_dependencies(onnxruntime_providers_dnnl onnxruntime_providers_shared project_dnnl ${onnxruntime_EXTERNAL_DEPENDENCIES})
target_include_directories(onnxruntime_providers_dnnl PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${DNNL_INCLUDE_DIR})
# ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
target_link_libraries(onnxruntime_providers_dnnl PRIVATE dnnl onnxruntime_providers_shared)
install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/dnnl DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers)
set_target_properties(onnxruntime_providers_dnnl PROPERTIES FOLDER "ONNXRuntime")
@ -376,6 +374,7 @@ if (onnxruntime_USE_TENSORRT)
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -include algorithm")
set(PROTOBUF_LIBRARY libprotobuf)
set(DISABLED_WARNINGS_FOR_TRT /wd4267 /wd4244 /wd4996 /wd4456)
endif()
if ( CMAKE_COMPILER_IS_GNUCC )
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter -Wno-missing-field-initializers")
@ -406,11 +405,6 @@ if (onnxruntime_USE_TENSORRT)
file(GLOB_RECURSE onnxruntime_providers_tensorrt_cc_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/core/providers/tensorrt/*.h"
"${ONNXRUNTIME_ROOT}/core/providers/tensorrt/*.cc"
"${ONNXRUNTIME_ROOT}/core/providers/cuda/cu_inc/common.cuh"
"${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops_impl.cuh"
"${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops.h"
"${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops_impl.cu"
"${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops_impl.h"
"${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h"
"${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc"
)
@ -424,15 +418,17 @@ if (onnxruntime_USE_TENSORRT)
else()
target_link_directories(onnxruntime_providers_tensorrt PRIVATE ${onnxruntime_CUDA_HOME}/lib64)
endif()
target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} cudart onnxruntime_providers_shared protobuf::libprotobuf flatbuffers)
target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} cudart onnxruntime_providers_shared protobuf::libprotobuf)
target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${onnxruntime_CUDNN_HOME}/include ${eigen_INCLUDE_DIRS} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
# ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/tensorrt DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers)
set_target_properties(onnxruntime_providers_tensorrt PROPERTIES LINKER_LANGUAGE CUDA)
set_target_properties(onnxruntime_providers_tensorrt PROPERTIES LINKER_LANGUAGE CXX)
set_target_properties(onnxruntime_providers_tensorrt PROPERTIES FOLDER "ONNXRuntime")
target_compile_definitions(onnxruntime_providers_tensorrt PRIVATE ONNXIFI_BUILD_LIBRARY=1)
target_compile_options(onnxruntime_providers_tensorrt PRIVATE ${DISABLED_WARNINGS_FOR_TRT})
if (WIN32)
target_compile_options(onnxruntime_providers_tensorrt INTERFACE /wd4267 /wd4244 /wd4996 /wd4456)
endif()
if(APPLE)
set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/tensorrt/exported_symbols.lst")

View file

@ -26,9 +26,6 @@ class IDataTransfer {
// batched copy. default implementation copies each entry sequentially, and returns on first failure.
virtual common::Status CopyTensors(const std::vector<SrcDstPair>& src_dst_pairs) const;
// If this is really a Provider_IDataTransfer, this returns true. Used to convert back & forth with providers efficiently
virtual bool IsProviderInterface() const { return false; }
};
class CPUDataTransfer : public IDataTransfer {

View file

@ -15,6 +15,15 @@
#include "core/session/inference_session.h"
#include "core/session/abi_session_options_impl.h"
#include "core/session/ort_apis.h"
#ifdef USE_TENSORRT
#include "core/providers/tensorrt/tensorrt_provider_factory.h"
#include "core/providers/cuda/cuda_allocator.h"
#include "core/providers/cuda/gpu_data_transfer.h"
#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
#include "core/providers/cuda/cuda_common.h"
#endif
#define PROVIDER_BRIDGE_ORT
#include "core/providers/shared_library/provider_interfaces.h"
#include "onnx/common/stl_backports.h"
@ -60,8 +69,6 @@ struct Provider_IAllocator_Impl : Provider_IAllocator {
void* Alloc(size_t size) override { return p_->Alloc(size); }
void Free(void* p) override { return p_->Free(p); }
FencePtr CreateFence(const Provider_SessionState* session_state) override { return p_->CreateFence(reinterpret_cast<const SessionState*>(session_state)); }
bool IsProviderInterface() const override { return false; }
std::unique_ptr<IAllocator> p_;
@ -74,22 +81,9 @@ struct ProviderAllocator : IAllocator {
void* Alloc(size_t size) override { return p_->Alloc(size); }
void Free(void* p) override { return p_->Free(p); }
FencePtr CreateFence(const SessionState* session_state) override { return p_->CreateFence(reinterpret_cast<const Provider_SessionState*>(session_state)); }
std::shared_ptr<Provider_IAllocator> p_;
};
struct IDataTransfer_Wrapper : IDataTransfer {
IDataTransfer_Wrapper(std::unique_ptr<Provider_IDataTransfer> p) : p_{std::move(p)} {}
bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override { return p_->CanCopy(src_device, dst_device); }
common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const override { return p_->CopyTensor(*reinterpret_cast<const Provider_Tensor*>(&src), *reinterpret_cast<Provider_Tensor*>(&dst), exec_queue_id); }
bool IsProviderInterface() const override { return true; }
std::unique_ptr<Provider_IDataTransfer> p_;
};
struct Provider_TensorShapeProto_Dimension_Iterator_Impl : Provider_TensorShapeProto_Dimension_Iterator {
Provider_TensorShapeProto_Dimension_Iterator_Impl(google::protobuf::internal::RepeatedPtrIterator<const onnx::TensorShapeProto_Dimension>&& v) : v_{std::move(v)} {}
@ -210,11 +204,7 @@ struct Provider_IExecutionProvider_Router_Impl : Provider_IExecutionProvider_Rou
}
std::unique_ptr<IDataTransfer> GetDataTransfer() const override {
auto internal = outer_->Provider_GetDataTransfer();
if (internal)
return onnxruntime::make_unique<IDataTransfer_Wrapper>(std::move(internal));
else
return nullptr;
return std::unique_ptr<IDataTransfer>(reinterpret_cast<IDataTransfer*>(outer_->Provider_GetDataTransfer().release()));
}
void Provider_InsertAllocator(Provider_AllocatorPtr allocator) override {
@ -237,8 +227,8 @@ struct ProviderHostImpl : ProviderHost {
AllocatorCreationInfo info_real{
[&info](int value) -> std::unique_ptr<IAllocator> {
auto allocator = info.factory(value);
// If the allocator is a provider interface, we need to wrap it with ProviderAllocator to turn it into an IDeviceAllocator
// Otherwise it's really a Provider_IDeviceAllocator_Impl, so we can just unwrap it to get back to the IDeviceAllocator inside
// If the allocator is a provider interface, we need to wrap it with ProviderAllocator to turn it into an IAllocator
// Otherwise it's really a Provider_IAllocator_Impl, so we can just unwrap it to get back to the IAllocator inside
if (allocator->IsProviderInterface())
return onnxruntime::make_unique<ProviderAllocator>(std::move(allocator));
else
@ -264,6 +254,31 @@ struct ProviderHostImpl : ProviderHost {
return onnxruntime::make_unique<Provider_IExecutionProvider_Router_Impl>(outer, type);
};
#ifdef USE_TENSORRT
std::unique_ptr<Provider_IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name) override {
return onnxruntime::make_unique<Provider_IAllocator_Impl>(onnxruntime::make_unique<CUDAAllocator>(device_id, name));
}
std::unique_ptr<Provider_IAllocator> CreateCUDAPinnedAllocator(int16_t device_id, const char* name) override {
return onnxruntime::make_unique<Provider_IAllocator_Impl>(onnxruntime::make_unique<CUDAPinnedAllocator>(device_id, name));
}
std::unique_ptr<Provider_IDataTransfer> CreateGPUDataTransfer() override {
return std::unique_ptr<Provider_IDataTransfer>(reinterpret_cast<Provider_IDataTransfer*>(new GPUDataTransfer()));
}
void cuda__Impl_Cast(const int64_t* input_data, int32_t* output_data, size_t count) override {
return cuda::Impl_Cast(input_data, output_data, count);
}
void cuda__Impl_Cast(const int32_t* input_data, int64_t* output_data, size_t count) override {
return cuda::Impl_Cast(input_data, output_data, count);
}
bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return CudaCall<cudaError, false>(cudaError(retCode), exprString, libName, cudaError(successCode), msg); }
bool CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return CudaCall<cudaError, true>(cudaError(retCode), exprString, libName, cudaError(successCode), msg); }
#endif
std::string GetEnvironmentVar(const std::string& var_name) override {
return Env::Default().GetEnvironmentVar(var_name);
}
@ -379,12 +394,9 @@ struct ProviderHostImpl : ProviderHost {
// Provider_DataTransferManager
Status Provider_DataTransferManager__CopyTensor(const Provider_DataTransferManager* p, const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) override { return reinterpret_cast<const DataTransferManager*>(p)->CopyTensor(*reinterpret_cast<const Tensor*>(&src), *reinterpret_cast<Tensor*>(&dst), exec_queue_id); }
const Provider_IDataTransfer* Provider_DataTransferManager__GetProviderDataTransfer(const Provider_DataTransferManager* p, const OrtDevice& src_device, const OrtDevice& dst_device) override {
auto* data_transfer = reinterpret_cast<const DataTransferManager*>(p)->GetDataTransfer(src_device, dst_device);
if (data_transfer->IsProviderInterface())
return reinterpret_cast<const IDataTransfer_Wrapper*>(data_transfer)->p_.get();
return nullptr;
}
// Provider_IDataTransfer
void Provider_IDataTransfer__operator_delete(Provider_IDataTransfer* p) override { delete reinterpret_cast<Provider_IDataTransfer*>(p); }
// Provider_IndexedSubGraph_MetaDef
std::unique_ptr<Provider_IndexedSubGraph_MetaDef> Provider_IndexedSubGraph_MetaDef__construct() override { return std::unique_ptr<Provider_IndexedSubGraph_MetaDef>(reinterpret_cast<Provider_IndexedSubGraph_MetaDef*>(new IndexedSubGraph::MetaDef())); }
@ -585,9 +597,6 @@ struct ProviderHostImpl : ProviderHost {
const Provider_DataTransferManager& Provider_OpKernelInfo__GetDataTransferManager(const Provider_OpKernelInfo* p) noexcept override { return *reinterpret_cast<const Provider_DataTransferManager*>(&reinterpret_cast<const OpKernelInfo*>(p)->GetDataTransferManager()); }
int Provider_OpKernelInfo__GetKernelDef_ExecQueueId(const Provider_OpKernelInfo* p) noexcept override { return reinterpret_cast<const OpKernelInfo*>(p)->GetKernelDef().ExecQueueId(); }
// Provider_SessionState
const Provider_DataTransferManager& Provider_SessionState__GetDataTransferManager(const Provider_SessionState* p) override { return *reinterpret_cast<const Provider_DataTransferManager*>(&reinterpret_cast<const SessionState*>(p)->GetDataTransferMgr()); }
// Provider_Tensor
float* Provider_Tensor__MutableData_float(Provider_Tensor* p) override { return reinterpret_cast<Tensor*>(p)->MutableData<float>(); }
const float* Provider_Tensor__Data_float(const Provider_Tensor* p) override { return reinterpret_cast<const Tensor*>(p)->Data<float>(); }

View file

@ -29,16 +29,16 @@ constexpr const char* DNNL_CPU = "DnnlCpu";
DNNLExecutionProvider::DNNLExecutionProvider(const DNNLExecutionProviderInfo& info)
: Provider_IExecutionProvider{onnxruntime::kDnnlExecutionProvider} {
Provider_AllocatorCreationInfo default_memory_info(
[](int) {
{[](int) {
return onnxruntime::Provider_CreateCPUAllocator(OrtMemoryInfo(DNNL, OrtAllocatorType::OrtDeviceAllocator));
},
}},
0, info.create_arena);
Provider_AllocatorCreationInfo cpu_memory_info(
[](int) {
{[](int) {
return onnxruntime::Provider_CreateCPUAllocator(OrtMemoryInfo(DNNL_CPU, OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), 0,
OrtMemTypeCPUOutput));
},
}},
0, info.create_arena);
Provider_InsertAllocator(CreateAllocator(default_memory_info));

View file

@ -88,6 +88,13 @@ constexpr const char* kMSDomain = "com.microsoft";
constexpr const char* kDnnlExecutionProvider = "DnnlExecutionProvider";
constexpr const char* kTensorrtExecutionProvider = "TensorrtExecutionProvider";
enum CUDAStreamType : int {
kCudaStreamDefault = 0,
kCudaStreamCopyIn,
kCudaStreamCopyOut,
kTotalCudaStreams,
};
class DataTypeImpl {
public:
virtual ~DataTypeImpl() = default;
@ -104,8 +111,12 @@ template <typename T>
using IAllocatorUniquePtr = std::unique_ptr<T, std::function<void(T*)>>;
std::unique_ptr<Provider_IAllocator> Provider_CreateCPUAllocator(const OrtMemoryInfo& memory_info);
std::unique_ptr<Provider_IAllocator> Provider_CreateCUDAAllocator(int16_t device_id, const char* name);
std::unique_ptr<Provider_IAllocator> Provider_CreateCUDAPinnedAllocator(int16_t device_id, const char* name);
Provider_AllocatorPtr CreateAllocator(const Provider_AllocatorCreationInfo& info);
std::unique_ptr<Provider_IDataTransfer> Provider_CreateGPUDataTransfer();
std::string GetEnvironmentVar(const std::string& var_name);
class CPUIDInfo {

View file

@ -146,6 +146,20 @@ std::unique_ptr<Provider_IAllocator> Provider_CreateCPUAllocator(const OrtMemory
return g_host->CreateCPUAllocator(info);
}
#ifdef USE_TENSORRT
std::unique_ptr<Provider_IAllocator> Provider_CreateCUDAAllocator(int16_t device_id, const char* name) {
return g_host->CreateCUDAAllocator(device_id, name);
}
std::unique_ptr<Provider_IAllocator> Provider_CreateCUDAPinnedAllocator(int16_t device_id, const char* name) {
return g_host->CreateCUDAPinnedAllocator(device_id, name);
}
std::unique_ptr<Provider_IDataTransfer> Provider_CreateGPUDataTransfer() {
return g_host->CreateGPUDataTransfer();
}
#endif
std::string GetEnvironmentVar(const std::string& var_name) {
return g_host->GetEnvironmentVar(var_name);
}

View file

@ -5,7 +5,6 @@
// In the future the internal implementations could derive from these to remove the need for the wrapper implementations
#include "core/framework/func_api.h"
#include "core/framework/fence.h" // For FencePtr
#define PROVIDER_DISALLOW_ALL(TypeName) \
TypeName() = delete; \
@ -63,7 +62,6 @@ struct Provider_NodeAttributes;
struct Provider_OpKernel_Base;
struct Provider_OpKernelContext;
struct Provider_OpKernelInfo;
struct Provider_SessionState;
struct Provider_Tensor;
class TensorShape;
@ -119,8 +117,6 @@ struct Provider_IAllocator {
virtual void Free(void* p) = 0;
const OrtMemoryInfo& Info() const { return memory_info_; };
virtual FencePtr CreateFence(const Provider_SessionState* /*session_state*/) { return nullptr; }
virtual bool IsProviderInterface() const { return true; }
template <typename T>
@ -259,17 +255,6 @@ struct Provider_IExecutionProvider {
void operator=(const Provider_IExecutionProvider&) = delete;
};
struct Provider_IDataTransfer {
Provider_IDataTransfer() = default;
virtual ~Provider_IDataTransfer() {}
virtual bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const = 0;
virtual common::Status CopyTensor(const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) const = 0;
Provider_IDataTransfer(const Provider_IDataTransfer&) = delete;
void operator=(const Provider_IDataTransfer&) = delete;
};
struct Provider {
virtual std::shared_ptr<Provider_IExecutionProviderFactory> CreateExecutionProviderFactory(int device_id) = 0;
};
@ -286,6 +271,18 @@ struct ProviderHost {
virtual std::unique_ptr<Provider_IAllocator> CreateCPUAllocator(const OrtMemoryInfo& memory_info) = 0;
#ifdef USE_TENSORRT
virtual std::unique_ptr<Provider_IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name) = 0;
virtual std::unique_ptr<Provider_IAllocator> CreateCUDAPinnedAllocator(int16_t device_id, const char* name) = 0;
virtual std::unique_ptr<Provider_IDataTransfer> CreateGPUDataTransfer() = 0;
virtual void cuda__Impl_Cast(const int64_t* input_data, int32_t* output_data, size_t count) = 0;
virtual void cuda__Impl_Cast(const int32_t* input_data, int64_t* output_data, size_t count) = 0;
virtual bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) = 0;
virtual bool CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) = 0;
#endif
virtual std::unique_ptr<Provider_IExecutionProvider_Router> Create_IExecutionProvider_Router(Provider_IExecutionProvider* outer, const std::string& type) = 0;
virtual std::string GetEnvironmentVar(const std::string& var_name) = 0;
@ -384,7 +381,9 @@ struct ProviderHost {
// Provider_DataTransferManager
virtual Status Provider_DataTransferManager__CopyTensor(const Provider_DataTransferManager* p, const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) = 0;
virtual const Provider_IDataTransfer* Provider_DataTransferManager__GetProviderDataTransfer(const Provider_DataTransferManager* p, const OrtDevice& src_device, const OrtDevice& dst_device) = 0;
// Provider_IDataTransfer
virtual void Provider_IDataTransfer__operator_delete(Provider_IDataTransfer* p) = 0;
// Provider_IndexedSubGraph_MetaDef
virtual std::unique_ptr<Provider_IndexedSubGraph_MetaDef> Provider_IndexedSubGraph_MetaDef__construct() = 0;
@ -541,9 +540,6 @@ struct ProviderHost {
virtual const Provider_DataTransferManager& Provider_OpKernelInfo__GetDataTransferManager(const Provider_OpKernelInfo* p) noexcept = 0;
virtual int Provider_OpKernelInfo__GetKernelDef_ExecQueueId(const Provider_OpKernelInfo* p) noexcept = 0;
// Provider_SessionState
virtual const Provider_DataTransferManager& Provider_SessionState__GetDataTransferManager(const Provider_SessionState* p) = 0;
// Provider_Tensor
virtual float* Provider_Tensor__MutableData_float(Provider_Tensor* p) = 0;
virtual const float* Provider_Tensor__Data_float(const Provider_Tensor* p) = 0;
@ -686,11 +682,18 @@ struct Provider_ComputeCapability {
struct Provider_DataTransferManager {
Status CopyTensor(const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) const { return g_host->Provider_DataTransferManager__CopyTensor(this, src, dst, exec_queue_id); }
const Provider_IDataTransfer* GetProviderDataTransfer(const OrtDevice& src_device, const OrtDevice& dst_device) const { return g_host->Provider_DataTransferManager__GetProviderDataTransfer(this, src_device, dst_device); }
PROVIDER_DISALLOW_ALL(Provider_DataTransferManager)
};
struct Provider_IDataTransfer {
static void operator delete(void* p) { g_host->Provider_IDataTransfer__operator_delete(reinterpret_cast<Provider_IDataTransfer*>(p)); }
Provider_IDataTransfer() = delete;
Provider_IDataTransfer(const Provider_IDataTransfer&) = delete;
void operator=(const Provider_IDataTransfer&) = delete;
};
struct Provider_IndexedSubGraph_MetaDef {
static std::unique_ptr<Provider_IndexedSubGraph_MetaDef> Create() { return g_host->Provider_IndexedSubGraph_MetaDef__construct(); }
static void operator delete(void* p) { g_host->Provider_IndexedSubGraph_MetaDef__operator_delete(reinterpret_cast<Provider_IndexedSubGraph_MetaDef*>(p)); }
@ -1015,10 +1018,6 @@ inline Status Provider_OpKernelInfo::GetAttr<float>(const std::string& name, flo
return GetAttr(name, value);
}
struct Provider_SessionState {
const Provider_DataTransferManager& GetDataTransferManager() const { return g_host->Provider_SessionState__GetDataTransferManager(this); }
};
struct Provider_Tensor {
float* MutableData_float() { return g_host->Provider_Tensor__MutableData_float(this); }
const float* Data_float() const { return g_host->Provider_Tensor__Data_float(this); }

View file

@ -1,146 +0,0 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/cuda/shared_inc/cuda_call.h"
#ifdef _WIN32
#else // POSIX
#include <unistd.h>
#include <string.h>
#endif
namespace onnxruntime {
using namespace common;
template <typename ERRTYPE>
const char* CudaErrString(ERRTYPE x) {
ORT_NOT_IMPLEMENTED();
}
#define CASE_ENUM_TO_STR(x) \
case x: \
return #x
template <>
const char* CudaErrString<cudaError_t>(cudaError_t x) {
cudaDeviceSynchronize();
return cudaGetErrorString(x);
}
template <>
const char* CudaErrString<cublasStatus_t>(cublasStatus_t e) {
cudaDeviceSynchronize();
switch (e) {
CASE_ENUM_TO_STR(CUBLAS_STATUS_SUCCESS);
CASE_ENUM_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED);
CASE_ENUM_TO_STR(CUBLAS_STATUS_ALLOC_FAILED);
CASE_ENUM_TO_STR(CUBLAS_STATUS_INVALID_VALUE);
CASE_ENUM_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH);
CASE_ENUM_TO_STR(CUBLAS_STATUS_MAPPING_ERROR);
CASE_ENUM_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED);
CASE_ENUM_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR);
CASE_ENUM_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED);
CASE_ENUM_TO_STR(CUBLAS_STATUS_LICENSE_ERROR);
default:
return "(look for CUBLAS_STATUS_xxx in cublas_api.h)";
}
}
template <>
const char* CudaErrString<curandStatus>(curandStatus) {
cudaDeviceSynchronize();
return "(see curand.h & look for curandStatus or CURAND_STATUS_xxx)";
}
template <>
const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t e) {
cudaDeviceSynchronize();
return cudnnGetErrorString(e);
}
template <>
const char* CudaErrString<cufftResult>(cufftResult e) {
cudaDeviceSynchronize();
switch (e) {
CASE_ENUM_TO_STR(CUFFT_SUCCESS);
CASE_ENUM_TO_STR(CUFFT_ALLOC_FAILED);
CASE_ENUM_TO_STR(CUFFT_INVALID_VALUE);
CASE_ENUM_TO_STR(CUFFT_INTERNAL_ERROR);
CASE_ENUM_TO_STR(CUFFT_SETUP_FAILED);
CASE_ENUM_TO_STR(CUFFT_INVALID_SIZE);
default:
return "Unknown cufft error status";
}
}
#ifdef USE_NCCL
template <>
const char* CudaErrString<ncclResult_t>(ncclResult_t e) {
cudaDeviceSynchronize();
return ncclGetErrorString(e);
}
#endif
template <typename ERRTYPE, bool THRW>
bool CudaCall(ERRTYPE retCode, const char* exprString, const char* libName, ERRTYPE successCode, const char* msg) {
if (retCode != successCode) {
try {
#ifdef _WIN32
auto del = [](char* p) { free(p); };
std::unique_ptr<char, decltype(del)> hostname_ptr(nullptr, del);
size_t hostname_len = 0;
char* hostname = nullptr;
if (-1 == _dupenv_s(&hostname, &hostname_len, "COMPUTERNAME"))
hostname = "?";
else
hostname_ptr.reset(hostname);
#else
char hostname[HOST_NAME_MAX];
if (gethostname(hostname, HOST_NAME_MAX) != 0)
strcpy(hostname, "?");
#endif
int currentCudaDevice;
cudaGetDevice(&currentCudaDevice);
cudaGetLastError(); // clear last CUDA error
static char str[1024];
snprintf(str, 1024, "%s failure %d: %s ; GPU=%d ; hostname=%s ; expr=%s; %s",
libName, (int)retCode, CudaErrString(retCode), currentCudaDevice,
hostname,
exprString, msg);
if (THRW) {
// throw an exception with the error info
ORT_THROW(str);
} else {
LOGS_DEFAULT(ERROR) << str;
}
} catch (const std::exception& e) { // catch, log, and rethrow since CUDA code sometimes hangs in destruction, so we'd never get to see the error
if (THRW) {
ORT_THROW(e.what());
} else {
LOGS_DEFAULT(ERROR) << e.what();
}
}
return false;
}
return true;
}
template bool CudaCall<cudaError, false>(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg);
template bool CudaCall<cudaError, true>(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg);
template bool CudaCall<cublasStatus_t, false>(cublasStatus_t retCode, const char* exprString, const char* libName, cublasStatus_t successCode, const char* msg);
template bool CudaCall<cublasStatus_t, true>(cublasStatus_t retCode, const char* exprString, const char* libName, cublasStatus_t successCode, const char* msg);
template bool CudaCall<cudnnStatus_t, false>(cudnnStatus_t retCode, const char* exprString, const char* libName, cudnnStatus_t successCode, const char* msg);
template bool CudaCall<cudnnStatus_t, true>(cudnnStatus_t retCode, const char* exprString, const char* libName, cudnnStatus_t successCode, const char* msg);
template bool CudaCall<curandStatus_t, false>(curandStatus_t retCode, const char* exprString, const char* libName, curandStatus_t successCode, const char* msg);
template bool CudaCall<curandStatus_t, true>(curandStatus_t retCode, const char* exprString, const char* libName, curandStatus_t successCode, const char* msg);
template bool CudaCall<cufftResult, false>(cufftResult retCode, const char* exprString, const char* libName, cufftResult successCode, const char* msg);
template bool CudaCall<cufftResult, true>(cufftResult retCode, const char* exprString, const char* libName, cufftResult successCode, const char* msg);
#ifdef USE_NCCL
template bool CudaCall<ncclResult_t, false>(ncclResult_t retCode, const char* exprString, const char* libName, ncclResult_t successCode, const char* msg);
#endif
} // namespace onnxruntime

View file

@ -1,70 +0,0 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/cuda/shared_inc/cuda_call.h"
#include "cuda_allocator.h"
#include "cuda_fence.h"
#include "gpu_data_transfer.h"
namespace onnxruntime {
static const GPUDataTransfer* GetGPUDataTransfer(const Provider_SessionState* session_state) {
OrtDevice gpu_device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0);
OrtDevice cpu_device;
return static_cast<const GPUDataTransfer*>(session_state->GetDataTransferManager().GetProviderDataTransfer(gpu_device, cpu_device));
}
void CUDAAllocator::CheckDevice(bool throw_when_fail) const {
#ifndef NDEBUG
// check device to match at debug build
// if it's expected to change, call cudaSetDevice instead of the check
int current_device;
auto cuda_err = cudaGetDevice(&current_device);
if (cuda_err == cudaSuccess) {
ORT_ENFORCE(current_device == Info().id);
} else if (throw_when_fail) {
CUDA_CALL_THROW(cuda_err);
}
#else
ORT_UNUSED_PARAMETER(throw_when_fail);
#endif
}
void* CUDAAllocator::Alloc(size_t size) {
CheckDevice(true);
void* p = nullptr;
if (size > 0) {
//BFCArena was updated recently to handle the exception and adjust the request size
CUDA_CALL_THROW(cudaMalloc((void**)&p, size));
}
return p;
}
void CUDAAllocator::Free(void* p) {
CheckDevice(false); // ignore CUDA failure when free
cudaFree(p); // do not throw error since it's OK for cudaFree to fail during shutdown
}
FencePtr CUDAAllocator::CreateFence(const Provider_SessionState* session_state) {
return std::make_shared<CUDAFence>(GetGPUDataTransfer(session_state));
}
void* CUDAPinnedAllocator::Alloc(size_t size) {
void* p = nullptr;
if (size > 0) {
CUDA_CALL_THROW(cudaMallocHost((void**)&p, size));
}
return p;
}
void CUDAPinnedAllocator::Free(void* p) {
CUDA_CALL_THROW(cudaFreeHost(p));
}
FencePtr CUDAPinnedAllocator::CreateFence(const Provider_SessionState* session_state) {
return std::make_shared<CUDAFence>(GetGPUDataTransfer(session_state));
}
} // namespace onnxruntime

View file

@ -1,36 +0,0 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
namespace onnxruntime {
class CUDAAllocator : public Provider_IAllocator {
public:
CUDAAllocator(OrtDevice::DeviceId device_id, const char* name)
: Provider_IAllocator(
OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator,
OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, device_id),
device_id, OrtMemTypeDefault)) {}
void* Alloc(size_t size) override;
void Free(void* p) override;
FencePtr CreateFence(const Provider_SessionState* session_state) override;
private:
void CheckDevice(bool throw_when_fail) const;
};
//TODO: add a default constructor
class CUDAPinnedAllocator : public Provider_IAllocator {
public:
CUDAPinnedAllocator(OrtDevice::DeviceId device_id, const char* name)
: Provider_IAllocator(
OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator,
OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, device_id),
device_id, OrtMemTypeCPUOutput)) {}
void* Alloc(size_t size) override;
void Free(void* p) override;
FencePtr CreateFence(const Provider_SessionState* session_state) override;
};
} // namespace onnxruntime

View file

@ -1,68 +0,0 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/cuda/shared_inc/cuda_call.h"
#include "cuda_allocator.h"
#include "cuda_fence.h"
#include "gpu_data_transfer.h"
namespace onnxruntime {
constexpr const char* kCudaExecutionProvider = "CUDAExecutionProvider";
CUDAFence::CUDAFence(const GPUDataTransfer* data_transfer) : data_transfer_(data_transfer) {
// NOTE: cudaEventBlockingSync may leads to longer wait time because of thread yield/switching in kernel
// if lower CPU usage is more important than latency, we should use this flag to avoid spin-loop in WaitOnCPU
int event_flags = /*cudaEventBlockingSync |*/ cudaEventDisableTiming;
CUDA_CALL_THROW(cudaEventCreate(&read_event_, event_flags));
CUDA_CALL_THROW(cudaEventCreate(&write_event_, event_flags));
}
CUDAFence::~CUDAFence() {
CUDA_CALL_THROW(cudaEventDestroy(read_event_));
CUDA_CALL_THROW(cudaEventDestroy(write_event_));
}
void CUDAFence::BeforeUsingAsInput(onnxruntime::ProviderType provider_type, int async_queue_id) {
if (provider_type == onnxruntime::kCudaExecutionProvider) {
// sync in GPU, the call is non-blocking on CPU
CUDA_CALL_THROW(cudaStreamWaitEvent(data_transfer_->GetStream(async_queue_id), write_event_, 0));
} else {
// sync on CPU for all other providers, this is blocking
CUDA_CALL_THROW(cudaEventSynchronize(write_event_));
}
}
void CUDAFence::BeforeUsingAsOutput(onnxruntime::ProviderType provider_type, int queue_id) {
if (provider_type == onnxruntime::kCudaExecutionProvider) {
// sync in GPU, the call is non-blocking on CPU
cudaStream_t stream = data_transfer_->GetStream(queue_id);
CUDA_CALL_THROW(cudaStreamWaitEvent(stream, read_event_, 0));
CUDA_CALL_THROW(cudaStreamWaitEvent(stream, write_event_, 0));
} else {
// sync on CPU for all other providers, this is blocking
CUDA_CALL_THROW(cudaEventSynchronize(read_event_));
CUDA_CALL_THROW(cudaEventSynchronize(write_event_));
}
}
bool CUDAFence::CanRelease() {
return cudaEventQuery(read_event_) == cudaSuccess &&
cudaEventQuery(write_event_) == cudaSuccess;
}
void CUDAFence::AfterUsedAsInput(int queue_id) {
// update read fence
cudaStream_t stream = data_transfer_->GetStream(queue_id);
CUDA_CALL_THROW(cudaEventRecord(read_event_, stream));
}
void CUDAFence::AfterUsedAsOutput(int queue_id) {
// update write fence
cudaStream_t stream = data_transfer_->GetStream(queue_id);
CUDA_CALL_THROW(cudaEventRecord(write_event_, stream));
}
} // namespace onnxruntime

View file

@ -1,25 +0,0 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
namespace onnxruntime {
class GPUDataTransfer;
class CUDAFence : public IFence {
public:
CUDAFence(const GPUDataTransfer* data_transfer);
virtual ~CUDAFence();
virtual void BeforeUsingAsInput(onnxruntime::ProviderType provider_type, int queue_id) override;
virtual void BeforeUsingAsOutput(onnxruntime::ProviderType provider_type, int queue_id) override;
virtual void AfterUsedAsInput(int queue_id) override;
virtual void AfterUsedAsOutput(int queue_id) override;
virtual bool CanRelease() override;
private:
cudaEvent_t read_event_;
cudaEvent_t write_event_;
const GPUDataTransfer* data_transfer_;
};
} // namespace onnxruntime

View file

@ -1,68 +0,0 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/cuda/shared_inc/cuda_call.h"
#include "gpu_data_transfer.h"
#define CUDA_RETURN_IF_ERROR(expr) \
ORT_RETURN_IF_ERROR(CUDA_CALL(expr) \
? common::Status::OK() \
: ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "CUDA error executing ", #expr))
namespace onnxruntime {
GPUDataTransfer::GPUDataTransfer() {
// create streams, default is nullptr
streams_[kCudaStreamDefault] = nullptr;
CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyIn], cudaStreamNonBlocking));
CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyOut], cudaStreamNonBlocking));
}
GPUDataTransfer::~GPUDataTransfer() {
CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyIn]));
CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyOut]));
}
bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::CUDA_PINNED || dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED;
}
common::Status GPUDataTransfer::CopyTensor(const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) const {
size_t bytes = src.SizeInBytes();
const void* src_data = src.DataRaw();
void* dst_data = dst.MutableDataRaw();
auto& src_device = src.Location().device;
auto& dst_device = dst.Location().device;
if (dst_device.Type() == OrtDevice::GPU) {
if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
// copy from pinned memory to GPU, this is non-blocking
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, streams_[exec_queue_id]));
} else if (src_device.Type() == OrtDevice::GPU) {
// copying between GPU, this is non-blocking
// Copy only if the two addresses are different.
if (dst_data != src_data) {
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice, streams_[kCudaStreamDefault]));
}
} else {
// copy from other CPU memory to GPU, this is blocking
CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyHostToDevice));
}
} else if (src_device.Type() == OrtDevice::GPU) {
if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
// copying from GPU to pinned memory, this is non-blocking
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, streams_[exec_queue_id]));
} else {
// copying from GPU to CPU memory, this is blocking
CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToHost));
}
} else {
// copying between cpu memory
memcpy(dst_data, src_data, bytes);
}
return Status::OK();
}
} // namespace onnxruntime

View file

@ -1,35 +0,0 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
namespace onnxruntime {
enum CUDAStreamType : int {
kCudaStreamDefault = 0,
kCudaStreamCopyIn,
kCudaStreamCopyOut,
kTotalCudaStreams,
};
class GPUDataTransfer : public Provider_IDataTransfer {
public:
GPUDataTransfer();
~GPUDataTransfer();
bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;
// Suppress MSVC warning about not fully overriding
using Provider_IDataTransfer::CopyTensor;
common::Status CopyTensor(const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) const override;
cudaStream_t GetStream(int queue_id) const {
ORT_ENFORCE(queue_id >= 0 && queue_id < kTotalCudaStreams);
return streams_[queue_id];
}
private:
cudaStream_t streams_[kTotalCudaStreams];
};
} // namespace onnxruntime

View file

@ -2,25 +2,25 @@
// Licensed under the MIT License.
#include <fstream>
#include <limits>
#include <list>
#include <map>
#include <memory>
#include <unordered_set>
#include <unordered_map>
#include <utility>
#include "core/providers/shared_library/provider_api.h"
#define ORT_API_MANUAL_INIT
#include "core/session/onnxruntime_cxx_api.h"
#include "core/common/safeint.h"
#include "tensorrt_execution_provider.h"
#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
#include "core/providers/cuda/cuda_allocator.h"
#include "core/providers/cuda/shared_inc/cuda_call.h"
#include "gpu_data_transfer.h"
#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
#include "cuda_runtime_api.h"
#include "gsl/gsl"
#include <experimental/filesystem>
#include "cuda_allocator.h"
#include <unordered_map>
#include <utility>
#include <limits>
#include <map>
#include <memory>
#define CUDA_RETURN_IF_ERROR(expr) \
ORT_RETURN_IF_ERROR(CUDA_CALL(expr) \
@ -69,6 +69,33 @@ struct ShutdownProtobuf {
namespace onnxruntime {
namespace cuda {
template <>
void Impl_Cast(
const int64_t* input_data, int32_t* output_data,
size_t count) {
return g_host->cuda__Impl_Cast(input_data, output_data, count);
}
template <>
void Impl_Cast(
const int32_t* input_data, int64_t* output_data,
size_t count) {
return g_host->cuda__Impl_Cast(input_data, output_data, count);
}
} // namespace cuda
template <>
bool CudaCall<cudaError, false>(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg) {
return g_host->CudaCall_false(retCode, exprString, libName, successCode, msg);
}
template <>
bool CudaCall<cudaError, true>(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg) {
return g_host->CudaCall_true(retCode, exprString, libName, successCode, msg);
}
constexpr const char* TRT = "Tensorrt";
constexpr const char* TRT_PINNED = "TensorrtPinned";
@ -131,7 +158,7 @@ KernelRegistryAndStatus GetTensorrtKernelRegistry() {
}
std::shared_ptr<Provider_KernelRegistry> TensorrtExecutionProvider::Provider_GetKernelRegistry() const {
static KernelRegistryAndStatus k = GetTensorrtKernelRegistry();
static KernelRegistryAndStatus k = onnxruntime::GetTensorrtKernelRegistry();
// throw if the registry failed to initialize
ORT_THROW_IF_ERROR(k.st);
return k.kernel_registry;
@ -148,12 +175,12 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
CUDA_CALL_THROW(cudaSetDevice(device_id_));
Provider_AllocatorCreationInfo default_memory_info(
[](int id) { return onnxruntime::make_unique<CUDAAllocator>(id, TRT); }, device_id_);
[](int id) { return Provider_CreateCUDAAllocator(id, TRT); }, device_id_);
allocator_ = CreateAllocator(default_memory_info);
Provider_InsertAllocator(allocator_);
Provider_AllocatorCreationInfo pinned_allocator_info(
[](int) { return onnxruntime::make_unique<CUDAPinnedAllocator>(0, TRT_PINNED); }, device_id_);
[](int) { return Provider_CreateCUDAPinnedAllocator(0, TRT_PINNED); }, device_id_);
Provider_InsertAllocator(CreateAllocator(pinned_allocator_info));
// Get environment variables
@ -209,7 +236,7 @@ Provider_AllocatorPtr TensorrtExecutionProvider::Provider_GetAllocator(int id, O
}
std::unique_ptr<onnxruntime::Provider_IDataTransfer> TensorrtExecutionProvider::Provider_GetDataTransfer() const {
return onnxruntime::make_unique<GPUDataTransfer>();
return onnxruntime::Provider_CreateGPUDataTransfer();
}
// Convert GraphViewer graph to GraphProto
@ -1345,5 +1372,4 @@ common::Status TensorrtExecutionProvider::Provider_Compile(const std::vector<onn
return Status::OK();
}
} // namespace onnxruntime

View file

@ -2,14 +2,6 @@
// Licensed under the MIT License.
#pragma once
#ifdef _WIN32
#pragma warning(disable : 4244) // '=': conversion from '_Ty2' to '_Ty', possible loss of data
#pragma warning(disable : 4267) // 'initializing': conversion from '<type1>' to '<type2>', possible loss of data
#pragma warning(disable : 4996) // 'gmtime': This function or variable may be unsafe. Consider using gmtime_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
#pragma warning(disable : 4456) // declaration of 'i' hides previous local declaration
#endif
#include <ctime>
#include "NvInfer.h"
#include "NvOnnxParser.h"