mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-21 02:18:09 +00:00
parent
089789c135
commit
8fa427b264
17 changed files with 142 additions and 541 deletions
|
|
@ -24,7 +24,12 @@ add_library(onnxruntime_framework ${onnxruntime_framework_srcs})
|
|||
if(onnxruntime_ENABLE_INSTRUMENT)
|
||||
target_compile_definitions(onnxruntime_framework PRIVATE ONNXRUNTIME_ENABLE_INSTRUMENT)
|
||||
endif()
|
||||
if(onnxruntime_USE_TENSORRT)
|
||||
# TODO: for now, core framework depends on CUDA. It should be moved to TensorRT EP
|
||||
target_include_directories(onnxruntime_framework PRIVATE ${ONNXRUNTIME_ROOT} ${onnxruntime_CUDNN_HOME}/include PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
|
||||
else()
|
||||
target_include_directories(onnxruntime_framework PRIVATE ${ONNXRUNTIME_ROOT} PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
|
||||
endif()
|
||||
onnxruntime_add_include_to_target(onnxruntime_framework onnxruntime_common onnx onnx_proto protobuf::libprotobuf flatbuffers)
|
||||
set_target_properties(onnxruntime_framework PROPERTIES FOLDER "ONNXRuntime")
|
||||
# need onnx to build to create headers that this project includes
|
||||
|
|
|
|||
|
|
@ -311,7 +311,6 @@ if (onnxruntime_USE_TENSORRT OR onnxruntime_USE_DNNL)
|
|||
if(APPLE)
|
||||
set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/shared/exported_symbols.lst")
|
||||
elseif(UNIX)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations")
|
||||
set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds -Xlinker --gc-sections")
|
||||
elseif(WIN32)
|
||||
set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/shared/symbols.def")
|
||||
|
|
@ -338,7 +337,6 @@ if (onnxruntime_USE_DNNL)
|
|||
add_dependencies(onnxruntime_providers_dnnl onnxruntime_providers_shared project_dnnl ${onnxruntime_EXTERNAL_DEPENDENCIES})
|
||||
target_include_directories(onnxruntime_providers_dnnl PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${DNNL_INCLUDE_DIR})
|
||||
# ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
|
||||
|
||||
target_link_libraries(onnxruntime_providers_dnnl PRIVATE dnnl onnxruntime_providers_shared)
|
||||
install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/dnnl DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers)
|
||||
set_target_properties(onnxruntime_providers_dnnl PROPERTIES FOLDER "ONNXRuntime")
|
||||
|
|
@ -376,6 +374,7 @@ if (onnxruntime_USE_TENSORRT)
|
|||
endif()
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -include algorithm")
|
||||
set(PROTOBUF_LIBRARY libprotobuf)
|
||||
set(DISABLED_WARNINGS_FOR_TRT /wd4267 /wd4244 /wd4996 /wd4456)
|
||||
endif()
|
||||
if ( CMAKE_COMPILER_IS_GNUCC )
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter -Wno-missing-field-initializers")
|
||||
|
|
@ -406,11 +405,6 @@ if (onnxruntime_USE_TENSORRT)
|
|||
file(GLOB_RECURSE onnxruntime_providers_tensorrt_cc_srcs CONFIGURE_DEPENDS
|
||||
"${ONNXRUNTIME_ROOT}/core/providers/tensorrt/*.h"
|
||||
"${ONNXRUNTIME_ROOT}/core/providers/tensorrt/*.cc"
|
||||
"${ONNXRUNTIME_ROOT}/core/providers/cuda/cu_inc/common.cuh"
|
||||
"${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops_impl.cuh"
|
||||
"${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops.h"
|
||||
"${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops_impl.cu"
|
||||
"${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops_impl.h"
|
||||
"${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h"
|
||||
"${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc"
|
||||
)
|
||||
|
|
@ -424,15 +418,17 @@ if (onnxruntime_USE_TENSORRT)
|
|||
else()
|
||||
target_link_directories(onnxruntime_providers_tensorrt PRIVATE ${onnxruntime_CUDA_HOME}/lib64)
|
||||
endif()
|
||||
target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} cudart onnxruntime_providers_shared protobuf::libprotobuf flatbuffers)
|
||||
target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} cudart onnxruntime_providers_shared protobuf::libprotobuf)
|
||||
target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${onnxruntime_CUDNN_HOME}/include ${eigen_INCLUDE_DIRS} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
|
||||
# ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
|
||||
|
||||
install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/tensorrt DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers)
|
||||
set_target_properties(onnxruntime_providers_tensorrt PROPERTIES LINKER_LANGUAGE CUDA)
|
||||
set_target_properties(onnxruntime_providers_tensorrt PROPERTIES LINKER_LANGUAGE CXX)
|
||||
set_target_properties(onnxruntime_providers_tensorrt PROPERTIES FOLDER "ONNXRuntime")
|
||||
target_compile_definitions(onnxruntime_providers_tensorrt PRIVATE ONNXIFI_BUILD_LIBRARY=1)
|
||||
target_compile_options(onnxruntime_providers_tensorrt PRIVATE ${DISABLED_WARNINGS_FOR_TRT})
|
||||
if (WIN32)
|
||||
target_compile_options(onnxruntime_providers_tensorrt INTERFACE /wd4267 /wd4244 /wd4996 /wd4456)
|
||||
endif()
|
||||
|
||||
if(APPLE)
|
||||
set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/tensorrt/exported_symbols.lst")
|
||||
|
|
|
|||
|
|
@ -26,9 +26,6 @@ class IDataTransfer {
|
|||
|
||||
// batched copy. default implementation copies each entry sequentially, and returns on first failure.
|
||||
virtual common::Status CopyTensors(const std::vector<SrcDstPair>& src_dst_pairs) const;
|
||||
|
||||
// If this is really a Provider_IDataTransfer, this returns true. Used to convert back & forth with providers efficiently
|
||||
virtual bool IsProviderInterface() const { return false; }
|
||||
};
|
||||
|
||||
class CPUDataTransfer : public IDataTransfer {
|
||||
|
|
|
|||
|
|
@ -15,6 +15,15 @@
|
|||
#include "core/session/inference_session.h"
|
||||
#include "core/session/abi_session_options_impl.h"
|
||||
#include "core/session/ort_apis.h"
|
||||
|
||||
#ifdef USE_TENSORRT
|
||||
#include "core/providers/tensorrt/tensorrt_provider_factory.h"
|
||||
#include "core/providers/cuda/cuda_allocator.h"
|
||||
#include "core/providers/cuda/gpu_data_transfer.h"
|
||||
#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
|
||||
#include "core/providers/cuda/cuda_common.h"
|
||||
#endif
|
||||
|
||||
#define PROVIDER_BRIDGE_ORT
|
||||
#include "core/providers/shared_library/provider_interfaces.h"
|
||||
#include "onnx/common/stl_backports.h"
|
||||
|
|
@ -60,8 +69,6 @@ struct Provider_IAllocator_Impl : Provider_IAllocator {
|
|||
void* Alloc(size_t size) override { return p_->Alloc(size); }
|
||||
void Free(void* p) override { return p_->Free(p); }
|
||||
|
||||
FencePtr CreateFence(const Provider_SessionState* session_state) override { return p_->CreateFence(reinterpret_cast<const SessionState*>(session_state)); }
|
||||
|
||||
bool IsProviderInterface() const override { return false; }
|
||||
|
||||
std::unique_ptr<IAllocator> p_;
|
||||
|
|
@ -74,22 +81,9 @@ struct ProviderAllocator : IAllocator {
|
|||
void* Alloc(size_t size) override { return p_->Alloc(size); }
|
||||
void Free(void* p) override { return p_->Free(p); }
|
||||
|
||||
FencePtr CreateFence(const SessionState* session_state) override { return p_->CreateFence(reinterpret_cast<const Provider_SessionState*>(session_state)); }
|
||||
|
||||
std::shared_ptr<Provider_IAllocator> p_;
|
||||
};
|
||||
|
||||
struct IDataTransfer_Wrapper : IDataTransfer {
|
||||
IDataTransfer_Wrapper(std::unique_ptr<Provider_IDataTransfer> p) : p_{std::move(p)} {}
|
||||
|
||||
bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override { return p_->CanCopy(src_device, dst_device); }
|
||||
common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const override { return p_->CopyTensor(*reinterpret_cast<const Provider_Tensor*>(&src), *reinterpret_cast<Provider_Tensor*>(&dst), exec_queue_id); }
|
||||
|
||||
bool IsProviderInterface() const override { return true; }
|
||||
|
||||
std::unique_ptr<Provider_IDataTransfer> p_;
|
||||
};
|
||||
|
||||
struct Provider_TensorShapeProto_Dimension_Iterator_Impl : Provider_TensorShapeProto_Dimension_Iterator {
|
||||
Provider_TensorShapeProto_Dimension_Iterator_Impl(google::protobuf::internal::RepeatedPtrIterator<const onnx::TensorShapeProto_Dimension>&& v) : v_{std::move(v)} {}
|
||||
|
||||
|
|
@ -210,11 +204,7 @@ struct Provider_IExecutionProvider_Router_Impl : Provider_IExecutionProvider_Rou
|
|||
}
|
||||
|
||||
std::unique_ptr<IDataTransfer> GetDataTransfer() const override {
|
||||
auto internal = outer_->Provider_GetDataTransfer();
|
||||
if (internal)
|
||||
return onnxruntime::make_unique<IDataTransfer_Wrapper>(std::move(internal));
|
||||
else
|
||||
return nullptr;
|
||||
return std::unique_ptr<IDataTransfer>(reinterpret_cast<IDataTransfer*>(outer_->Provider_GetDataTransfer().release()));
|
||||
}
|
||||
|
||||
void Provider_InsertAllocator(Provider_AllocatorPtr allocator) override {
|
||||
|
|
@ -237,8 +227,8 @@ struct ProviderHostImpl : ProviderHost {
|
|||
AllocatorCreationInfo info_real{
|
||||
[&info](int value) -> std::unique_ptr<IAllocator> {
|
||||
auto allocator = info.factory(value);
|
||||
// If the allocator is a provider interface, we need to wrap it with ProviderAllocator to turn it into an IDeviceAllocator
|
||||
// Otherwise it's really a Provider_IDeviceAllocator_Impl, so we can just unwrap it to get back to the IDeviceAllocator inside
|
||||
// If the allocator is a provider interface, we need to wrap it with ProviderAllocator to turn it into an IAllocator
|
||||
// Otherwise it's really a Provider_IAllocator_Impl, so we can just unwrap it to get back to the IAllocator inside
|
||||
if (allocator->IsProviderInterface())
|
||||
return onnxruntime::make_unique<ProviderAllocator>(std::move(allocator));
|
||||
else
|
||||
|
|
@ -264,6 +254,31 @@ struct ProviderHostImpl : ProviderHost {
|
|||
return onnxruntime::make_unique<Provider_IExecutionProvider_Router_Impl>(outer, type);
|
||||
};
|
||||
|
||||
#ifdef USE_TENSORRT
|
||||
std::unique_ptr<Provider_IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name) override {
|
||||
return onnxruntime::make_unique<Provider_IAllocator_Impl>(onnxruntime::make_unique<CUDAAllocator>(device_id, name));
|
||||
}
|
||||
|
||||
std::unique_ptr<Provider_IAllocator> CreateCUDAPinnedAllocator(int16_t device_id, const char* name) override {
|
||||
return onnxruntime::make_unique<Provider_IAllocator_Impl>(onnxruntime::make_unique<CUDAPinnedAllocator>(device_id, name));
|
||||
}
|
||||
|
||||
std::unique_ptr<Provider_IDataTransfer> CreateGPUDataTransfer() override {
|
||||
return std::unique_ptr<Provider_IDataTransfer>(reinterpret_cast<Provider_IDataTransfer*>(new GPUDataTransfer()));
|
||||
}
|
||||
|
||||
void cuda__Impl_Cast(const int64_t* input_data, int32_t* output_data, size_t count) override {
|
||||
return cuda::Impl_Cast(input_data, output_data, count);
|
||||
}
|
||||
|
||||
void cuda__Impl_Cast(const int32_t* input_data, int64_t* output_data, size_t count) override {
|
||||
return cuda::Impl_Cast(input_data, output_data, count);
|
||||
}
|
||||
|
||||
bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return CudaCall<cudaError, false>(cudaError(retCode), exprString, libName, cudaError(successCode), msg); }
|
||||
bool CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return CudaCall<cudaError, true>(cudaError(retCode), exprString, libName, cudaError(successCode), msg); }
|
||||
#endif
|
||||
|
||||
std::string GetEnvironmentVar(const std::string& var_name) override {
|
||||
return Env::Default().GetEnvironmentVar(var_name);
|
||||
}
|
||||
|
|
@ -379,12 +394,9 @@ struct ProviderHostImpl : ProviderHost {
|
|||
|
||||
// Provider_DataTransferManager
|
||||
Status Provider_DataTransferManager__CopyTensor(const Provider_DataTransferManager* p, const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) override { return reinterpret_cast<const DataTransferManager*>(p)->CopyTensor(*reinterpret_cast<const Tensor*>(&src), *reinterpret_cast<Tensor*>(&dst), exec_queue_id); }
|
||||
const Provider_IDataTransfer* Provider_DataTransferManager__GetProviderDataTransfer(const Provider_DataTransferManager* p, const OrtDevice& src_device, const OrtDevice& dst_device) override {
|
||||
auto* data_transfer = reinterpret_cast<const DataTransferManager*>(p)->GetDataTransfer(src_device, dst_device);
|
||||
if (data_transfer->IsProviderInterface())
|
||||
return reinterpret_cast<const IDataTransfer_Wrapper*>(data_transfer)->p_.get();
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Provider_IDataTransfer
|
||||
void Provider_IDataTransfer__operator_delete(Provider_IDataTransfer* p) override { delete reinterpret_cast<Provider_IDataTransfer*>(p); }
|
||||
|
||||
// Provider_IndexedSubGraph_MetaDef
|
||||
std::unique_ptr<Provider_IndexedSubGraph_MetaDef> Provider_IndexedSubGraph_MetaDef__construct() override { return std::unique_ptr<Provider_IndexedSubGraph_MetaDef>(reinterpret_cast<Provider_IndexedSubGraph_MetaDef*>(new IndexedSubGraph::MetaDef())); }
|
||||
|
|
@ -585,9 +597,6 @@ struct ProviderHostImpl : ProviderHost {
|
|||
const Provider_DataTransferManager& Provider_OpKernelInfo__GetDataTransferManager(const Provider_OpKernelInfo* p) noexcept override { return *reinterpret_cast<const Provider_DataTransferManager*>(&reinterpret_cast<const OpKernelInfo*>(p)->GetDataTransferManager()); }
|
||||
int Provider_OpKernelInfo__GetKernelDef_ExecQueueId(const Provider_OpKernelInfo* p) noexcept override { return reinterpret_cast<const OpKernelInfo*>(p)->GetKernelDef().ExecQueueId(); }
|
||||
|
||||
// Provider_SessionState
|
||||
const Provider_DataTransferManager& Provider_SessionState__GetDataTransferManager(const Provider_SessionState* p) override { return *reinterpret_cast<const Provider_DataTransferManager*>(&reinterpret_cast<const SessionState*>(p)->GetDataTransferMgr()); }
|
||||
|
||||
// Provider_Tensor
|
||||
float* Provider_Tensor__MutableData_float(Provider_Tensor* p) override { return reinterpret_cast<Tensor*>(p)->MutableData<float>(); }
|
||||
const float* Provider_Tensor__Data_float(const Provider_Tensor* p) override { return reinterpret_cast<const Tensor*>(p)->Data<float>(); }
|
||||
|
|
|
|||
|
|
@ -29,16 +29,16 @@ constexpr const char* DNNL_CPU = "DnnlCpu";
|
|||
DNNLExecutionProvider::DNNLExecutionProvider(const DNNLExecutionProviderInfo& info)
|
||||
: Provider_IExecutionProvider{onnxruntime::kDnnlExecutionProvider} {
|
||||
Provider_AllocatorCreationInfo default_memory_info(
|
||||
[](int) {
|
||||
{[](int) {
|
||||
return onnxruntime::Provider_CreateCPUAllocator(OrtMemoryInfo(DNNL, OrtAllocatorType::OrtDeviceAllocator));
|
||||
},
|
||||
}},
|
||||
0, info.create_arena);
|
||||
|
||||
Provider_AllocatorCreationInfo cpu_memory_info(
|
||||
[](int) {
|
||||
{[](int) {
|
||||
return onnxruntime::Provider_CreateCPUAllocator(OrtMemoryInfo(DNNL_CPU, OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), 0,
|
||||
OrtMemTypeCPUOutput));
|
||||
},
|
||||
}},
|
||||
0, info.create_arena);
|
||||
|
||||
Provider_InsertAllocator(CreateAllocator(default_memory_info));
|
||||
|
|
|
|||
|
|
@ -88,6 +88,13 @@ constexpr const char* kMSDomain = "com.microsoft";
|
|||
constexpr const char* kDnnlExecutionProvider = "DnnlExecutionProvider";
|
||||
constexpr const char* kTensorrtExecutionProvider = "TensorrtExecutionProvider";
|
||||
|
||||
enum CUDAStreamType : int {
|
||||
kCudaStreamDefault = 0,
|
||||
kCudaStreamCopyIn,
|
||||
kCudaStreamCopyOut,
|
||||
kTotalCudaStreams,
|
||||
};
|
||||
|
||||
class DataTypeImpl {
|
||||
public:
|
||||
virtual ~DataTypeImpl() = default;
|
||||
|
|
@ -104,8 +111,12 @@ template <typename T>
|
|||
using IAllocatorUniquePtr = std::unique_ptr<T, std::function<void(T*)>>;
|
||||
|
||||
std::unique_ptr<Provider_IAllocator> Provider_CreateCPUAllocator(const OrtMemoryInfo& memory_info);
|
||||
std::unique_ptr<Provider_IAllocator> Provider_CreateCUDAAllocator(int16_t device_id, const char* name);
|
||||
std::unique_ptr<Provider_IAllocator> Provider_CreateCUDAPinnedAllocator(int16_t device_id, const char* name);
|
||||
Provider_AllocatorPtr CreateAllocator(const Provider_AllocatorCreationInfo& info);
|
||||
|
||||
std::unique_ptr<Provider_IDataTransfer> Provider_CreateGPUDataTransfer();
|
||||
|
||||
std::string GetEnvironmentVar(const std::string& var_name);
|
||||
|
||||
class CPUIDInfo {
|
||||
|
|
|
|||
|
|
@ -146,6 +146,20 @@ std::unique_ptr<Provider_IAllocator> Provider_CreateCPUAllocator(const OrtMemory
|
|||
return g_host->CreateCPUAllocator(info);
|
||||
}
|
||||
|
||||
#ifdef USE_TENSORRT
|
||||
std::unique_ptr<Provider_IAllocator> Provider_CreateCUDAAllocator(int16_t device_id, const char* name) {
|
||||
return g_host->CreateCUDAAllocator(device_id, name);
|
||||
}
|
||||
|
||||
std::unique_ptr<Provider_IAllocator> Provider_CreateCUDAPinnedAllocator(int16_t device_id, const char* name) {
|
||||
return g_host->CreateCUDAPinnedAllocator(device_id, name);
|
||||
}
|
||||
|
||||
std::unique_ptr<Provider_IDataTransfer> Provider_CreateGPUDataTransfer() {
|
||||
return g_host->CreateGPUDataTransfer();
|
||||
}
|
||||
#endif
|
||||
|
||||
std::string GetEnvironmentVar(const std::string& var_name) {
|
||||
return g_host->GetEnvironmentVar(var_name);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,7 +5,6 @@
|
|||
// In the future the internal implementations could derive from these to remove the need for the wrapper implementations
|
||||
|
||||
#include "core/framework/func_api.h"
|
||||
#include "core/framework/fence.h" // For FencePtr
|
||||
|
||||
#define PROVIDER_DISALLOW_ALL(TypeName) \
|
||||
TypeName() = delete; \
|
||||
|
|
@ -63,7 +62,6 @@ struct Provider_NodeAttributes;
|
|||
struct Provider_OpKernel_Base;
|
||||
struct Provider_OpKernelContext;
|
||||
struct Provider_OpKernelInfo;
|
||||
struct Provider_SessionState;
|
||||
struct Provider_Tensor;
|
||||
class TensorShape;
|
||||
|
||||
|
|
@ -119,8 +117,6 @@ struct Provider_IAllocator {
|
|||
virtual void Free(void* p) = 0;
|
||||
const OrtMemoryInfo& Info() const { return memory_info_; };
|
||||
|
||||
virtual FencePtr CreateFence(const Provider_SessionState* /*session_state*/) { return nullptr; }
|
||||
|
||||
virtual bool IsProviderInterface() const { return true; }
|
||||
|
||||
template <typename T>
|
||||
|
|
@ -259,17 +255,6 @@ struct Provider_IExecutionProvider {
|
|||
void operator=(const Provider_IExecutionProvider&) = delete;
|
||||
};
|
||||
|
||||
struct Provider_IDataTransfer {
|
||||
Provider_IDataTransfer() = default;
|
||||
virtual ~Provider_IDataTransfer() {}
|
||||
|
||||
virtual bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const = 0;
|
||||
virtual common::Status CopyTensor(const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) const = 0;
|
||||
|
||||
Provider_IDataTransfer(const Provider_IDataTransfer&) = delete;
|
||||
void operator=(const Provider_IDataTransfer&) = delete;
|
||||
};
|
||||
|
||||
struct Provider {
|
||||
virtual std::shared_ptr<Provider_IExecutionProviderFactory> CreateExecutionProviderFactory(int device_id) = 0;
|
||||
};
|
||||
|
|
@ -286,6 +271,18 @@ struct ProviderHost {
|
|||
|
||||
virtual std::unique_ptr<Provider_IAllocator> CreateCPUAllocator(const OrtMemoryInfo& memory_info) = 0;
|
||||
|
||||
#ifdef USE_TENSORRT
|
||||
virtual std::unique_ptr<Provider_IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name) = 0;
|
||||
virtual std::unique_ptr<Provider_IAllocator> CreateCUDAPinnedAllocator(int16_t device_id, const char* name) = 0;
|
||||
virtual std::unique_ptr<Provider_IDataTransfer> CreateGPUDataTransfer() = 0;
|
||||
|
||||
virtual void cuda__Impl_Cast(const int64_t* input_data, int32_t* output_data, size_t count) = 0;
|
||||
virtual void cuda__Impl_Cast(const int32_t* input_data, int64_t* output_data, size_t count) = 0;
|
||||
|
||||
virtual bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) = 0;
|
||||
virtual bool CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) = 0;
|
||||
#endif
|
||||
|
||||
virtual std::unique_ptr<Provider_IExecutionProvider_Router> Create_IExecutionProvider_Router(Provider_IExecutionProvider* outer, const std::string& type) = 0;
|
||||
|
||||
virtual std::string GetEnvironmentVar(const std::string& var_name) = 0;
|
||||
|
|
@ -384,7 +381,9 @@ struct ProviderHost {
|
|||
|
||||
// Provider_DataTransferManager
|
||||
virtual Status Provider_DataTransferManager__CopyTensor(const Provider_DataTransferManager* p, const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) = 0;
|
||||
virtual const Provider_IDataTransfer* Provider_DataTransferManager__GetProviderDataTransfer(const Provider_DataTransferManager* p, const OrtDevice& src_device, const OrtDevice& dst_device) = 0;
|
||||
|
||||
// Provider_IDataTransfer
|
||||
virtual void Provider_IDataTransfer__operator_delete(Provider_IDataTransfer* p) = 0;
|
||||
|
||||
// Provider_IndexedSubGraph_MetaDef
|
||||
virtual std::unique_ptr<Provider_IndexedSubGraph_MetaDef> Provider_IndexedSubGraph_MetaDef__construct() = 0;
|
||||
|
|
@ -541,9 +540,6 @@ struct ProviderHost {
|
|||
virtual const Provider_DataTransferManager& Provider_OpKernelInfo__GetDataTransferManager(const Provider_OpKernelInfo* p) noexcept = 0;
|
||||
virtual int Provider_OpKernelInfo__GetKernelDef_ExecQueueId(const Provider_OpKernelInfo* p) noexcept = 0;
|
||||
|
||||
// Provider_SessionState
|
||||
virtual const Provider_DataTransferManager& Provider_SessionState__GetDataTransferManager(const Provider_SessionState* p) = 0;
|
||||
|
||||
// Provider_Tensor
|
||||
virtual float* Provider_Tensor__MutableData_float(Provider_Tensor* p) = 0;
|
||||
virtual const float* Provider_Tensor__Data_float(const Provider_Tensor* p) = 0;
|
||||
|
|
@ -686,11 +682,18 @@ struct Provider_ComputeCapability {
|
|||
|
||||
struct Provider_DataTransferManager {
|
||||
Status CopyTensor(const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) const { return g_host->Provider_DataTransferManager__CopyTensor(this, src, dst, exec_queue_id); }
|
||||
const Provider_IDataTransfer* GetProviderDataTransfer(const OrtDevice& src_device, const OrtDevice& dst_device) const { return g_host->Provider_DataTransferManager__GetProviderDataTransfer(this, src_device, dst_device); }
|
||||
|
||||
PROVIDER_DISALLOW_ALL(Provider_DataTransferManager)
|
||||
};
|
||||
|
||||
struct Provider_IDataTransfer {
|
||||
static void operator delete(void* p) { g_host->Provider_IDataTransfer__operator_delete(reinterpret_cast<Provider_IDataTransfer*>(p)); }
|
||||
|
||||
Provider_IDataTransfer() = delete;
|
||||
Provider_IDataTransfer(const Provider_IDataTransfer&) = delete;
|
||||
void operator=(const Provider_IDataTransfer&) = delete;
|
||||
};
|
||||
|
||||
struct Provider_IndexedSubGraph_MetaDef {
|
||||
static std::unique_ptr<Provider_IndexedSubGraph_MetaDef> Create() { return g_host->Provider_IndexedSubGraph_MetaDef__construct(); }
|
||||
static void operator delete(void* p) { g_host->Provider_IndexedSubGraph_MetaDef__operator_delete(reinterpret_cast<Provider_IndexedSubGraph_MetaDef*>(p)); }
|
||||
|
|
@ -1015,10 +1018,6 @@ inline Status Provider_OpKernelInfo::GetAttr<float>(const std::string& name, flo
|
|||
return GetAttr(name, value);
|
||||
}
|
||||
|
||||
struct Provider_SessionState {
|
||||
const Provider_DataTransferManager& GetDataTransferManager() const { return g_host->Provider_SessionState__GetDataTransferManager(this); }
|
||||
};
|
||||
|
||||
struct Provider_Tensor {
|
||||
float* MutableData_float() { return g_host->Provider_Tensor__MutableData_float(this); }
|
||||
const float* Data_float() const { return g_host->Provider_Tensor__Data_float(this); }
|
||||
|
|
|
|||
|
|
@ -1,146 +0,0 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "core/providers/shared_library/provider_api.h"
|
||||
#include "core/providers/cuda/shared_inc/cuda_call.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
#else // POSIX
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
namespace onnxruntime {
|
||||
|
||||
using namespace common;
|
||||
|
||||
template <typename ERRTYPE>
|
||||
const char* CudaErrString(ERRTYPE x) {
|
||||
ORT_NOT_IMPLEMENTED();
|
||||
}
|
||||
|
||||
#define CASE_ENUM_TO_STR(x) \
|
||||
case x: \
|
||||
return #x
|
||||
|
||||
template <>
|
||||
const char* CudaErrString<cudaError_t>(cudaError_t x) {
|
||||
cudaDeviceSynchronize();
|
||||
return cudaGetErrorString(x);
|
||||
}
|
||||
|
||||
template <>
|
||||
const char* CudaErrString<cublasStatus_t>(cublasStatus_t e) {
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
switch (e) {
|
||||
CASE_ENUM_TO_STR(CUBLAS_STATUS_SUCCESS);
|
||||
CASE_ENUM_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED);
|
||||
CASE_ENUM_TO_STR(CUBLAS_STATUS_ALLOC_FAILED);
|
||||
CASE_ENUM_TO_STR(CUBLAS_STATUS_INVALID_VALUE);
|
||||
CASE_ENUM_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH);
|
||||
CASE_ENUM_TO_STR(CUBLAS_STATUS_MAPPING_ERROR);
|
||||
CASE_ENUM_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED);
|
||||
CASE_ENUM_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR);
|
||||
CASE_ENUM_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED);
|
||||
CASE_ENUM_TO_STR(CUBLAS_STATUS_LICENSE_ERROR);
|
||||
default:
|
||||
return "(look for CUBLAS_STATUS_xxx in cublas_api.h)";
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
const char* CudaErrString<curandStatus>(curandStatus) {
|
||||
cudaDeviceSynchronize();
|
||||
return "(see curand.h & look for curandStatus or CURAND_STATUS_xxx)";
|
||||
}
|
||||
|
||||
template <>
|
||||
const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t e) {
|
||||
cudaDeviceSynchronize();
|
||||
return cudnnGetErrorString(e);
|
||||
}
|
||||
|
||||
template <>
|
||||
const char* CudaErrString<cufftResult>(cufftResult e) {
|
||||
cudaDeviceSynchronize();
|
||||
switch (e) {
|
||||
CASE_ENUM_TO_STR(CUFFT_SUCCESS);
|
||||
CASE_ENUM_TO_STR(CUFFT_ALLOC_FAILED);
|
||||
CASE_ENUM_TO_STR(CUFFT_INVALID_VALUE);
|
||||
CASE_ENUM_TO_STR(CUFFT_INTERNAL_ERROR);
|
||||
CASE_ENUM_TO_STR(CUFFT_SETUP_FAILED);
|
||||
CASE_ENUM_TO_STR(CUFFT_INVALID_SIZE);
|
||||
default:
|
||||
return "Unknown cufft error status";
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef USE_NCCL
|
||||
template <>
|
||||
const char* CudaErrString<ncclResult_t>(ncclResult_t e) {
|
||||
cudaDeviceSynchronize();
|
||||
return ncclGetErrorString(e);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename ERRTYPE, bool THRW>
|
||||
bool CudaCall(ERRTYPE retCode, const char* exprString, const char* libName, ERRTYPE successCode, const char* msg) {
|
||||
if (retCode != successCode) {
|
||||
try {
|
||||
#ifdef _WIN32
|
||||
auto del = [](char* p) { free(p); };
|
||||
std::unique_ptr<char, decltype(del)> hostname_ptr(nullptr, del);
|
||||
size_t hostname_len = 0;
|
||||
char* hostname = nullptr;
|
||||
if (-1 == _dupenv_s(&hostname, &hostname_len, "COMPUTERNAME"))
|
||||
hostname = "?";
|
||||
else
|
||||
hostname_ptr.reset(hostname);
|
||||
#else
|
||||
char hostname[HOST_NAME_MAX];
|
||||
if (gethostname(hostname, HOST_NAME_MAX) != 0)
|
||||
strcpy(hostname, "?");
|
||||
#endif
|
||||
int currentCudaDevice;
|
||||
cudaGetDevice(¤tCudaDevice);
|
||||
cudaGetLastError(); // clear last CUDA error
|
||||
static char str[1024];
|
||||
snprintf(str, 1024, "%s failure %d: %s ; GPU=%d ; hostname=%s ; expr=%s; %s",
|
||||
libName, (int)retCode, CudaErrString(retCode), currentCudaDevice,
|
||||
hostname,
|
||||
exprString, msg);
|
||||
if (THRW) {
|
||||
// throw an exception with the error info
|
||||
ORT_THROW(str);
|
||||
} else {
|
||||
LOGS_DEFAULT(ERROR) << str;
|
||||
}
|
||||
} catch (const std::exception& e) { // catch, log, and rethrow since CUDA code sometimes hangs in destruction, so we'd never get to see the error
|
||||
if (THRW) {
|
||||
ORT_THROW(e.what());
|
||||
} else {
|
||||
LOGS_DEFAULT(ERROR) << e.what();
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template bool CudaCall<cudaError, false>(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg);
|
||||
template bool CudaCall<cudaError, true>(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg);
|
||||
template bool CudaCall<cublasStatus_t, false>(cublasStatus_t retCode, const char* exprString, const char* libName, cublasStatus_t successCode, const char* msg);
|
||||
template bool CudaCall<cublasStatus_t, true>(cublasStatus_t retCode, const char* exprString, const char* libName, cublasStatus_t successCode, const char* msg);
|
||||
template bool CudaCall<cudnnStatus_t, false>(cudnnStatus_t retCode, const char* exprString, const char* libName, cudnnStatus_t successCode, const char* msg);
|
||||
template bool CudaCall<cudnnStatus_t, true>(cudnnStatus_t retCode, const char* exprString, const char* libName, cudnnStatus_t successCode, const char* msg);
|
||||
template bool CudaCall<curandStatus_t, false>(curandStatus_t retCode, const char* exprString, const char* libName, curandStatus_t successCode, const char* msg);
|
||||
template bool CudaCall<curandStatus_t, true>(curandStatus_t retCode, const char* exprString, const char* libName, curandStatus_t successCode, const char* msg);
|
||||
template bool CudaCall<cufftResult, false>(cufftResult retCode, const char* exprString, const char* libName, cufftResult successCode, const char* msg);
|
||||
template bool CudaCall<cufftResult, true>(cufftResult retCode, const char* exprString, const char* libName, cufftResult successCode, const char* msg);
|
||||
|
||||
#ifdef USE_NCCL
|
||||
template bool CudaCall<ncclResult_t, false>(ncclResult_t retCode, const char* exprString, const char* libName, ncclResult_t successCode, const char* msg);
|
||||
#endif
|
||||
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -1,70 +0,0 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "core/providers/shared_library/provider_api.h"
|
||||
#include "core/providers/cuda/shared_inc/cuda_call.h"
|
||||
|
||||
#include "cuda_allocator.h"
|
||||
#include "cuda_fence.h"
|
||||
#include "gpu_data_transfer.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
|
||||
static const GPUDataTransfer* GetGPUDataTransfer(const Provider_SessionState* session_state) {
|
||||
OrtDevice gpu_device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0);
|
||||
OrtDevice cpu_device;
|
||||
return static_cast<const GPUDataTransfer*>(session_state->GetDataTransferManager().GetProviderDataTransfer(gpu_device, cpu_device));
|
||||
}
|
||||
|
||||
void CUDAAllocator::CheckDevice(bool throw_when_fail) const {
|
||||
#ifndef NDEBUG
|
||||
// check device to match at debug build
|
||||
// if it's expected to change, call cudaSetDevice instead of the check
|
||||
int current_device;
|
||||
auto cuda_err = cudaGetDevice(¤t_device);
|
||||
if (cuda_err == cudaSuccess) {
|
||||
ORT_ENFORCE(current_device == Info().id);
|
||||
} else if (throw_when_fail) {
|
||||
CUDA_CALL_THROW(cuda_err);
|
||||
}
|
||||
#else
|
||||
ORT_UNUSED_PARAMETER(throw_when_fail);
|
||||
#endif
|
||||
}
|
||||
|
||||
void* CUDAAllocator::Alloc(size_t size) {
|
||||
CheckDevice(true);
|
||||
void* p = nullptr;
|
||||
if (size > 0) {
|
||||
//BFCArena was updated recently to handle the exception and adjust the request size
|
||||
CUDA_CALL_THROW(cudaMalloc((void**)&p, size));
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
void CUDAAllocator::Free(void* p) {
|
||||
CheckDevice(false); // ignore CUDA failure when free
|
||||
cudaFree(p); // do not throw error since it's OK for cudaFree to fail during shutdown
|
||||
}
|
||||
|
||||
FencePtr CUDAAllocator::CreateFence(const Provider_SessionState* session_state) {
|
||||
return std::make_shared<CUDAFence>(GetGPUDataTransfer(session_state));
|
||||
}
|
||||
|
||||
void* CUDAPinnedAllocator::Alloc(size_t size) {
|
||||
void* p = nullptr;
|
||||
if (size > 0) {
|
||||
CUDA_CALL_THROW(cudaMallocHost((void**)&p, size));
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
void CUDAPinnedAllocator::Free(void* p) {
|
||||
CUDA_CALL_THROW(cudaFreeHost(p));
|
||||
}
|
||||
|
||||
FencePtr CUDAPinnedAllocator::CreateFence(const Provider_SessionState* session_state) {
|
||||
return std::make_shared<CUDAFence>(GetGPUDataTransfer(session_state));
|
||||
}
|
||||
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -1,36 +0,0 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace onnxruntime {
|
||||
|
||||
class CUDAAllocator : public Provider_IAllocator {
|
||||
public:
|
||||
CUDAAllocator(OrtDevice::DeviceId device_id, const char* name)
|
||||
: Provider_IAllocator(
|
||||
OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator,
|
||||
OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, device_id),
|
||||
device_id, OrtMemTypeDefault)) {}
|
||||
void* Alloc(size_t size) override;
|
||||
void Free(void* p) override;
|
||||
FencePtr CreateFence(const Provider_SessionState* session_state) override;
|
||||
|
||||
private:
|
||||
void CheckDevice(bool throw_when_fail) const;
|
||||
};
|
||||
|
||||
//TODO: add a default constructor
|
||||
class CUDAPinnedAllocator : public Provider_IAllocator {
|
||||
public:
|
||||
CUDAPinnedAllocator(OrtDevice::DeviceId device_id, const char* name)
|
||||
: Provider_IAllocator(
|
||||
OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator,
|
||||
OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, device_id),
|
||||
device_id, OrtMemTypeCPUOutput)) {}
|
||||
|
||||
void* Alloc(size_t size) override;
|
||||
void Free(void* p) override;
|
||||
FencePtr CreateFence(const Provider_SessionState* session_state) override;
|
||||
};
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -1,68 +0,0 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "core/providers/shared_library/provider_api.h"
|
||||
#include "core/providers/cuda/shared_inc/cuda_call.h"
|
||||
|
||||
#include "cuda_allocator.h"
|
||||
#include "cuda_fence.h"
|
||||
#include "gpu_data_transfer.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
|
||||
constexpr const char* kCudaExecutionProvider = "CUDAExecutionProvider";
|
||||
|
||||
CUDAFence::CUDAFence(const GPUDataTransfer* data_transfer) : data_transfer_(data_transfer) {
|
||||
// NOTE: cudaEventBlockingSync may leads to longer wait time because of thread yield/switching in kernel
|
||||
// if lower CPU usage is more important than latency, we should use this flag to avoid spin-loop in WaitOnCPU
|
||||
int event_flags = /*cudaEventBlockingSync |*/ cudaEventDisableTiming;
|
||||
CUDA_CALL_THROW(cudaEventCreate(&read_event_, event_flags));
|
||||
CUDA_CALL_THROW(cudaEventCreate(&write_event_, event_flags));
|
||||
}
|
||||
|
||||
CUDAFence::~CUDAFence() {
|
||||
CUDA_CALL_THROW(cudaEventDestroy(read_event_));
|
||||
CUDA_CALL_THROW(cudaEventDestroy(write_event_));
|
||||
}
|
||||
|
||||
void CUDAFence::BeforeUsingAsInput(onnxruntime::ProviderType provider_type, int async_queue_id) {
|
||||
if (provider_type == onnxruntime::kCudaExecutionProvider) {
|
||||
// sync in GPU, the call is non-blocking on CPU
|
||||
CUDA_CALL_THROW(cudaStreamWaitEvent(data_transfer_->GetStream(async_queue_id), write_event_, 0));
|
||||
} else {
|
||||
// sync on CPU for all other providers, this is blocking
|
||||
CUDA_CALL_THROW(cudaEventSynchronize(write_event_));
|
||||
}
|
||||
}
|
||||
|
||||
void CUDAFence::BeforeUsingAsOutput(onnxruntime::ProviderType provider_type, int queue_id) {
|
||||
if (provider_type == onnxruntime::kCudaExecutionProvider) {
|
||||
// sync in GPU, the call is non-blocking on CPU
|
||||
cudaStream_t stream = data_transfer_->GetStream(queue_id);
|
||||
CUDA_CALL_THROW(cudaStreamWaitEvent(stream, read_event_, 0));
|
||||
CUDA_CALL_THROW(cudaStreamWaitEvent(stream, write_event_, 0));
|
||||
} else {
|
||||
// sync on CPU for all other providers, this is blocking
|
||||
CUDA_CALL_THROW(cudaEventSynchronize(read_event_));
|
||||
CUDA_CALL_THROW(cudaEventSynchronize(write_event_));
|
||||
}
|
||||
}
|
||||
|
||||
bool CUDAFence::CanRelease() {
|
||||
return cudaEventQuery(read_event_) == cudaSuccess &&
|
||||
cudaEventQuery(write_event_) == cudaSuccess;
|
||||
}
|
||||
|
||||
void CUDAFence::AfterUsedAsInput(int queue_id) {
|
||||
// update read fence
|
||||
cudaStream_t stream = data_transfer_->GetStream(queue_id);
|
||||
CUDA_CALL_THROW(cudaEventRecord(read_event_, stream));
|
||||
}
|
||||
|
||||
void CUDAFence::AfterUsedAsOutput(int queue_id) {
|
||||
// update write fence
|
||||
cudaStream_t stream = data_transfer_->GetStream(queue_id);
|
||||
CUDA_CALL_THROW(cudaEventRecord(write_event_, stream));
|
||||
}
|
||||
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -1,25 +0,0 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace onnxruntime {
|
||||
class GPUDataTransfer;
|
||||
|
||||
class CUDAFence : public IFence {
|
||||
public:
|
||||
CUDAFence(const GPUDataTransfer* data_transfer);
|
||||
virtual ~CUDAFence();
|
||||
virtual void BeforeUsingAsInput(onnxruntime::ProviderType provider_type, int queue_id) override;
|
||||
virtual void BeforeUsingAsOutput(onnxruntime::ProviderType provider_type, int queue_id) override;
|
||||
virtual void AfterUsedAsInput(int queue_id) override;
|
||||
virtual void AfterUsedAsOutput(int queue_id) override;
|
||||
virtual bool CanRelease() override;
|
||||
|
||||
private:
|
||||
cudaEvent_t read_event_;
|
||||
cudaEvent_t write_event_;
|
||||
const GPUDataTransfer* data_transfer_;
|
||||
};
|
||||
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -1,68 +0,0 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "core/providers/shared_library/provider_api.h"
|
||||
#include "core/providers/cuda/shared_inc/cuda_call.h"
|
||||
|
||||
#include "gpu_data_transfer.h"
|
||||
|
||||
#define CUDA_RETURN_IF_ERROR(expr) \
|
||||
ORT_RETURN_IF_ERROR(CUDA_CALL(expr) \
|
||||
? common::Status::OK() \
|
||||
: ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "CUDA error executing ", #expr))
|
||||
|
||||
namespace onnxruntime {
|
||||
GPUDataTransfer::GPUDataTransfer() {
|
||||
// create streams, default is nullptr
|
||||
streams_[kCudaStreamDefault] = nullptr;
|
||||
CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyIn], cudaStreamNonBlocking));
|
||||
CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyOut], cudaStreamNonBlocking));
|
||||
}
|
||||
|
||||
GPUDataTransfer::~GPUDataTransfer() {
|
||||
CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyIn]));
|
||||
CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyOut]));
|
||||
}
|
||||
|
||||
bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
|
||||
return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::CUDA_PINNED || dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED;
|
||||
}
|
||||
|
||||
common::Status GPUDataTransfer::CopyTensor(const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) const {
|
||||
size_t bytes = src.SizeInBytes();
|
||||
const void* src_data = src.DataRaw();
|
||||
void* dst_data = dst.MutableDataRaw();
|
||||
|
||||
auto& src_device = src.Location().device;
|
||||
auto& dst_device = dst.Location().device;
|
||||
|
||||
if (dst_device.Type() == OrtDevice::GPU) {
|
||||
if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
|
||||
// copy from pinned memory to GPU, this is non-blocking
|
||||
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, streams_[exec_queue_id]));
|
||||
} else if (src_device.Type() == OrtDevice::GPU) {
|
||||
// copying between GPU, this is non-blocking
|
||||
// Copy only if the two addresses are different.
|
||||
if (dst_data != src_data) {
|
||||
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice, streams_[kCudaStreamDefault]));
|
||||
}
|
||||
} else {
|
||||
// copy from other CPU memory to GPU, this is blocking
|
||||
CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyHostToDevice));
|
||||
}
|
||||
} else if (src_device.Type() == OrtDevice::GPU) {
|
||||
if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
|
||||
// copying from GPU to pinned memory, this is non-blocking
|
||||
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, streams_[exec_queue_id]));
|
||||
} else {
|
||||
// copying from GPU to CPU memory, this is blocking
|
||||
CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToHost));
|
||||
}
|
||||
} else {
|
||||
// copying between cpu memory
|
||||
memcpy(dst_data, src_data, bytes);
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -1,35 +0,0 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace onnxruntime {
|
||||
|
||||
enum CUDAStreamType : int {
|
||||
kCudaStreamDefault = 0,
|
||||
kCudaStreamCopyIn,
|
||||
kCudaStreamCopyOut,
|
||||
kTotalCudaStreams,
|
||||
};
|
||||
|
||||
class GPUDataTransfer : public Provider_IDataTransfer {
|
||||
public:
|
||||
GPUDataTransfer();
|
||||
~GPUDataTransfer();
|
||||
|
||||
bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;
|
||||
|
||||
// Suppress MSVC warning about not fully overriding
|
||||
using Provider_IDataTransfer::CopyTensor;
|
||||
common::Status CopyTensor(const Provider_Tensor& src, Provider_Tensor& dst, int exec_queue_id) const override;
|
||||
|
||||
cudaStream_t GetStream(int queue_id) const {
|
||||
ORT_ENFORCE(queue_id >= 0 && queue_id < kTotalCudaStreams);
|
||||
return streams_[queue_id];
|
||||
}
|
||||
|
||||
private:
|
||||
cudaStream_t streams_[kTotalCudaStreams];
|
||||
};
|
||||
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -2,25 +2,25 @@
|
|||
// Licensed under the MIT License.
|
||||
|
||||
#include <fstream>
|
||||
#include <limits>
|
||||
#include <list>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <unordered_set>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include "core/providers/shared_library/provider_api.h"
|
||||
#define ORT_API_MANUAL_INIT
|
||||
#include "core/session/onnxruntime_cxx_api.h"
|
||||
#include "core/common/safeint.h"
|
||||
|
||||
#include "tensorrt_execution_provider.h"
|
||||
#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
|
||||
#include "core/providers/cuda/cuda_allocator.h"
|
||||
#include "core/providers/cuda/shared_inc/cuda_call.h"
|
||||
#include "gpu_data_transfer.h"
|
||||
#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
|
||||
#include "cuda_runtime_api.h"
|
||||
#include "gsl/gsl"
|
||||
#include <experimental/filesystem>
|
||||
#include "cuda_allocator.h"
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
|
||||
#define CUDA_RETURN_IF_ERROR(expr) \
|
||||
ORT_RETURN_IF_ERROR(CUDA_CALL(expr) \
|
||||
|
|
@ -69,6 +69,33 @@ struct ShutdownProtobuf {
|
|||
|
||||
namespace onnxruntime {
|
||||
|
||||
namespace cuda {
|
||||
template <>
|
||||
void Impl_Cast(
|
||||
const int64_t* input_data, int32_t* output_data,
|
||||
size_t count) {
|
||||
return g_host->cuda__Impl_Cast(input_data, output_data, count);
|
||||
}
|
||||
|
||||
template <>
|
||||
void Impl_Cast(
|
||||
const int32_t* input_data, int64_t* output_data,
|
||||
size_t count) {
|
||||
return g_host->cuda__Impl_Cast(input_data, output_data, count);
|
||||
}
|
||||
|
||||
} // namespace cuda
|
||||
|
||||
template <>
|
||||
bool CudaCall<cudaError, false>(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg) {
|
||||
return g_host->CudaCall_false(retCode, exprString, libName, successCode, msg);
|
||||
}
|
||||
|
||||
template <>
|
||||
bool CudaCall<cudaError, true>(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg) {
|
||||
return g_host->CudaCall_true(retCode, exprString, libName, successCode, msg);
|
||||
}
|
||||
|
||||
constexpr const char* TRT = "Tensorrt";
|
||||
constexpr const char* TRT_PINNED = "TensorrtPinned";
|
||||
|
||||
|
|
@ -131,7 +158,7 @@ KernelRegistryAndStatus GetTensorrtKernelRegistry() {
|
|||
}
|
||||
|
||||
std::shared_ptr<Provider_KernelRegistry> TensorrtExecutionProvider::Provider_GetKernelRegistry() const {
|
||||
static KernelRegistryAndStatus k = GetTensorrtKernelRegistry();
|
||||
static KernelRegistryAndStatus k = onnxruntime::GetTensorrtKernelRegistry();
|
||||
// throw if the registry failed to initialize
|
||||
ORT_THROW_IF_ERROR(k.st);
|
||||
return k.kernel_registry;
|
||||
|
|
@ -148,12 +175,12 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
|
|||
CUDA_CALL_THROW(cudaSetDevice(device_id_));
|
||||
|
||||
Provider_AllocatorCreationInfo default_memory_info(
|
||||
[](int id) { return onnxruntime::make_unique<CUDAAllocator>(id, TRT); }, device_id_);
|
||||
[](int id) { return Provider_CreateCUDAAllocator(id, TRT); }, device_id_);
|
||||
allocator_ = CreateAllocator(default_memory_info);
|
||||
Provider_InsertAllocator(allocator_);
|
||||
|
||||
Provider_AllocatorCreationInfo pinned_allocator_info(
|
||||
[](int) { return onnxruntime::make_unique<CUDAPinnedAllocator>(0, TRT_PINNED); }, device_id_);
|
||||
[](int) { return Provider_CreateCUDAPinnedAllocator(0, TRT_PINNED); }, device_id_);
|
||||
Provider_InsertAllocator(CreateAllocator(pinned_allocator_info));
|
||||
|
||||
// Get environment variables
|
||||
|
|
@ -209,7 +236,7 @@ Provider_AllocatorPtr TensorrtExecutionProvider::Provider_GetAllocator(int id, O
|
|||
}
|
||||
|
||||
std::unique_ptr<onnxruntime::Provider_IDataTransfer> TensorrtExecutionProvider::Provider_GetDataTransfer() const {
|
||||
return onnxruntime::make_unique<GPUDataTransfer>();
|
||||
return onnxruntime::Provider_CreateGPUDataTransfer();
|
||||
}
|
||||
|
||||
// Convert GraphViewer graph to GraphProto
|
||||
|
|
@ -1345,5 +1372,4 @@ common::Status TensorrtExecutionProvider::Provider_Compile(const std::vector<onn
|
|||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -2,14 +2,6 @@
|
|||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef _WIN32
|
||||
#pragma warning(disable : 4244) // '=': conversion from '_Ty2' to '_Ty', possible loss of data
|
||||
#pragma warning(disable : 4267) // 'initializing': conversion from '<type1>' to '<type2>', possible loss of data
|
||||
#pragma warning(disable : 4996) // 'gmtime': This function or variable may be unsafe. Consider using gmtime_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
|
||||
#pragma warning(disable : 4456) // declaration of 'i' hides previous local declaration
|
||||
#endif
|
||||
|
||||
#include <ctime>
|
||||
#include "NvInfer.h"
|
||||
#include "NvOnnxParser.h"
|
||||
|
|
|
|||
Loading…
Reference in a new issue