Fix python

This commit is contained in:
Ryan Hill 2021-04-13 20:28:03 -07:00
parent 683354424a
commit 4cf4cf3032
4 changed files with 61 additions and 30 deletions

View file

@ -4,9 +4,15 @@
#include "onnxruntime_c_api.h"
#ifdef __cplusplus
#include "core/framework/provider_options.h"
// Forward declarations of onnxruntime types; this block only declares names.
namespace onnxruntime {
class IAllocator;
class IDataTransfer;
struct IExecutionProviderFactory;
struct CUDAExecutionProviderInfo;
// Opaque enum declaration with a fixed underlying type of int32_t.
enum class ArenaExtendStrategy : int32_t;
struct CUDAExecutionProviderExternalAllocatorInfo;
} // namespace onnxruntime
struct ProviderInfo_CUDA {
@ -25,6 +31,12 @@ struct ProviderInfo_CUDA {
virtual void CopyGpuToCpu(void* dst_ptr, const void* src_ptr, const size_t size, const OrtMemoryInfo& dst_location, const OrtMemoryInfo& src_location) = 0;
virtual void cudaMemcpy_HostToDevice(void* dst, const void* src, size_t count) = 0;
virtual void cudaMemcpy_DeviceToHost(void* dst, const void* src, size_t count) = 0;
virtual int cudaGetDeviceCount() = 0;
virtual void CUDAExecutionProviderInfo__FromProviderOptions(const onnxruntime::ProviderOptions& options, onnxruntime::CUDAExecutionProviderInfo& info) = 0;
virtual std::shared_ptr<onnxruntime::IExecutionProviderFactory> CreateExecutionProviderFactory(const onnxruntime::CUDAExecutionProviderInfo& info) = 0;
virtual std::shared_ptr<onnxruntime::IAllocator> CreateCudaAllocator(int16_t device_id, size_t gpu_mem_limit, onnxruntime::ArenaExtendStrategy arena_extend_strategy, onnxruntime::CUDAExecutionProviderExternalAllocatorInfo& external_allocator_info) = 0;
};
extern "C" {

View file

@ -44,7 +44,7 @@ namespace onnxruntime {
struct ProviderInfo_CUDA_Impl : ProviderInfo_CUDA {
OrtStatus* SetCurrentGpuDeviceId(_In_ int device_id) override {
int num_devices;
auto cuda_err = cudaGetDeviceCount(&num_devices);
auto cuda_err = ::cudaGetDeviceCount(&num_devices);
if (cuda_err != cudaSuccess) {
return CreateStatus(ORT_FAIL, "Failed to set device id since cudaGetDeviceCount failed.");
}
@ -113,8 +113,28 @@ struct ProviderInfo_CUDA_Impl : ProviderInfo_CUDA {
}
}
// Used only by slice_concatenate_test.cc
void cudaMemcpy_HostToDevice(void* dst, const void* src, size_t count) override { cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice); }
// Callers: slice_concatenate_test.cc and onnxruntime_pybind_state.cc.
// Issues a cudaMemcpy of `count` bytes from `src` to `dst` with kind
// cudaMemcpyHostToDevice; the call is wrapped in CUDA_CALL_THROW.
void cudaMemcpy_HostToDevice(void* dst, const void* src, size_t count) override {
  CUDA_CALL_THROW(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice));
}
// Caller: onnxruntime_pybind_state.cc.
// Issues a cudaMemcpy of `count` bytes from `src` to `dst` with kind
// cudaMemcpyDeviceToHost; the call is wrapped in CUDA_CALL_THROW.
void cudaMemcpy_DeviceToHost(void* dst, const void* src, size_t count) override {
  CUDA_CALL_THROW(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost));
}
// Returns the device count reported by the global ::cudaGetDeviceCount.
// The leading `::` is required: an unqualified call would resolve to this
// member function, which has the same name.
int cudaGetDeviceCount() override {
  int device_count{0};
  CUDA_CALL_THROW(::cudaGetDeviceCount(&device_count));
  return device_count;
}
void CUDAExecutionProviderInfo__FromProviderOptions(const ProviderOptions& options, CUDAExecutionProviderInfo& info) {
info = CUDAExecutionProviderInfo::FromProviderOptions(options);
}
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory(const CUDAExecutionProviderInfo& info) override {
return std::make_shared<CUDAProviderFactory>(info);
}
// Forwards all four arguments unchanged to the static
// CUDAExecutionProvider::CreateCudaAllocator and returns its result.
std::shared_ptr<IAllocator> CreateCudaAllocator(int16_t device_id, size_t gpu_mem_limit, onnxruntime::ArenaExtendStrategy arena_extend_strategy, onnxruntime::CUDAExecutionProviderExternalAllocatorInfo& external_allocator_info) override {
  std::shared_ptr<IAllocator> allocator = CUDAExecutionProvider::CreateCudaAllocator(
      device_id, gpu_mem_limit, arena_extend_strategy, external_allocator_info);
  return allocator;
}
} g_info;

View file

@ -454,8 +454,6 @@ Tensor* AttentionBase::GetPresent(OpKernelContext* context, const Tensor* past,
return g_host->AttentionBase__GetPresent(this, context, past, batch_size, head_size, sequence_length, past_sequence_length);
}
Status YieldOp::Compute(OpKernelContext* context) const { return g_host->YieldOp__Compute(this, context); }
} // namespace contrib
#endif
@ -467,6 +465,7 @@ std::unique_ptr<OpKernel> Loop::Create(const OpKernelInfo& info, const Loop::Con
namespace contrib {
// Each Compute below forwards to the matching g_host entry point, passing
// `this` and the kernel context, and returns the Status it produces.
Status Group::Compute(OpKernelContext* context) const { return g_host->contrib__Group__Compute(this, context); }
Status PassThrough::Compute(OpKernelContext* context) const { return g_host->contrib__PassThrough__Compute(this, context); }
Status YieldOp::Compute(OpKernelContext* context) const { return g_host->YieldOp__Compute(this, context); }
} // namespace contrib
#endif

View file

@ -32,9 +32,6 @@
// execution provider factory creator headers
#include "core/providers/cpu/cpu_provider_factory_creator.h"
#ifdef USE_CUDA
#include "core/providers/cuda/cuda_provider_factory_creator.h"
#endif
#ifdef USE_ROCM
#include "core/providers/rocm/rocm_provider_factory_creator.h"
#endif
@ -165,6 +162,9 @@ size_t gpu_mem_limit = std::numeric_limits<size_t>::max();
onnxruntime::ArenaExtendStrategy arena_extend_strategy = onnxruntime::ArenaExtendStrategy::kNextPowerOfTwo;
#endif
#ifdef USE_CUDA
#include "core/providers/cuda/cuda_provider_factory.h"
#endif
#ifdef USE_TENSORRT
#include "core/providers/tensorrt/tensorrt_provider_factory.h"
#endif
@ -203,10 +203,14 @@ const OrtDevice::DeviceType OrtDevice::GPU;
namespace onnxruntime {
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Tensorrt(const OrtTensorRTProviderOptions* params);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_MIGraphX(int device_id);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Cuda(const OrtCUDAProviderOptions* params);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Dnnl(int use_arena);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_OpenVINO(const OrtOpenVINOProviderOptions* params);
#ifdef USE_CUDA
ProviderInfo_CUDA* GetProviderInfo_CUDA();
#endif
#ifdef USE_OPENVINO
const ProviderInfo_OpenVINO* GetProviderInfo_OpenVINO();
ProviderInfo_OpenVINO* GetProviderInfo_OpenVINO();
#endif
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Nuphar(bool, const char*);
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_VITISAI(const char* backend_type, int device_id);
@ -443,8 +447,7 @@ static inline void RegisterExecutionProvider(InferenceSession* sess, onnxruntime
#ifdef USE_CUDA
static bool IsCudaDeviceIdValid(const onnxruntime::logging::Logger& logger, int id) {
int num_devices = 0;
CUDA_CALL_THROW(cudaGetDeviceCount(&num_devices));
int num_devices = GetProviderInfo_CUDA()->cudaGetDeviceCount();
if (0 == num_devices) {
LOGS(logger, WARNING) << "your system does not have a CUDA capable device.";
@ -465,18 +468,18 @@ static AllocatorPtr GetCudaAllocator(OrtDevice::DeviceId id) {
static std::unordered_map<OrtDevice::DeviceId, AllocatorPtr> id_to_allocator_map;
if (id_to_allocator_map.find(id) == id_to_allocator_map.end()) {
id_to_allocator_map.insert({id, CUDAExecutionProvider::CreateCudaAllocator(id, gpu_mem_limit, arena_extend_strategy, external_allocator_info)});
id_to_allocator_map.insert({id, GetProviderInfo_CUDA()->CreateCudaAllocator(id, gpu_mem_limit, arena_extend_strategy, external_allocator_info)});
}
return id_to_allocator_map[id];
}
static void CpuToCudaMemCpy(void* dst, const void* src, size_t num_bytes) {
CUDA_CALL_THROW(cudaMemcpy(dst, src, num_bytes, cudaMemcpyHostToDevice));
GetProviderInfo_CUDA()->cudaMemcpy_HostToDevice(dst, src, num_bytes);
}
static void CudaToCpuMemCpy(void* dst, const void* src, size_t num_bytes) {
CUDA_CALL_THROW(cudaMemcpy(dst, src, num_bytes, cudaMemcpyDeviceToHost));
GetProviderInfo_CUDA()->cudaMemcpy_DeviceToHost(dst, src, num_bytes);
}
static const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* GetCudaToHostMemCpyFunction() {
@ -613,26 +616,23 @@ static void RegisterExecutionProviders(InferenceSession* sess, const std::vector
} else if (type == kCudaExecutionProvider) {
#ifdef USE_CUDA
const auto it = provider_options_map.find(type);
const CUDAExecutionProviderInfo info =
it != provider_options_map.end()
? CUDAExecutionProviderInfo::FromProviderOptions(it->second)
: [&]() {
CUDAExecutionProviderInfo info{};
info.device_id = cuda_device_id;
info.gpu_mem_limit = gpu_mem_limit;
info.arena_extend_strategy = arena_extend_strategy;
info.cudnn_conv_algo_search = cudnn_conv_algo_search;
info.do_copy_in_default_stream = do_copy_in_default_stream;
info.external_allocator_info = external_allocator_info;
return info;
}();
CUDAExecutionProviderInfo info{};
if (it != provider_options_map.end())
GetProviderInfo_CUDA()->CUDAExecutionProviderInfo__FromProviderOptions(it->second, info);
else {
info.device_id = cuda_device_id;
info.gpu_mem_limit = gpu_mem_limit;
info.arena_extend_strategy = arena_extend_strategy;
info.cudnn_conv_algo_search = cudnn_conv_algo_search;
info.do_copy_in_default_stream = do_copy_in_default_stream;
info.external_allocator_info = external_allocator_info;
}
// This variable is never initialized because the APIs by which it should be initialized are deprecated; however, they still
// exist and are in use. Nevertheless, it is used to return CUDAAllocator, hence we must try to initialize it here if we can,
// since FromProviderOptions might contain an external CUDA allocator.
external_allocator_info = info.external_allocator_info;
RegisterExecutionProvider(
sess, *onnxruntime::CreateExecutionProviderFactory_CUDA(info));
RegisterExecutionProvider(sess, *GetProviderInfo_CUDA()->CreateExecutionProviderFactory(info));
#endif
} else if (type == kRocmExecutionProvider) {
#ifdef USE_ROCM
@ -649,7 +649,7 @@ static void RegisterExecutionProviders(InferenceSession* sess, const std::vector
return info;
}();
// This variable is never initialized because the APIs by which it should be initialized are deprecated; however, they still
// This variable is never initialized because the APIs by which it should be initialized are deprecated; however, they still
// exist and are in use. Nevertheless, it is used to return CUDAAllocator, hence we must try to initialize it here if we can,
// since FromProviderOptions might contain an external CUDA allocator.
external_allocator_info = info.external_allocator_info;