diff --git a/include/onnxruntime/core/providers/cuda/cuda_provider_factory.h b/include/onnxruntime/core/providers/cuda/cuda_provider_factory.h index 35a179b0f1..10a8d76205 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_provider_factory.h +++ b/include/onnxruntime/core/providers/cuda/cuda_provider_factory.h @@ -4,9 +4,15 @@ #include "onnxruntime_c_api.h" #ifdef __cplusplus +#include "core/framework/provider_options.h" + namespace onnxruntime { class IAllocator; class IDataTransfer; +struct IExecutionProviderFactory; +struct CUDAExecutionProviderInfo; +enum class ArenaExtendStrategy : int32_t; +struct CUDAExecutionProviderExternalAllocatorInfo; } // namespace onnxruntime struct ProviderInfo_CUDA { @@ -25,6 +31,12 @@ struct ProviderInfo_CUDA { virtual void CopyGpuToCpu(void* dst_ptr, const void* src_ptr, const size_t size, const OrtMemoryInfo& dst_location, const OrtMemoryInfo& src_location) = 0; virtual void cudaMemcpy_HostToDevice(void* dst, const void* src, size_t count) = 0; + virtual void cudaMemcpy_DeviceToHost(void* dst, const void* src, size_t count) = 0; + virtual int cudaGetDeviceCount() = 0; + virtual void CUDAExecutionProviderInfo__FromProviderOptions(const onnxruntime::ProviderOptions& options, onnxruntime::CUDAExecutionProviderInfo& info) = 0; + + virtual std::shared_ptr CreateExecutionProviderFactory(const onnxruntime::CUDAExecutionProviderInfo& info) = 0; + virtual std::shared_ptr CreateCudaAllocator(int16_t device_id, size_t gpu_mem_limit, onnxruntime::ArenaExtendStrategy arena_extend_strategy, onnxruntime::CUDAExecutionProviderExternalAllocatorInfo& external_allocator_info) = 0; }; extern "C" { diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc index 9ccc7209ad..6c656c018c 100644 --- a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc +++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc @@ -44,7 +44,7 @@ namespace onnxruntime { struct ProviderInfo_CUDA_Impl : ProviderInfo_CUDA { OrtStatus* SetCurrentGpuDeviceId(_In_ int device_id) override { int num_devices; - auto cuda_err = cudaGetDeviceCount(&num_devices); + auto cuda_err = ::cudaGetDeviceCount(&num_devices); if (cuda_err != cudaSuccess) { return CreateStatus(ORT_FAIL, "Failed to set device id since cudaGetDeviceCount failed."); } @@ -113,8 +113,28 @@ struct ProviderInfo_CUDA_Impl : ProviderInfo_CUDA { } } - // Used only by slice_concatenate_test.cc - void cudaMemcpy_HostToDevice(void* dst, const void* src, size_t count) override { cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice); } + // Used by slice_concatenate_test.cc and onnxruntime_pybind_state.cc + void cudaMemcpy_HostToDevice(void* dst, const void* src, size_t count) override { CUDA_CALL_THROW(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice)); } + // Used by onnxruntime_pybind_state.cc + void cudaMemcpy_DeviceToHost(void* dst, const void* src, size_t count) override { CUDA_CALL_THROW(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost)); } + + int cudaGetDeviceCount() override { + int num_devices = 0; + CUDA_CALL_THROW(::cudaGetDeviceCount(&num_devices)); + return num_devices; + } + + void CUDAExecutionProviderInfo__FromProviderOptions(const ProviderOptions& options, CUDAExecutionProviderInfo& info) { + info = CUDAExecutionProviderInfo::FromProviderOptions(options); + } + + std::shared_ptr CreateExecutionProviderFactory(const CUDAExecutionProviderInfo& info) override { + return std::make_shared(info); + } + + std::shared_ptr CreateCudaAllocator(int16_t device_id, size_t gpu_mem_limit, onnxruntime::ArenaExtendStrategy arena_extend_strategy, onnxruntime::CUDAExecutionProviderExternalAllocatorInfo& external_allocator_info) override { + return CUDAExecutionProvider::CreateCudaAllocator(device_id, gpu_mem_limit, arena_extend_strategy, external_allocator_info); + } } g_info; diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index 7a6ad42177..53e6e68c2a 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -454,8 +454,6 @@ Tensor* AttentionBase::GetPresent(OpKernelContext* context, const Tensor* past, return g_host->AttentionBase__GetPresent(this, context, past, batch_size, head_size, sequence_length, past_sequence_length); } -Status YieldOp::Compute(OpKernelContext* context) const { return g_host->YieldOp__Compute(this, context); } - } // namespace contrib #endif @@ -467,6 +465,7 @@ std::unique_ptr Loop::Create(const OpKernelInfo& info, const Loop::Con namespace contrib { Status Group::Compute(OpKernelContext* context) const { return g_host->contrib__Group__Compute(this, context); } Status PassThrough::Compute(OpKernelContext* context) const { return g_host->contrib__PassThrough__Compute(this, context); } +Status YieldOp::Compute(OpKernelContext* context) const { return g_host->YieldOp__Compute(this, context); } } // namespace contrib #endif diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 64071a89bc..2219dd2282 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -32,9 +32,6 @@ // execution provider factory creator headers #include "core/providers/cpu/cpu_provider_factory_creator.h" -#ifdef USE_CUDA -#include "core/providers/cuda/cuda_provider_factory_creator.h" -#endif #ifdef USE_ROCM #include "core/providers/rocm/rocm_provider_factory_creator.h" #endif @@ -165,6 +162,9 @@ size_t gpu_mem_limit = std::numeric_limits::max(); onnxruntime::ArenaExtendStrategy arena_extend_strategy = onnxruntime::ArenaExtendStrategy::kNextPowerOfTwo; #endif +#ifdef USE_CUDA +#include "core/providers/cuda/cuda_provider_factory.h" +#endif #ifdef USE_TENSORRT #include "core/providers/tensorrt/tensorrt_provider_factory.h" #endif @@ -203,10 +203,14 @@ const OrtDevice::DeviceType OrtDevice::GPU; namespace onnxruntime { std::shared_ptr CreateExecutionProviderFactory_Tensorrt(const OrtTensorRTProviderOptions* params); std::shared_ptr CreateExecutionProviderFactory_MIGraphX(int device_id); +std::shared_ptr CreateExecutionProviderFactory_Cuda(const OrtCUDAProviderOptions* params); std::shared_ptr CreateExecutionProviderFactory_Dnnl(int use_arena); std::shared_ptr CreateExecutionProviderFactory_OpenVINO(const OrtOpenVINOProviderOptions* params); +#ifdef USE_CUDA +ProviderInfo_CUDA* GetProviderInfo_CUDA(); +#endif #ifdef USE_OPENVINO -const ProviderInfo_OpenVINO* GetProviderInfo_OpenVINO(); +ProviderInfo_OpenVINO* GetProviderInfo_OpenVINO(); #endif std::shared_ptr CreateExecutionProviderFactory_Nuphar(bool, const char*); std::shared_ptr CreateExecutionProviderFactory_VITISAI(const char* backend_type, int device_id); @@ -443,8 +447,7 @@ static inline void RegisterExecutionProvider(InferenceSession* sess, onnxruntime #ifdef USE_CUDA static bool IsCudaDeviceIdValid(const onnxruntime::logging::Logger& logger, int id) { - int num_devices = 0; - CUDA_CALL_THROW(cudaGetDeviceCount(&num_devices)); + int num_devices = GetProviderInfo_CUDA()->cudaGetDeviceCount(); if (0 == num_devices) { LOGS(logger, WARNING) << "your system does not have a CUDA capable device."; @@ -465,18 +468,18 @@ static AllocatorPtr GetCudaAllocator(OrtDevice::DeviceId id) { static std::unordered_map id_to_allocator_map; if (id_to_allocator_map.find(id) == id_to_allocator_map.end()) { - id_to_allocator_map.insert({id, CUDAExecutionProvider::CreateCudaAllocator(id, gpu_mem_limit, arena_extend_strategy, external_allocator_info)}); + id_to_allocator_map.insert({id, GetProviderInfo_CUDA()->CreateCudaAllocator(id, gpu_mem_limit, arena_extend_strategy, external_allocator_info)}); } return id_to_allocator_map[id]; } static void CpuToCudaMemCpy(void* dst, const void* src, size_t num_bytes) { - CUDA_CALL_THROW(cudaMemcpy(dst, src, num_bytes, cudaMemcpyHostToDevice)); + GetProviderInfo_CUDA()->cudaMemcpy_HostToDevice(dst, src, num_bytes); } static void CudaToCpuMemCpy(void* dst, const void* src, size_t num_bytes) { - CUDA_CALL_THROW(cudaMemcpy(dst, src, num_bytes, cudaMemcpyDeviceToHost)); + GetProviderInfo_CUDA()->cudaMemcpy_DeviceToHost(dst, src, num_bytes); } static const std::unordered_map* GetCudaToHostMemCpyFunction() { @@ -613,26 +616,23 @@ static void RegisterExecutionProviders(InferenceSession* sess, const std::vector } else if (type == kCudaExecutionProvider) { #ifdef USE_CUDA const auto it = provider_options_map.find(type); - const CUDAExecutionProviderInfo info = - it != provider_options_map.end() - ? CUDAExecutionProviderInfo::FromProviderOptions(it->second) - : [&]() { - CUDAExecutionProviderInfo info{}; - info.device_id = cuda_device_id; - info.gpu_mem_limit = gpu_mem_limit; - info.arena_extend_strategy = arena_extend_strategy; - info.cudnn_conv_algo_search = cudnn_conv_algo_search; - info.do_copy_in_default_stream = do_copy_in_default_stream; - info.external_allocator_info = external_allocator_info; - return info; - }(); + CUDAExecutionProviderInfo info{}; + if (it != provider_options_map.end()) + GetProviderInfo_CUDA()->CUDAExecutionProviderInfo__FromProviderOptions(it->second, info); + else { + info.device_id = cuda_device_id; + info.gpu_mem_limit = gpu_mem_limit; + info.arena_extend_strategy = arena_extend_strategy; + info.cudnn_conv_algo_search = cudnn_conv_algo_search; + info.do_copy_in_default_stream = do_copy_in_default_stream; + info.external_allocator_info = external_allocator_info; + } // This variable is never initialized because the APIs by which is it should be initialized are deprecated, however they still // exist are are in-use. Neverthless, it is used to return CUDAAllocator, hence we must try to initialize it here if we can // since FromProviderOptions might contain external CUDA allocator. external_allocator_info = info.external_allocator_info; - RegisterExecutionProvider( - sess, *onnxruntime::CreateExecutionProviderFactory_CUDA(info)); + RegisterExecutionProvider(sess, *GetProviderInfo_CUDA()->CreateExecutionProviderFactory(info)); #endif } else if (type == kRocmExecutionProvider) { #ifdef USE_ROCM @@ -649,7 +649,7 @@ static void RegisterExecutionProviders(InferenceSession* sess, const std::vector return info; }(); - // This variable is never initialized because the APIs by which is it should be initialized are deprecated, however they still + // This variable is never initialized because the APIs by which is it should be initialized are deprecated, however they still // exist are are in-use. Neverthless, it is used to return CUDAAllocator, hence we must try to initialize it here if we can // since FromProviderOptions might contain external CUDA allocator. external_allocator_info = info.external_allocator_info;