From 6a889ee8bf72f34f44b8a4642728b7bd83f61255 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Wed, 19 May 2021 13:00:37 -0700 Subject: [PATCH] Remove cupti library dependency from core library --- cmake/onnxruntime_common.cmake | 2 -- cmake/onnxruntime_providers.cmake | 5 +++ .../providers/cuda/cuda_provider_factory.h | 5 +++ .../core/framework/provider_bridge_ort.cc | 34 +++++++++++++++++++ .../providers/cuda/cuda_provider_factory.cc | 17 ++++++++++ 5 files changed, 61 insertions(+), 2 deletions(-) diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake index 61af21ab36..10010299da 100644 --- a/cmake/onnxruntime_common.cmake +++ b/cmake/onnxruntime_common.cmake @@ -92,8 +92,6 @@ onnxruntime_add_static_library(onnxruntime_common ${onnxruntime_common_src}) if (onnxruntime_USE_CUDA) target_include_directories(onnxruntime_common PUBLIC ${onnxruntime_CUDA_HOME}/include ${onnxruntime_CUDA_HOME}/extras/CUPTI/include) - target_link_directories(onnxruntime_common PUBLIC ${onnxruntime_CUDA_HOME}/extras/CUPTI/lib64) - target_link_libraries(onnxruntime_common cupti) endif() if (onnxruntime_USE_TELEMETRY) diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index a74633ae7d..c72b69eded 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -316,6 +316,11 @@ if (onnxruntime_USE_CUDA) set_target_properties(onnxruntime_providers_cuda PROPERTIES LINKER_LANGUAGE CUDA) set_target_properties(onnxruntime_providers_cuda PROPERTIES FOLDER "ONNXRuntime") + # Add in Cupti profiling dependency + target_include_directories(onnxruntime_providers_cuda PUBLIC ${onnxruntime_CUDA_HOME}/include ${onnxruntime_CUDA_HOME}/extras/CUPTI/include) + target_link_directories(onnxruntime_providers_cuda PUBLIC ${onnxruntime_CUDA_HOME}/extras/CUPTI/lib64) + target_link_libraries(onnxruntime_providers_cuda PRIVATE cupti) + if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11) target_include_directories(onnxruntime_providers_cuda 
PRIVATE ${PROJECT_SOURCE_DIR}/external/cub) endif() diff --git a/include/onnxruntime/core/providers/cuda/cuda_provider_factory.h b/include/onnxruntime/core/providers/cuda/cuda_provider_factory.h index 6fcf582684..532200280f 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_provider_factory.h +++ b/include/onnxruntime/core/providers/cuda/cuda_provider_factory.h @@ -40,6 +40,11 @@ struct ProviderInfo_CUDA { virtual int cudaGetDeviceCount() = 0; virtual void CUDAExecutionProviderInfo__FromProviderOptions(const onnxruntime::ProviderOptions& options, onnxruntime::CUDAExecutionProviderInfo& info) = 0; + virtual int cuptiActivityEnable(int kind) = 0; + virtual int cuptiActivityGetNextRecord(uint8_t* buffer, size_t validBufferSizeBytes, void* record) = 0; + virtual int cuptiActivityRegisterCallbacks(void* funcBufferRequested, void* funcBufferCompleted) = 0; + virtual int cuptiActivityFlushAll(uint32_t flag) = 0; + #if defined(USE_CUDA) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) virtual onnxruntime::cuda::INcclService& GetINcclService() = 0; #endif diff --git a/onnxruntime/core/framework/provider_bridge_ort.cc b/onnxruntime/core/framework/provider_bridge_ort.cc index 0d26c876ab..1bec9536a4 100644 --- a/onnxruntime/core/framework/provider_bridge_ort.cc +++ b/onnxruntime/core/framework/provider_bridge_ort.cc @@ -54,6 +54,10 @@ Status LongformerAttentionBase__CheckInputs(const LongformerAttentionBase* p, co #include "contrib_ops/cpu/bert/attention_base.h" #endif +#ifdef USE_CUDA +#include <cupti.h> +#endif + #ifdef ENABLE_TRAINING #include "orttraining/training_ops/cpu/aten_ops/aten_op.h" #include "orttraining/training_ops/cpu/controlflow/group.h" @@ -1081,6 +1085,36 @@ INcclService& INcclService::GetInstance() { } // namespace onnxruntime +#if defined(USE_CUDA) +CUptiResult CUPTIAPI cuptiActivityEnable(CUpti_ActivityKind kind) { + auto* info = onnxruntime::GetProviderInfo_CUDA(); + if (info) + return CUptiResult(info->cuptiActivityEnable(kind)); + return 
CUPTI_ERROR_NOT_SUPPORTED; +} + +CUptiResult CUPTIAPI cuptiActivityGetNextRecord(uint8_t* buffer, size_t validBufferSizeBytes, CUpti_Activity** record) { + auto* info = onnxruntime::GetProviderInfo_CUDA(); + if (info) + return CUptiResult(info->cuptiActivityGetNextRecord(buffer, validBufferSizeBytes, record)); + return CUPTI_ERROR_NOT_SUPPORTED; +} + +CUptiResult CUPTIAPI cuptiActivityRegisterCallbacks(CUpti_BuffersCallbackRequestFunc funcBufferRequested, CUpti_BuffersCallbackCompleteFunc funcBufferCompleted) { + auto* info = onnxruntime::GetProviderInfo_CUDA(); + if (info) + return CUptiResult(info->cuptiActivityRegisterCallbacks(funcBufferRequested, funcBufferCompleted)); + return CUPTI_ERROR_NOT_SUPPORTED; +} + +CUptiResult CUPTIAPI cuptiActivityFlushAll(uint32_t flag) { + auto* info = onnxruntime::GetProviderInfo_CUDA(); + if (info) + return CUptiResult(info->cuptiActivityFlushAll(flag)); + return CUPTI_ERROR_NOT_SUPPORTED; +} +#endif + ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_Dnnl, _In_ OrtSessionOptions* options, int use_arena) { auto factory = onnxruntime::CreateExecutionProviderFactory_Dnnl(use_arena); if (!factory) { diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc index 20906f349c..bf977a3f3f 100644 --- a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc +++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc @@ -6,6 +6,7 @@ #include "core/providers/cuda/cuda_provider_factory.h" #include +#include <cupti.h> #include "gsl/gsl" @@ -132,6 +133,22 @@ struct ProviderInfo_CUDA_Impl : ProviderInfo_CUDA { info = CUDAExecutionProviderInfo::FromProviderOptions(options); } + int cuptiActivityEnable(int kind) override { + return ::cuptiActivityEnable(CUpti_ActivityKind(kind)); + } + + int cuptiActivityGetNextRecord(uint8_t* buffer, size_t validBufferSizeBytes, void* record) override { + return ::cuptiActivityGetNextRecord(buffer, validBufferSizeBytes, 
reinterpret_cast<CUpti_Activity**>(record)); + } + + int cuptiActivityRegisterCallbacks(void* funcBufferRequested, void* funcBufferCompleted) override { + return ::cuptiActivityRegisterCallbacks(reinterpret_cast<CUpti_BuffersCallbackRequestFunc>(funcBufferRequested), reinterpret_cast<CUpti_BuffersCallbackCompleteFunc>(funcBufferCompleted)); + } + + int cuptiActivityFlushAll(uint32_t flag) override { + return ::cuptiActivityFlushAll(flag); + } + #if defined(USE_CUDA) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) cuda::INcclService& GetINcclService() override { return cuda::GetINcclService();