diff --git a/onnxruntime/core/providers/cuda/cuda_profiler.cc b/onnxruntime/core/providers/cuda/cuda_profiler.cc index 89ffce9f0e..d7258313c0 100644 --- a/onnxruntime/core/providers/cuda/cuda_profiler.cc +++ b/onnxruntime/core/providers/cuda/cuda_profiler.cc @@ -16,6 +16,8 @@ std::atomic_flag CudaProfiler::enabled{0}; std::vector CudaProfiler::stats; std::unordered_map CudaProfiler::id_map; +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + #define BUF_SIZE (32 * 1024) #define ALIGN_SIZE (8) #define ALIGN_BUFFER(buffer, align) \ @@ -62,7 +64,7 @@ void CUPTIAPI CudaProfiler::BufferCompleted(CUcontext, uint32_t, uint8_t* buffer do { status = cuptiActivityGetNextRecord(buffer, validSize, &record); if (status == CUPTI_SUCCESS) { - if (CUPTI_ACTIVITY_KIND_KERNEL == record->kind) { + if (CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL == record->kind) { CUpti_ActivityKernel3* kernel = (CUpti_ActivityKernel3*)record; stats.push_back({kernel->name, kernel->streamId, kernel->gridX, kernel->gridY, kernel->gridZ, @@ -93,7 +95,7 @@ bool CudaProfiler::StartProfiling() { if (!enabled.test_and_set()) { if (cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME) == CUPTI_SUCCESS && cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER) == CUPTI_SUCCESS && - cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL) == CUPTI_SUCCESS && + cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) == CUPTI_SUCCESS && cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY) == CUPTI_SUCCESS && cuptiActivityEnable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION) == CUPTI_SUCCESS && cuptiActivityRegisterCallbacks(BufferRequested, BufferCompleted) == CUPTI_SUCCESS) { @@ -179,7 +181,7 @@ void CudaProfiler::Stop(uint64_t) { void CudaProfiler::DisableEvents() { cuptiActivityDisable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION); - cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL); + cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL); cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY); cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER); 
cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME); @@ -194,6 +196,20 @@ void CudaProfiler::Clear() { } } +#else // CUDA_VERSION undefined or older than 11.0 (e.g. CUDA 10.x): profiling compiled out, no-op stubs only + +void CUPTIAPI CudaProfiler::BufferRequested(uint8_t**, size_t*, size_t*) {} +void CUPTIAPI CudaProfiler::BufferCompleted(CUcontext, uint32_t, uint8_t*, size_t, size_t) {} +bool CudaProfiler::StartProfiling() { return false; } +void CudaProfiler::EndProfiling(TimePoint, Events&) {} +CudaProfiler::~CudaProfiler() {} +void CudaProfiler::Start(uint64_t) {} +void CudaProfiler::Stop(uint64_t) {} +void CudaProfiler::DisableEvents() {} +void CudaProfiler::Clear() {} + +#endif + } // namespace profiling } // namespace onnxruntime #endif \ No newline at end of file diff --git a/onnxruntime/core/providers/cuda/cuda_profiler.h b/onnxruntime/core/providers/cuda/cuda_profiler.h index b4eb314f49..881f150f23 100644 --- a/onnxruntime/core/providers/cuda/cuda_profiler.h +++ b/onnxruntime/core/providers/cuda/cuda_profiler.h @@ -2,23 +2,7 @@ // Licensed under the MIT License. 
#include "core/common/profiler_common.h" -#if defined(USE_ROCM) || defined(ENABLE_TRAINING) -namespace onnxruntime { - -namespace profiling { - -class CudaProfiler final : public EpProfiler { - public: - bool StartProfiling() override { return true; } - void EndProfiling(TimePoint, Events&) override{}; - void Start(uint64_t) override{}; - void Stop(uint64_t) override{}; -}; - -} -} - -#else +#if !(defined(USE_ROCM) || defined(ENABLE_TRAINING)) #include "core/platform/ort_mutex.h" #include @@ -78,4 +62,22 @@ class CudaProfiler final : public EpProfiler { } // namespace profiling } // namespace onnxruntime + +#else + +namespace onnxruntime { + +namespace profiling { + +class CudaProfiler final : public EpProfiler { + public: + bool StartProfiling() override { return true; } + void EndProfiling(TimePoint, Events&) override{}; + void Start(uint64_t) override{}; + void Stop(uint64_t) override{}; +}; + +} +} + #endif \ No newline at end of file diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index ebad35c608..c32ebca5ec 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -653,7 +653,7 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) { } } -#if defined(USE_CUDA) && !defined(ENABLE_TRAINING) +#if defined(USE_CUDA) && !defined(ENABLE_TRAINING) && defined(CUDA_VERSION) && CUDA_VERSION >= 11000 ASSERT_TRUE(has_kernel_info); #endif }