From aeca7c29408f03ad9e19837085156cba3c117263 Mon Sep 17 00:00:00 2001 From: RandySheriffH <48490400+RandySheriffH@users.noreply.github.com> Date: Mon, 29 Mar 2021 12:04:36 -0700 Subject: [PATCH] Cuda Profiler (#7110) * implement cuda profiler * add counters * downgrade cupti kernel version * move mutex * add cupti to path * fix win gpu build err * add path for cuda10 * fix linux com err * extend include path * add init flag * fix test case * fix tensorrt pipeline * add UT Co-authored-by: Ubuntu --- cmake/onnxruntime_common.cmake | 6 + .../onnxruntime/core/common/logging/logging.h | 4 +- onnxruntime/core/common/profiler.cc | 154 +++++++++++++++++- .../test/framework/inference_session_test.cc | 68 ++++---- .../test/python/onnxruntime_test_python.py | 6 +- .../github/windows/setup_env_cuda.bat | 2 +- .../github/windows/setup_env_cuda_11.bat | 2 +- .../ci_build/github/windows/setup_env_trt.bat | 2 +- 8 files changed, 207 insertions(+), 37 deletions(-) diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake index c9dd4326cc..d414dd1f51 100644 --- a/cmake/onnxruntime_common.cmake +++ b/cmake/onnxruntime_common.cmake @@ -79,6 +79,12 @@ source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_common_src}) add_library(onnxruntime_common ${onnxruntime_common_src}) +if (onnxruntime_USE_CUDA) + target_include_directories(onnxruntime_common PUBLIC ${onnxruntime_CUDA_HOME}/include ${onnxruntime_CUDA_HOME}/extras/CUPTI/include) + target_link_directories(onnxruntime_common PUBLIC ${onnxruntime_CUDA_HOME}/extras/CUPTI/lib64) + target_link_libraries(onnxruntime_common cupti) +endif() + if (onnxruntime_USE_TELEMETRY) set_target_properties(onnxruntime_common PROPERTIES COMPILE_FLAGS "/FI${ONNXRUNTIME_INCLUDE_DIR}/core/platform/windows/TraceLoggingConfigPrivate.h") endif() diff --git a/include/onnxruntime/core/common/logging/logging.h b/include/onnxruntime/core/common/logging/logging.h index 96c07f09c4..95b6a4aa6d 100644 --- a/include/onnxruntime/core/common/logging/logging.h +++ b/include/onnxruntime/core/common/logging/logging.h @@ -55,6 +55,7 @@ namespace profiling { enum EventCategory { SESSION_EVENT = 0, NODE_EVENT, + KERNEL_EVENT, EVENT_CATEGORY_MAX }; @@ -63,7 +64,8 @@ Event descriptions for the above session events. */ static constexpr const char* event_categor_names_[EVENT_CATEGORY_MAX] = { "Session", - "Node"}; + "Node", + "Kernel"}; /* Timing record for all events. diff --git a/onnxruntime/core/common/profiler.cc b/onnxruntime/core/common/profiler.cc index ee752a6a62..3679501a47 100644 --- a/onnxruntime/core/common/profiler.cc +++ b/onnxruntime/core/common/profiler.cc @@ -2,11 +2,148 @@ // Licensed under the MIT License. #include "profiler.h" +#include + +#ifdef USE_CUDA +#include +#endif namespace onnxruntime { namespace profiling { using namespace std::chrono; +class DeviceProfiler { + public: + static DeviceProfiler* GetDeviceProfiler(); + virtual void StartProfiling(TimePoint start_time, int pid, int tid) = 0; + virtual std::vector EndProfiling() = 0; + virtual ~DeviceProfiler() = default; +}; + +#ifdef USE_CUDA +#define BUF_SIZE (32 * 1024) +#define ALIGN_SIZE (8) +#define ALIGN_BUFFER(buffer, align) \ + (((uintptr_t)(buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1))) : (buffer)) +#define DUR(s, e) std::lround(static_cast(e - s) / 1000) + +class CudaProfiler final: public DeviceProfiler { + public: + friend class DeviceProfiler; + ~CudaProfiler() = default; + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CudaProfiler); + void StartProfiling(TimePoint start_time, int pid, int tid) override; + std::vector EndProfiling() override; + private: + CudaProfiler() = default; + static void CUPTIAPI BufferRequested(uint8_t**, size_t*, size_t*); + static void CUPTIAPI BufferCompleted(CUcontext, uint32_t, uint8_t*, size_t, size_t); + struct KernelStat { + std::string name_ = {}; + uint32_t stream_ = 0; + int32_t grid_x_ = 0; + int32_t grid_y_ = 0; + int32_t grid_z_ = 0; + int32_t block_x_ = 0; + int32_t block_y_ = 0; + int32_t block_z_ = 0; + int64_t start_ = 0; + int64_t stop_ = 0; + }; + static OrtMutex mutex_; + static std::vector stats_; + bool initialized_ = false; + TimePoint start_time_; + int pid_ = 0; + int tid_ = 0; + static std::atomic_flag enabled_; +}; + +OrtMutex CudaProfiler::mutex_; +std::vector CudaProfiler::stats_; +std::atomic_flag CudaProfiler::enabled_; + +void CUPTIAPI CudaProfiler::BufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords) { + uint8_t* bfr = (uint8_t*)malloc(BUF_SIZE + ALIGN_SIZE); + ORT_ENFORCE(bfr, "Failed to allocate memory for cuda kernel profiling."); + *size = BUF_SIZE; + *buffer = ALIGN_BUFFER(bfr, ALIGN_SIZE); + *maxNumRecords = 0; +} + +void CUPTIAPI CudaProfiler::BufferCompleted(CUcontext, uint32_t, uint8_t* buffer, size_t, size_t validSize) { + CUptiResult status; + CUpti_Activity* record = NULL; + if (validSize > 0) { + std::unique_lock lock(mutex_); + do { + status = cuptiActivityGetNextRecord(buffer, validSize, &record); + if (status == CUPTI_SUCCESS) { + if (CUPTI_ACTIVITY_KIND_KERNEL == record->kind) { + CUpti_ActivityKernel4* kernel = (CUpti_ActivityKernel4*)record; + stats_.push_back({kernel->name, kernel->streamId, + kernel->gridX, kernel->gridY, kernel->gridZ, + kernel->blockX, kernel->blockY, kernel->blockZ, + static_cast(kernel->start), + static_cast(kernel->end)}); + } + } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) { + break; + } + } while (1); + } + free(buffer); +} + +void CudaProfiler::StartProfiling(TimePoint start_time, int pid, int tid) { + if (!enabled_.test_and_set()) { + start_time_ = start_time; + pid_ = pid; + tid_ = tid; + if (cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL) == CUPTI_SUCCESS && + cuptiActivityRegisterCallbacks(BufferRequested, BufferCompleted) == CUPTI_SUCCESS) { + initialized_ = true; + } + } +} + +std::vector CudaProfiler::EndProfiling() { + std::vector events; + if (enabled_.test_and_set()) { + if (initialized_) { + cuptiActivityFlushAll(1); + std::unique_lock lock(mutex_); + int64_t profiling_start = std::chrono::duration_cast(start_time_.time_since_epoch()).count(); + for (const auto& stat : stats_) { + std::initializer_list> args = {{"stream", std::to_string(stat.stream_)}, + {"grid_x", std::to_string(stat.grid_x_)}, + {"grid_y", std::to_string(stat.grid_y_)}, + {"grid_z", std::to_string(stat.grid_z_)}, + {"block_x", std::to_string(stat.block_x_)}, + {"block_y", std::to_string(stat.block_y_)}, + {"block_z", std::to_string(stat.block_z_)}}; + events.push_back({EventCategory::KERNEL_EVENT, pid_, tid_, stat.name_, DUR(profiling_start, stat.stop_), DUR(stat.start_, stat.stop_), {args.begin(), args.end()}}); + } + stats_.clear(); + } else { + std::initializer_list> args; + events.push_back({EventCategory::KERNEL_EVENT, pid_, tid_, "not_available_due_to_cupti_error", 0, 0, {args.begin(), args.end()}}); + } + } + enabled_.clear(); + return events; +} +#endif //USE_CUDA + +DeviceProfiler* DeviceProfiler::GetDeviceProfiler() { +#ifdef USE_CUDA + static CudaProfiler cuda_profiler; + return &cuda_profiler; +#else + return nullptr; +#endif +} + std::atomic Profiler::global_max_num_events_{1000 * 1000}; #ifdef ENABLE_STATIC_PROFILER_INSTANCE @@ -16,7 +153,8 @@ profiling::Profiler::~Profiler() { instance_ = nullptr; } #else -profiling::Profiler::~Profiler() {} +profiling::Profiler::~Profiler() { +} #endif ::onnxruntime::TimePoint profiling::Profiler::StartTime() const { @@ -43,6 +181,10 @@ void Profiler::StartProfiling(const logging::Logger* custom_logger) { profile_with_logger_ = true; custom_logger_ = custom_logger; profiling_start_time_ = StartTime(); + DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler(); + if (device_profiler) { + device_profiler->StartProfiling(profiling_start_time_, logging::GetProcessId(), logging::GetThreadId()); + } } template @@ -51,6 +193,10 @@ void Profiler::StartProfiling(const std::basic_string& file_name) { profile_stream_.open(file_name, std::ios::out | std::ios::trunc); profile_stream_file_ = ToMBString(file_name); profiling_start_time_ = StartTime(); + DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler(); + if (device_profiler) { + device_profiler->StartProfiling(profiling_start_time_, logging::GetProcessId(), logging::GetThreadId()); + } } template void Profiler::StartProfiling(const std::basic_string& file_name); @@ -101,6 +247,12 @@ std::string Profiler::EndProfiling() { std::lock_guard lock(mutex_); profile_stream_ << "[\n"; + DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler(); + if (device_profiler) { + std::vector device_events = device_profiler->EndProfiling(); + std::copy(device_events.begin(), device_events.end(), std::back_inserter(events_)); + } + for (size_t i = 0; i < events_.size(); ++i) { auto& rec = events_[i]; profile_stream_ << R"({"cat" : ")" << event_categor_names_[rec.cat] << "\","; diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 4db59b3bc8..1e20226232 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -590,6 +590,11 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) { so.profile_file_prefix = ORT_TSTR("onnxprofile_profile_test"); InferenceSession session_object(so, GetEnvironment()); +#ifdef USE_CUDA + CUDAExecutionProviderInfo epi; + epi.device_id = 0; + EXPECT_TRUE(session_object.RegisterExecutionProvider(onnxruntime::make_unique(epi)).IsOK()); +#endif ASSERT_STATUS_OK(session_object.Load(MODEL_URI)); ASSERT_STATUS_OK(session_object.Initialize()); @@ -602,25 +607,29 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) { std::ifstream profile(profile_file); ASSERT_TRUE(profile); std::string line; + std::vector lines; - std::vector tags = {"pid", "dur", "ts", "ph", "X", "name", "args"}; - int count = 0; while (std::getline(profile, line)) { - if (count == 0) { - ASSERT_TRUE(line.find("[") != string::npos); - } else if (count <= 7) { - for (auto& s : tags) { - ASSERT_TRUE(line.find(s) != string::npos); - } - } else { - ASSERT_TRUE(line.find("]") != string::npos); - } - - if (count == 1) { - ASSERT_TRUE(line.find("model_loading_uri") != string::npos); - } - count++; + lines.push_back(line); } + + auto size = lines.size(); + ASSERT_TRUE(size > 1); + ASSERT_TRUE(lines[0].find("[") != string::npos); + ASSERT_TRUE(lines[1].find("model_loading_uri") != string::npos); + ASSERT_TRUE(lines[size-1].find("]") != string::npos); + std::vector tags = {"pid", "dur", "ts", "ph", "X", "name", "args"}; + + bool has_kernel_info = false; + for (size_t i = 1; i < size - 1; ++i) { + for (auto& s : tags) { + ASSERT_TRUE(lines[i].find(s) != string::npos); + has_kernel_info = has_kernel_info || (lines[i].find("Kernel") != string::npos); + } + } +#ifdef USE_CUDA + ASSERT_TRUE(has_kernel_info); +#endif } TEST(InferenceSessionTests, CheckRunProfilerWithStartProfile) { @@ -641,24 +650,23 @@ TEST(InferenceSessionTests, CheckRunProfilerWithStartProfile) { std::ifstream profile(profile_file); std::string line; + std::vector lines; - std::vector tags = {"pid", "dur", "ts", "ph", "X", "name", "args"}; - int count = 0; while (std::getline(profile, line)) { - if (count == 0) { - ASSERT_TRUE(line.find("[") != string::npos); - } else if (count <= 5) { - for (auto& s : tags) { - ASSERT_TRUE(line.find(s) != string::npos); - } - } else { - ASSERT_TRUE(line.find("]") != string::npos); - } + lines.push_back(line); + } - if (count == 1) { - ASSERT_TRUE(line.find("mul_1_fence_before") != string::npos); + auto size = lines.size(); + ASSERT_TRUE(size > 1); + ASSERT_TRUE(lines[0].find("[") != string::npos); + ASSERT_TRUE(lines[1].find("mul_1_fence_before") != string::npos); + ASSERT_TRUE(lines[size - 1].find("]") != string::npos); + std::vector tags = {"pid", "dur", "ts", "ph", "X", "name", "args"}; + + for (size_t i = 1; i < size - 1; ++i) { + for (auto& s : tags) { + ASSERT_TRUE(lines[i].find(s) != string::npos); } - count++; } } diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index fd711f3537..545cd6e561 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -535,11 +535,13 @@ class TestInferenceSession(unittest.TestCase): tags = ['pid', 'dur', 'ts', 'ph', 'X', 'name', 'args'] with open(profile_file) as f: lines = f.readlines() + lines_len = len(lines) + self.assertTrue(lines_len > 8) self.assertTrue('[' in lines[0]) - for i in range(1, 8): + for i in range(1, lines_len-1): for tag in tags: self.assertTrue(tag in lines[i]) - self.assertTrue(']' in lines[8]) + self.assertTrue(']' in lines[-1]) def testProfilerGetStartTimeNs(self): def getSingleSessionProfilingStartTime(): diff --git a/tools/ci_build/github/windows/setup_env_cuda.bat b/tools/ci_build/github/windows/setup_env_cuda.bat index 9e51d68256..b89dbb16db 100644 --- a/tools/ci_build/github/windows/setup_env_cuda.bat +++ b/tools/ci_build/github/windows/setup_env_cuda.bat @@ -1,2 +1,2 @@ -set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin;C:\local\cudnn-10.2-windows10-x64-v8.0.3.33\cuda\bin;%PATH% +set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\extras\CUPTI\lib64;C:\local\cudnn-10.2-windows10-x64-v8.0.3.33\cuda\bin;%PATH% set GRADLE_OPTS=-Dorg.gradle.daemon=false diff --git a/tools/ci_build/github/windows/setup_env_cuda_11.bat b/tools/ci_build/github/windows/setup_env_cuda_11.bat index 1ab8e7ee73..a10f5d7b68 100644 --- a/tools/ci_build/github/windows/setup_env_cuda_11.bat +++ b/tools/ci_build/github/windows/setup_env_cuda_11.bat @@ -1,2 +1,2 @@ -set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin;C:\local\cudnn-11.0-windows-x64-v8.0.2.39\cuda\bin;%PATH% +set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\extras\CUPTI\lib64;C:\local\cudnn-11.0-windows-x64-v8.0.2.39\cuda\bin;%PATH% set GRADLE_OPTS=-Dorg.gradle.daemon=false diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat index f5ff7efb45..41524a3465 100644 --- a/tools/ci_build/github/windows/setup_env_trt.bat +++ b/tools/ci_build/github/windows/setup_env_trt.bat @@ -1,2 +1,2 @@ -set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin;C:\local\cudnn-11.1-windows-x64-v8.0.5.39\cuda\bin;%PATH% +set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\extras\CUPTI\lib64;C:\local\cudnn-11.1-windows-x64-v8.0.5.39\cuda\bin;%PATH% set GRADLE_OPTS=-Dorg.gradle.daemon=false