From aeca7c29408f03ad9e19837085156cba3c117263 Mon Sep 17 00:00:00 2001
From: RandySheriffH <48490400+RandySheriffH@users.noreply.github.com>
Date: Mon, 29 Mar 2021 12:04:36 -0700
Subject: [PATCH] Cuda Profiler  (#7110)

* implement cuda profiler

* add counters

* downgrade cupti kernel version

* move mutex

* add cupti to path

* fix win gpu build err

* add path for cuda10

* fix linux com err

* extend include path

* add init flag

* fix test case

* fix tensorrt pipeline

* add UT

Co-authored-by: Ubuntu <randysheriff@rashuai-linux-gpu-3.3cfnmjowvu4e5bidlsmcxsmzwg.xx.internal.cloudapp.net>
---
 cmake/onnxruntime_common.cmake                |   6 +
 .../onnxruntime/core/common/logging/logging.h |   4 +-
 onnxruntime/core/common/profiler.cc           | 154 +++++++++++++++++-
 .../test/framework/inference_session_test.cc  |  68 ++++----
 .../test/python/onnxruntime_test_python.py    |   6 +-
 .../github/windows/setup_env_cuda.bat         |   2 +-
 .../github/windows/setup_env_cuda_11.bat      |   2 +-
 .../ci_build/github/windows/setup_env_trt.bat |   2 +-
 8 files changed, 207 insertions(+), 37 deletions(-)
diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake
index c9dd4326cc..d414dd1f51 100644
--- a/cmake/onnxruntime_common.cmake
+++ b/cmake/onnxruntime_common.cmake
@@ -79,6 +79,12 @@ source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_common_src})
 
 add_library(onnxruntime_common ${onnxruntime_common_src})
 
+if (onnxruntime_USE_CUDA)
+  target_include_directories(onnxruntime_common PUBLIC ${onnxruntime_CUDA_HOME}/include ${onnxruntime_CUDA_HOME}/extras/CUPTI/include)
+  target_link_directories(onnxruntime_common PUBLIC ${onnxruntime_CUDA_HOME}/extras/CUPTI/lib64)
+  target_link_libraries(onnxruntime_common cupti)
+endif()
+
 if (onnxruntime_USE_TELEMETRY)
   set_target_properties(onnxruntime_common PROPERTIES COMPILE_FLAGS "/FI${ONNXRUNTIME_INCLUDE_DIR}/core/platform/windows/TraceLoggingConfigPrivate.h")
 endif()
diff --git a/include/onnxruntime/core/common/logging/logging.h b/include/onnxruntime/core/common/logging/logging.h
index 96c07f09c4..95b6a4aa6d 100644
--- a/include/onnxruntime/core/common/logging/logging.h
+++ b/include/onnxruntime/core/common/logging/logging.h
@@ -55,6 +55,7 @@ namespace profiling {
 enum EventCategory {
   SESSION_EVENT = 0,
   NODE_EVENT,
+  KERNEL_EVENT,
   EVENT_CATEGORY_MAX
 };
 
@@ -63,7 +64,8 @@ Event descriptions for the above session events.
 */
 static constexpr const char* event_categor_names_[EVENT_CATEGORY_MAX] = {
     "Session",
-    "Node"};
+    "Node",
+    "Kernel"};
 
 /*
 Timing record for all events.
diff --git a/onnxruntime/core/common/profiler.cc b/onnxruntime/core/common/profiler.cc
index ee752a6a62..3679501a47 100644
--- a/onnxruntime/core/common/profiler.cc
+++ b/onnxruntime/core/common/profiler.cc
@@ -2,11 +2,148 @@
 // Licensed under the MIT License.
 
 #include "profiler.h"
+#include <cmath>
+
+#ifdef USE_CUDA
+#include <cupti.h>
+#endif
 
 namespace onnxruntime {
 namespace profiling {
 using namespace std::chrono;
 
+class DeviceProfiler {
+ public:
+  static DeviceProfiler* GetDeviceProfiler();
+  virtual void StartProfiling(TimePoint start_time, int pid, int tid) = 0;
+  virtual std::vector<EventRecord> EndProfiling() = 0;
+  virtual ~DeviceProfiler() = default;
+};
+
+#ifdef USE_CUDA
+#define BUF_SIZE (32 * 1024)
+#define ALIGN_SIZE (8)
+#define ALIGN_BUFFER(buffer, align) \
+  (((uintptr_t)(buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1))) : (buffer))
+#define DUR(s, e) std::lround(static_cast<double>(e - s) / 1000)
+
+class CudaProfiler final: public DeviceProfiler {
+ public:
+  friend class DeviceProfiler;
+  ~CudaProfiler() = default;
+  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CudaProfiler);
+  void StartProfiling(TimePoint start_time, int pid, int tid) override;
+  std::vector<EventRecord> EndProfiling() override;
+ private:
+  CudaProfiler() = default;
+  static void CUPTIAPI BufferRequested(uint8_t**, size_t*, size_t*);
+  static void CUPTIAPI BufferCompleted(CUcontext, uint32_t, uint8_t*, size_t, size_t);
+  struct KernelStat {
+    std::string name_ = {};
+    uint32_t stream_ = 0;
+    int32_t grid_x_ = 0;
+    int32_t grid_y_ = 0;
+    int32_t grid_z_ = 0;
+    int32_t block_x_ = 0;
+    int32_t block_y_ = 0;
+    int32_t block_z_ = 0;
+    int64_t start_ = 0;
+    int64_t stop_ = 0;
+  };
+  static OrtMutex mutex_;
+  static std::vector<KernelStat> stats_;
+  bool initialized_ = false;
+  TimePoint start_time_;
+  int pid_ = 0;
+  int tid_ = 0;
+  static std::atomic_flag enabled_;
+};
+
+OrtMutex CudaProfiler::mutex_;
+std::vector<CudaProfiler::KernelStat> CudaProfiler::stats_;
+std::atomic_flag CudaProfiler::enabled_;
+
+void CUPTIAPI CudaProfiler::BufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords) {
+  uint8_t* bfr = (uint8_t*)malloc(BUF_SIZE + ALIGN_SIZE);
+  ORT_ENFORCE(bfr, "Failed to allocate memory for cuda kernel profiling.");
+  *size = BUF_SIZE;
+  *buffer = ALIGN_BUFFER(bfr, ALIGN_SIZE);
+  *maxNumRecords = 0;
+}
+
+void CUPTIAPI CudaProfiler::BufferCompleted(CUcontext, uint32_t, uint8_t* buffer, size_t, size_t validSize) {
+  CUptiResult status;
+  CUpti_Activity* record = NULL;
+  if (validSize > 0) {
+    std::unique_lock<OrtMutex> lock(mutex_);
+    do {
+      status = cuptiActivityGetNextRecord(buffer, validSize, &record);
+      if (status == CUPTI_SUCCESS) {
+        if (CUPTI_ACTIVITY_KIND_KERNEL == record->kind) {
+          CUpti_ActivityKernel4* kernel = (CUpti_ActivityKernel4*)record;
+          stats_.push_back({kernel->name, kernel->streamId,
+                            kernel->gridX, kernel->gridY, kernel->gridZ,
+                            kernel->blockX, kernel->blockY, kernel->blockZ,
+                            static_cast<int64_t>(kernel->start),
+                            static_cast<int64_t>(kernel->end)});
+        }
+      } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
+        break;
+      }
+    } while (1);
+  }
+  free(buffer);
+}
+
+void CudaProfiler::StartProfiling(TimePoint start_time, int pid, int tid) {
+  if (!enabled_.test_and_set()) {
+    start_time_ = start_time;
+    pid_ = pid;
+    tid_ = tid;
+    if (cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL) == CUPTI_SUCCESS &&
+        cuptiActivityRegisterCallbacks(BufferRequested, BufferCompleted) == CUPTI_SUCCESS) {
+      initialized_ = true;
+    }
+  }
+}
+
+std::vector<EventRecord> CudaProfiler::EndProfiling() {
+  std::vector<EventRecord> events;
+  if (enabled_.test_and_set()) {
+    if (initialized_) {
+      cuptiActivityFlushAll(1);
+      std::unique_lock<OrtMutex> lock(mutex_);
+      int64_t profiling_start = std::chrono::duration_cast<nanoseconds>(start_time_.time_since_epoch()).count();
+      for (const auto& stat : stats_) {
+        std::initializer_list<std::pair<std::string, std::string>> args = {{"stream", std::to_string(stat.stream_)},
+                                                                           {"grid_x", std::to_string(stat.grid_x_)},
+                                                                           {"grid_y", std::to_string(stat.grid_y_)},
+                                                                           {"grid_z", std::to_string(stat.grid_z_)},
+                                                                           {"block_x", std::to_string(stat.block_x_)},
+                                                                           {"block_y", std::to_string(stat.block_y_)},
+                                                                           {"block_z", std::to_string(stat.block_z_)}};
+        events.push_back({EventCategory::KERNEL_EVENT, pid_, tid_, stat.name_, DUR(profiling_start, stat.stop_), DUR(stat.start_, stat.stop_), {args.begin(), args.end()}});
+      }
+      stats_.clear();
+    } else {
+      std::initializer_list<std::pair<std::string, std::string>> args;
+      events.push_back({EventCategory::KERNEL_EVENT, pid_, tid_, "not_available_due_to_cupti_error", 0, 0, {args.begin(), args.end()}});
+    }
+  }
+  enabled_.clear();
+  return events;
+}
+#endif //USE_CUDA
+
+DeviceProfiler* DeviceProfiler::GetDeviceProfiler() {
+#ifdef USE_CUDA
+  static CudaProfiler cuda_profiler;
+  return &cuda_profiler;
+#else
+  return nullptr;
+#endif
+}
+
 std::atomic<size_t> Profiler::global_max_num_events_{1000 * 1000};
 
 #ifdef ENABLE_STATIC_PROFILER_INSTANCE
@@ -16,7 +153,8 @@ profiling::Profiler::~Profiler() {
   instance_ = nullptr;
 }
 #else
-profiling::Profiler::~Profiler() {}
+profiling::Profiler::~Profiler() {
+}
 #endif
 
 ::onnxruntime::TimePoint profiling::Profiler::StartTime() const {
@@ -43,6 +181,10 @@ void Profiler::StartProfiling(const logging::Logger* custom_logger) {
   profile_with_logger_ = true;
   custom_logger_ = custom_logger;
   profiling_start_time_ = StartTime();
+  DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler();
+  if (device_profiler) {
+    device_profiler->StartProfiling(profiling_start_time_, logging::GetProcessId(), logging::GetThreadId());
+  }
 }
 
 template <typename T>
@@ -51,6 +193,10 @@ void Profiler::StartProfiling(const std::basic_string<T>& file_name) {
   profile_stream_.open(file_name, std::ios::out | std::ios::trunc);
   profile_stream_file_ = ToMBString(file_name);
   profiling_start_time_ = StartTime();
+  DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler();
+  if (device_profiler) {
+    device_profiler->StartProfiling(profiling_start_time_, logging::GetProcessId(), logging::GetThreadId());
+  }
 }
 
 template void Profiler::StartProfiling<char>(const std::basic_string<char>& file_name);
@@ -101,6 +247,12 @@ std::string Profiler::EndProfiling() {
   std::lock_guard<OrtMutex> lock(mutex_);
   profile_stream_ << "[\n";
 
+  DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler();
+  if (device_profiler) {
+    std::vector<EventRecord> device_events = device_profiler->EndProfiling();
+    std::copy(device_events.begin(), device_events.end(), std::back_inserter(events_));
+  }
+
   for (size_t i = 0; i < events_.size(); ++i) {
     auto& rec = events_[i];
     profile_stream_ << R"({"cat" : ")" << event_categor_names_[rec.cat] << "\",";
diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc
index 4db59b3bc8..1e20226232 100644
--- a/onnxruntime/test/framework/inference_session_test.cc
+++ b/onnxruntime/test/framework/inference_session_test.cc
@@ -590,6 +590,11 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) {
   so.profile_file_prefix = ORT_TSTR("onnxprofile_profile_test");
 
   InferenceSession session_object(so, GetEnvironment());
+#ifdef USE_CUDA
+  CUDAExecutionProviderInfo epi;
+  epi.device_id = 0;
+  EXPECT_TRUE(session_object.RegisterExecutionProvider(onnxruntime::make_unique<CUDAExecutionProvider>(epi)).IsOK());
+#endif
   ASSERT_STATUS_OK(session_object.Load(MODEL_URI));
   ASSERT_STATUS_OK(session_object.Initialize());
 
@@ -602,25 +607,29 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) {
   std::ifstream profile(profile_file);
   ASSERT_TRUE(profile);
   std::string line;
+  std::vector<std::string> lines;
 
-  std::vector<std::string> tags = {"pid", "dur", "ts", "ph", "X", "name", "args"};
-  int count = 0;
   while (std::getline(profile, line)) {
-    if (count == 0) {
-      ASSERT_TRUE(line.find("[") != string::npos);
-    } else if (count <= 7) {
-      for (auto& s : tags) {
-        ASSERT_TRUE(line.find(s) != string::npos);
-      }
-    } else {
-      ASSERT_TRUE(line.find("]") != string::npos);
-    }
-
-    if (count == 1) {
-      ASSERT_TRUE(line.find("model_loading_uri") != string::npos);
-    }
-    count++;
+    lines.push_back(line);
   }
+
+  auto size = lines.size();
+  ASSERT_TRUE(size > 1);
+  ASSERT_TRUE(lines[0].find("[") != string::npos);
+  ASSERT_TRUE(lines[1].find("model_loading_uri") != string::npos);
+  ASSERT_TRUE(lines[size-1].find("]") != string::npos);
+  std::vector<std::string> tags = {"pid", "dur", "ts", "ph", "X", "name", "args"};
+
+  bool has_kernel_info = false;
+  for (size_t i = 1; i < size - 1; ++i) {
+    for (auto& s : tags) {
+      ASSERT_TRUE(lines[i].find(s) != string::npos);
+      has_kernel_info = has_kernel_info || (lines[i].find("Kernel") != string::npos);
+    }
+  }
+#ifdef USE_CUDA
+  ASSERT_TRUE(has_kernel_info);
+#endif
 }
 
 TEST(InferenceSessionTests, CheckRunProfilerWithStartProfile) {
@@ -641,24 +650,23 @@ TEST(InferenceSessionTests, CheckRunProfilerWithStartProfile) {
 
   std::ifstream profile(profile_file);
   std::string line;
+  std::vector<std::string> lines;
 
-  std::vector<std::string> tags = {"pid", "dur", "ts", "ph", "X", "name", "args"};
-  int count = 0;
   while (std::getline(profile, line)) {
-    if (count == 0) {
-      ASSERT_TRUE(line.find("[") != string::npos);
-    } else if (count <= 5) {
-      for (auto& s : tags) {
-        ASSERT_TRUE(line.find(s) != string::npos);
-      }
-    } else {
-      ASSERT_TRUE(line.find("]") != string::npos);
-    }
+    lines.push_back(line);
+  }
 
-    if (count == 1) {
-      ASSERT_TRUE(line.find("mul_1_fence_before") != string::npos);
+  auto size = lines.size();
+  ASSERT_TRUE(size > 1);
+  ASSERT_TRUE(lines[0].find("[") != string::npos);
+  ASSERT_TRUE(lines[1].find("mul_1_fence_before") != string::npos);
+  ASSERT_TRUE(lines[size - 1].find("]") != string::npos);
+  std::vector<std::string> tags = {"pid", "dur", "ts", "ph", "X", "name", "args"};
+
+  for (size_t i = 1; i < size - 1; ++i) {
+    for (auto& s : tags) {
+      ASSERT_TRUE(lines[i].find(s) != string::npos);
     }
-    count++;
   }
 }
 
diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py
index fd711f3537..545cd6e561 100644
--- a/onnxruntime/test/python/onnxruntime_test_python.py
+++ b/onnxruntime/test/python/onnxruntime_test_python.py
@@ -535,11 +535,13 @@ class TestInferenceSession(unittest.TestCase):
         tags = ['pid', 'dur', 'ts', 'ph', 'X', 'name', 'args']
         with open(profile_file) as f:
             lines = f.readlines()
+            lines_len = len(lines)
+            self.assertTrue(lines_len > 8)
             self.assertTrue('[' in lines[0])
-            for i in range(1, 8):
+            for i in range(1, lines_len-1):
                 for tag in tags:
                     self.assertTrue(tag in lines[i])
-            self.assertTrue(']' in lines[8])
+            self.assertTrue(']' in lines[-1])
 
     def testProfilerGetStartTimeNs(self):
         def getSingleSessionProfilingStartTime():
diff --git a/tools/ci_build/github/windows/setup_env_cuda.bat b/tools/ci_build/github/windows/setup_env_cuda.bat
index 9e51d68256..b89dbb16db 100644
--- a/tools/ci_build/github/windows/setup_env_cuda.bat
+++ b/tools/ci_build/github/windows/setup_env_cuda.bat
@@ -1,2 +1,2 @@
-set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin;C:\local\cudnn-10.2-windows10-x64-v8.0.3.33\cuda\bin;%PATH%
+set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\extras\CUPTI\lib64;C:\local\cudnn-10.2-windows10-x64-v8.0.3.33\cuda\bin;%PATH%
 set GRADLE_OPTS=-Dorg.gradle.daemon=false
diff --git a/tools/ci_build/github/windows/setup_env_cuda_11.bat b/tools/ci_build/github/windows/setup_env_cuda_11.bat
index 1ab8e7ee73..a10f5d7b68 100644
--- a/tools/ci_build/github/windows/setup_env_cuda_11.bat
+++ b/tools/ci_build/github/windows/setup_env_cuda_11.bat
@@ -1,2 +1,2 @@
-set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin;C:\local\cudnn-11.0-windows-x64-v8.0.2.39\cuda\bin;%PATH%
+set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\extras\CUPTI\lib64;C:\local\cudnn-11.0-windows-x64-v8.0.2.39\cuda\bin;%PATH%
 set GRADLE_OPTS=-Dorg.gradle.daemon=false
diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat
index f5ff7efb45..41524a3465 100644
--- a/tools/ci_build/github/windows/setup_env_trt.bat
+++ b/tools/ci_build/github/windows/setup_env_trt.bat
@@ -1,2 +1,2 @@
-set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin;C:\local\cudnn-11.1-windows-x64-v8.0.5.39\cuda\bin;%PATH%
+set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\extras\CUPTI\lib64;C:\local\cudnn-11.1-windows-x64-v8.0.5.39\cuda\bin;%PATH%
 set GRADLE_OPTS=-Dorg.gradle.daemon=false