Cuda Profiler (#7110)

* implement cuda profiler

* add counters

* downgrade cupti kernel version

* move mutex

* add cupti to path

* fix win gpu build err

* add path for cuda10

* fix linux com err

* extend include path

* add init flag

* fix test case

* fix tensorrt pipeline

* add UT

Co-authored-by: Ubuntu <randysheriff@rashuai-linux-gpu-3.3cfnmjowvu4e5bidlsmcxsmzwg.xx.internal.cloudapp.net>
This commit is contained in:
RandySheriffH 2021-03-29 12:04:36 -07:00 committed by GitHub
parent b22e60bd44
commit aeca7c2940
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 207 additions and 37 deletions

View file

@ -79,6 +79,12 @@ source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_common_src})
add_library(onnxruntime_common ${onnxruntime_common_src})
if (onnxruntime_USE_CUDA)
# CUPTI (the CUDA profiling interface used by the kernel profiler) ships under
# the toolkit's extras/CUPTI directory, not the main include/lib paths.
target_include_directories(onnxruntime_common PUBLIC ${onnxruntime_CUDA_HOME}/include ${onnxruntime_CUDA_HOME}/extras/CUPTI/include)
target_link_directories(onnxruntime_common PUBLIC ${onnxruntime_CUDA_HOME}/extras/CUPTI/lib64)
# NOTE(review): keyword-less target_link_libraries uses legacy (PUBLIC-like)
# semantics; consider an explicit PUBLIC/PRIVATE keyword — confirm against the
# signature style used elsewhere for this target before changing.
target_link_libraries(onnxruntime_common cupti)
endif()
if (onnxruntime_USE_TELEMETRY)
# Force-include the private TraceLogging config header into every TU (MSVC /FI).
set_target_properties(onnxruntime_common PROPERTIES COMPILE_FLAGS "/FI${ONNXRUNTIME_INCLUDE_DIR}/core/platform/windows/TraceLoggingConfigPrivate.h")
endif()

View file

@ -55,6 +55,7 @@ namespace profiling {
enum EventCategory {
SESSION_EVENT = 0,
NODE_EVENT,
KERNEL_EVENT,
EVENT_CATEGORY_MAX
};
@ -63,7 +64,8 @@ Event descriptions for the above session events.
*/
static constexpr const char* event_categor_names_[EVENT_CATEGORY_MAX] = {
"Session",
"Node"};
"Node",
"Kernel"};
/*
Timing record for all events.

View file

@ -2,11 +2,148 @@
// Licensed under the MIT License.
#include "profiler.h"
#include <cmath>
#ifdef USE_CUDA
#include <cupti.h>
#endif
namespace onnxruntime {
namespace profiling {
using namespace std::chrono;
// Abstract interface for a device-side profiler (e.g. a GPU profiler) that the
// session-level Profiler can drive alongside its own CPU-event collection.
class DeviceProfiler {
public:
// Returns the process-wide device profiler instance, or nullptr when no
// device profiler is compiled in (see the USE_CUDA branch below).
static DeviceProfiler* GetDeviceProfiler();
// Begins device-event collection; start_time/pid/tid tag the emitted records.
virtual void StartProfiling(TimePoint start_time, int pid, int tid) = 0;
// Stops collection and returns all device events gathered since StartProfiling.
virtual std::vector<EventRecord> EndProfiling() = 0;
virtual ~DeviceProfiler() = default;
};
#ifdef USE_CUDA
// Size in bytes of each CUPTI activity buffer handed out by BufferRequested.
#define BUF_SIZE (32 * 1024)
// Alignment (in bytes) requested for CUPTI activity buffers; power of two.
#define ALIGN_SIZE (8)
// Rounds `buffer` up to the next `align`-byte boundary (`align` must be a
// power of two); returns `buffer` unchanged when already aligned.
#define ALIGN_BUFFER(buffer, align) \
  (((uintptr_t)(buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1))) : (buffer))
// Converts the interval [s, e] to a rounded value of (e - s) / 1000 — for
// CUPTI's nanosecond timestamps this yields microseconds.  The arguments are
// parenthesized so compound expressions such as DUR(a, b + c) expand
// correctly; the original `e - s` form mis-expanded for such arguments.
#define DUR(s, e) std::lround(static_cast<double>((e) - (s)) / 1000)
// CUPTI-based DeviceProfiler that records CUDA kernel launches.  A single
// instance is handed out by DeviceProfiler::GetDeviceProfiler; the static
// members are shared with CUPTI's buffer callbacks, which may run on
// CUPTI-owned threads.
class CudaProfiler final: public DeviceProfiler {
public:
friend class DeviceProfiler;
~CudaProfiler() = default;
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CudaProfiler);
// Enables CUPTI kernel-activity tracing; pid/tid tag the emitted events.
void StartProfiling(TimePoint start_time, int pid, int tid) override;
// Flushes CUPTI buffers and converts collected kernel stats to EventRecords.
std::vector<EventRecord> EndProfiling() override;
private:
// Constructible only via the DeviceProfiler friend (singleton pattern).
CudaProfiler() = default;
// CUPTI callbacks for activity-buffer allocation and consumption.
static void CUPTIAPI BufferRequested(uint8_t**, size_t*, size_t*);
static void CUPTIAPI BufferCompleted(CUcontext, uint32_t, uint8_t*, size_t, size_t);
// Raw per-kernel record captured from a CUpti_ActivityKernel4 entry.
struct KernelStat {
std::string name_ = {};  // kernel name as reported by CUPTI
uint32_t stream_ = 0;    // CUDA stream id
int32_t grid_x_ = 0;
int32_t grid_y_ = 0;
int32_t grid_z_ = 0;
int32_t block_x_ = 0;
int32_t block_y_ = 0;
int32_t block_z_ = 0;
int64_t start_ = 0;      // kernel start timestamp from CUPTI
int64_t stop_ = 0;       // kernel end timestamp from CUPTI
};
// mutex_ guards stats_, which BufferCompleted appends to from CUPTI threads.
static OrtMutex mutex_;
static std::vector<KernelStat> stats_;
bool initialized_ = false;  // true once CUPTI enable/register both succeeded
TimePoint start_time_;      // host-side profiling start time
int pid_ = 0;
int tid_ = 0;
// Set by StartProfiling while a session is active; cleared by EndProfiling.
static std::atomic_flag enabled_;
};
// Static member definitions.  stats_ is filled from CUPTI's buffer-completion
// callback (potentially a CUPTI-owned thread), hence the guarding mutex_.
OrtMutex CudaProfiler::mutex_;
std::vector<CudaProfiler::KernelStat> CudaProfiler::stats_;
// Before C++20, a default-constructed std::atomic_flag is in an UNSPECIFIED
// state; it must be initialized with ATOMIC_FLAG_INIT to reliably start
// cleared, otherwise the first StartProfiling call may see it already set
// and silently skip initialization.
std::atomic_flag CudaProfiler::enabled_ = ATOMIC_FLAG_INIT;
// CUPTI callback: supplies a fresh activity buffer of BUF_SIZE bytes.
// malloc already returns storage aligned for any fundamental type (at least
// ALIGN_SIZE bytes), so the raw pointer is handed to CUPTI directly.  This
// also guarantees that the free(buffer) in BufferCompleted releases exactly
// the pointer malloc returned; the previous ALIGN_BUFFER adjustment could, in
// principle, make free() receive a shifted pointer, which is undefined
// behavior.
void CUPTIAPI CudaProfiler::BufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords) {
  uint8_t* bfr = (uint8_t*)malloc(BUF_SIZE);
  ORT_ENFORCE(bfr, "Failed to allocate memory for cuda kernel profiling.");
  *size = BUF_SIZE;
  *buffer = bfr;
  *maxNumRecords = 0;  // 0 lets CUPTI place as many records as fit
}
// CUPTI callback: drains a completed activity buffer, appending every kernel
// record to stats_ (under mutex_, since this may run on a CUPTI thread), then
// frees the buffer allocated in BufferRequested.
void CUPTIAPI CudaProfiler::BufferCompleted(CUcontext, uint32_t, uint8_t* buffer, size_t, size_t validSize) {
  CUptiResult status;
  CUpti_Activity* record = NULL;
  if (validSize > 0) {
    std::unique_lock<OrtMutex> lock(mutex_);
    do {
      status = cuptiActivityGetNextRecord(buffer, validSize, &record);
      if (status == CUPTI_SUCCESS) {
        if (CUPTI_ACTIVITY_KIND_KERNEL == record->kind) {
          CUpti_ActivityKernel4* kernel = (CUpti_ActivityKernel4*)record;
          stats_.push_back({kernel->name, kernel->streamId,
                            kernel->gridX, kernel->gridY, kernel->gridZ,
                            kernel->blockX, kernel->blockY, kernel->blockZ,
                            static_cast<int64_t>(kernel->start),
                            static_cast<int64_t>(kernel->end)});
        }
      } else {
        // CUPTI_ERROR_MAX_LIMIT_REACHED means all records are consumed; any
        // other status is a real error.  Either way, stop iterating.  The
        // original loop only exited on MAX_LIMIT_REACHED and so spun forever
        // (holding mutex_) on any other failure status.
        break;
      }
    } while (1);
  }
  free(buffer);
}
// Begins CUPTI kernel-activity collection.  enabled_ acts as a re-entry
// guard: test_and_set() returns the previous flag value, so only the first
// caller (until EndProfiling clears the flag) performs initialization.
void CudaProfiler::StartProfiling(TimePoint start_time, int pid, int tid) {
if (!enabled_.test_and_set()) {
start_time_ = start_time;
pid_ = pid;
tid_ = tid;
// initialized_ stays false if either CUPTI call fails; EndProfiling then
// emits a placeholder "not available" event instead of kernel stats.
if (cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL) == CUPTI_SUCCESS &&
cuptiActivityRegisterCallbacks(BufferRequested, BufferCompleted) == CUPTI_SUCCESS) {
initialized_ = true;
}
}
}
// Ends the profiling session and returns the collected kernel events.
// test_and_set() yields the previous flag value, so the body runs only when
// StartProfiling had set enabled_; the flag is cleared at the end so a new
// session can start.
std::vector<EventRecord> CudaProfiler::EndProfiling() {
std::vector<EventRecord> events;
if (enabled_.test_and_set()) {
if (initialized_) {
// Force-flush (argument 1) all CUPTI buffers so BufferCompleted has
// delivered every pending record before we read stats_.
cuptiActivityFlushAll(1);
std::unique_lock<OrtMutex> lock(mutex_);
int64_t profiling_start = std::chrono::duration_cast<nanoseconds>(start_time_.time_since_epoch()).count();
for (const auto& stat : stats_) {
std::initializer_list<std::pair<std::string, std::string>> args = {{"stream", std::to_string(stat.stream_)},
{"grid_x", std::to_string(stat.grid_x_)},
{"grid_y", std::to_string(stat.grid_y_)},
{"grid_z", std::to_string(stat.grid_z_)},
{"block_x", std::to_string(stat.block_x_)},
{"block_y", std::to_string(stat.block_y_)},
{"block_z", std::to_string(stat.block_z_)}};
// NOTE(review): the event timestamp mixes the host-clock session start
// (profiling_start) with the device-side stat.stop_ — presumably these
// timestamp domains are comparable here; confirm against CUPTI's
// timestamp documentation.
events.push_back({EventCategory::KERNEL_EVENT, pid_, tid_, stat.name_, DUR(profiling_start, stat.stop_), DUR(stat.start_, stat.stop_), {args.begin(), args.end()}});
}
stats_.clear();
} else {
// CUPTI setup failed in StartProfiling: emit a single marker event so the
// profile file records why no kernel data is present.
std::initializer_list<std::pair<std::string, std::string>> args;
events.push_back({EventCategory::KERNEL_EVENT, pid_, tid_, "not_available_due_to_cupti_error", 0, 0, {args.begin(), args.end()}});
}
}
enabled_.clear();
return events;
}
#endif //USE_CUDA
// Returns the process-wide device profiler.  With CUDA enabled this is a
// lazily-constructed CudaProfiler singleton (function-local static);
// otherwise there is no device profiler and nullptr is returned — callers
// must null-check.
DeviceProfiler* DeviceProfiler::GetDeviceProfiler() {
#ifdef USE_CUDA
static CudaProfiler cuda_profiler;
return &cuda_profiler;
#else
return nullptr;
#endif
}
std::atomic<size_t> Profiler::global_max_num_events_{1000 * 1000};
#ifdef ENABLE_STATIC_PROFILER_INSTANCE
@ -16,7 +153,8 @@ profiling::Profiler::~Profiler() {
instance_ = nullptr;
}
#else
profiling::Profiler::~Profiler() {}
profiling::Profiler::~Profiler() {
}
#endif
::onnxruntime::TimePoint profiling::Profiler::StartTime() const {
@ -43,6 +181,10 @@ void Profiler::StartProfiling(const logging::Logger* custom_logger) {
profile_with_logger_ = true;
custom_logger_ = custom_logger;
profiling_start_time_ = StartTime();
DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler();
if (device_profiler) {
device_profiler->StartProfiling(profiling_start_time_, logging::GetProcessId(), logging::GetThreadId());
}
}
template <typename T>
@ -51,6 +193,10 @@ void Profiler::StartProfiling(const std::basic_string<T>& file_name) {
profile_stream_.open(file_name, std::ios::out | std::ios::trunc);
profile_stream_file_ = ToMBString(file_name);
profiling_start_time_ = StartTime();
DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler();
if (device_profiler) {
device_profiler->StartProfiling(profiling_start_time_, logging::GetProcessId(), logging::GetThreadId());
}
}
template void Profiler::StartProfiling<char>(const std::basic_string<char>& file_name);
@ -101,6 +247,12 @@ std::string Profiler::EndProfiling() {
std::lock_guard<OrtMutex> lock(mutex_);
profile_stream_ << "[\n";
DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler();
if (device_profiler) {
std::vector<EventRecord> device_events = device_profiler->EndProfiling();
std::copy(device_events.begin(), device_events.end(), std::back_inserter(events_));
}
for (size_t i = 0; i < events_.size(); ++i) {
auto& rec = events_[i];
profile_stream_ << R"({"cat" : ")" << event_categor_names_[rec.cat] << "\",";

View file

@ -590,6 +590,11 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) {
so.profile_file_prefix = ORT_TSTR("onnxprofile_profile_test");
InferenceSession session_object(so, GetEnvironment());
#ifdef USE_CUDA
CUDAExecutionProviderInfo epi;
epi.device_id = 0;
EXPECT_TRUE(session_object.RegisterExecutionProvider(onnxruntime::make_unique<CUDAExecutionProvider>(epi)).IsOK());
#endif
ASSERT_STATUS_OK(session_object.Load(MODEL_URI));
ASSERT_STATUS_OK(session_object.Initialize());
@ -602,25 +607,29 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) {
std::ifstream profile(profile_file);
ASSERT_TRUE(profile);
std::string line;
std::vector<std::string> lines;
std::vector<std::string> tags = {"pid", "dur", "ts", "ph", "X", "name", "args"};
int count = 0;
while (std::getline(profile, line)) {
if (count == 0) {
ASSERT_TRUE(line.find("[") != string::npos);
} else if (count <= 7) {
for (auto& s : tags) {
ASSERT_TRUE(line.find(s) != string::npos);
}
} else {
ASSERT_TRUE(line.find("]") != string::npos);
}
if (count == 1) {
ASSERT_TRUE(line.find("model_loading_uri") != string::npos);
}
count++;
lines.push_back(line);
}
auto size = lines.size();
ASSERT_TRUE(size > 1);
ASSERT_TRUE(lines[0].find("[") != string::npos);
ASSERT_TRUE(lines[1].find("model_loading_uri") != string::npos);
ASSERT_TRUE(lines[size-1].find("]") != string::npos);
std::vector<std::string> tags = {"pid", "dur", "ts", "ph", "X", "name", "args"};
bool has_kernel_info = false;
for (size_t i = 1; i < size - 1; ++i) {
for (auto& s : tags) {
ASSERT_TRUE(lines[i].find(s) != string::npos);
has_kernel_info = has_kernel_info || (lines[i].find("Kernel") != string::npos);
}
}
#ifdef USE_CUDA
ASSERT_TRUE(has_kernel_info);
#endif
}
TEST(InferenceSessionTests, CheckRunProfilerWithStartProfile) {
@ -641,24 +650,23 @@ TEST(InferenceSessionTests, CheckRunProfilerWithStartProfile) {
std::ifstream profile(profile_file);
std::string line;
std::vector<std::string> lines;
std::vector<std::string> tags = {"pid", "dur", "ts", "ph", "X", "name", "args"};
int count = 0;
while (std::getline(profile, line)) {
if (count == 0) {
ASSERT_TRUE(line.find("[") != string::npos);
} else if (count <= 5) {
for (auto& s : tags) {
ASSERT_TRUE(line.find(s) != string::npos);
}
} else {
ASSERT_TRUE(line.find("]") != string::npos);
}
lines.push_back(line);
}
if (count == 1) {
ASSERT_TRUE(line.find("mul_1_fence_before") != string::npos);
auto size = lines.size();
ASSERT_TRUE(size > 1);
ASSERT_TRUE(lines[0].find("[") != string::npos);
ASSERT_TRUE(lines[1].find("mul_1_fence_before") != string::npos);
ASSERT_TRUE(lines[size - 1].find("]") != string::npos);
std::vector<std::string> tags = {"pid", "dur", "ts", "ph", "X", "name", "args"};
for (size_t i = 1; i < size - 1; ++i) {
for (auto& s : tags) {
ASSERT_TRUE(lines[i].find(s) != string::npos);
}
count++;
}
}

View file

@ -535,11 +535,13 @@ class TestInferenceSession(unittest.TestCase):
tags = ['pid', 'dur', 'ts', 'ph', 'X', 'name', 'args']
with open(profile_file) as f:
lines = f.readlines()
lines_len = len(lines)
self.assertTrue(lines_len > 8)
self.assertTrue('[' in lines[0])
for i in range(1, 8):
for i in range(1, lines_len-1):
for tag in tags:
self.assertTrue(tag in lines[i])
self.assertTrue(']' in lines[8])
self.assertTrue(']' in lines[-1])
def testProfilerGetStartTimeNs(self):
def getSingleSessionProfilingStartTime():

View file

@ -1,2 +1,2 @@
set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin;C:\local\cudnn-10.2-windows10-x64-v8.0.3.33\cuda\bin;%PATH%
set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\extras\CUPTI\lib64;C:\local\cudnn-10.2-windows10-x64-v8.0.3.33\cuda\bin;%PATH%
set GRADLE_OPTS=-Dorg.gradle.daemon=false

View file

@ -1,2 +1,2 @@
set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin;C:\local\cudnn-11.0-windows-x64-v8.0.2.39\cuda\bin;%PATH%
set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\extras\CUPTI\lib64;C:\local\cudnn-11.0-windows-x64-v8.0.2.39\cuda\bin;%PATH%
set GRADLE_OPTS=-Dorg.gradle.daemon=false

View file

@ -1,2 +1,2 @@
set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin;C:\local\cudnn-11.1-windows-x64-v8.0.5.39\cuda\bin;%PATH%
set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\extras\CUPTI\lib64;C:\local\cudnn-11.1-windows-x64-v8.0.5.39\cuda\bin;%PATH%
set GRADLE_OPTS=-Dorg.gradle.daemon=false