Revert CUPTI profiling feature (#7763)

For unknown reason it causes deadlocks when it is used with CUDA 11.1
2026-07-18 18:52:16 +00:00 · 2021-05-19 21:54:29 -07:00 · 2021-05-19 21:54:29 -07:00 · 31e6d3f85c
commit 31e6d3f85c
parent e26c668a9b
13 changed files with 52 additions and 247 deletions
--- a/cmake/onnxruntime_common.cmake
+++ b/cmake/onnxruntime_common.cmake
@ -90,12 +90,6 @@ source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_common_src})

 onnxruntime_add_static_library(onnxruntime_common ${onnxruntime_common_src})

-if (onnxruntime_USE_CUDA)
-  target_include_directories(onnxruntime_common PUBLIC ${onnxruntime_CUDA_HOME}/include ${onnxruntime_CUDA_HOME}/extras/CUPTI/include)
-  target_link_directories(onnxruntime_common PUBLIC ${onnxruntime_CUDA_HOME}/extras/CUPTI/lib64)
-  target_link_libraries(onnxruntime_common cupti)
-endif()
-
 if (onnxruntime_USE_TELEMETRY)
  set_target_properties(onnxruntime_common PROPERTIES COMPILE_FLAGS "/FI${ONNXRUNTIME_INCLUDE_DIR}/core/platform/windows/TraceLoggingConfigPrivate.h")
 endif()
--- a/include/onnxruntime/core/common/logging/logging.h
+++ b/include/onnxruntime/core/common/logging/logging.h
@ -55,7 +55,6 @@ namespace profiling {
 enum EventCategory {
  SESSION_EVENT = 0,
  NODE_EVENT,
-  KERNEL_EVENT,
  EVENT_CATEGORY_MAX
 };

@ -64,8 +63,7 @@ Event descriptions for the above session events.
 */
 static constexpr const char* event_categor_names_[EVENT_CATEGORY_MAX] = {
    "Session",
-    "Node",
-    "Kernel"};
+    "Node"};

 /*
 Timing record for all events.
--- a/onnxruntime/core/common/profiler.cc
+++ b/onnxruntime/core/common/profiler.cc
@ -2,148 +2,11 @@
 // Licensed under the MIT License.

 #include "profiler.h"
-#include <cmath>
-
-#ifdef USE_CUDA
-#include <cupti.h>
-#endif

 namespace onnxruntime {
 namespace profiling {
 using namespace std::chrono;

-class DeviceProfiler {
- public:
-  static DeviceProfiler* GetDeviceProfiler();
-  virtual void StartProfiling(TimePoint start_time, int pid, int tid) = 0;
-  virtual std::vector<EventRecord> EndProfiling() = 0;
-  virtual ~DeviceProfiler() = default;
-};
-
-#ifdef USE_CUDA
-#define BUF_SIZE (32 * 1024)
-#define ALIGN_SIZE (8)
-#define ALIGN_BUFFER(buffer, align) \
-  (((uintptr_t)(buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1))) : (buffer))
-#define DUR(s, e) std::lround(static_cast<double>(e - s) / 1000)
-
-class CudaProfiler final: public DeviceProfiler {
- public:
-  friend class DeviceProfiler;
-  ~CudaProfiler() = default;
-  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CudaProfiler);
-  void StartProfiling(TimePoint start_time, int pid, int tid) override;
-  std::vector<EventRecord> EndProfiling() override;
- private:
-  CudaProfiler() = default;
-  static void CUPTIAPI BufferRequested(uint8_t**, size_t*, size_t*);
-  static void CUPTIAPI BufferCompleted(CUcontext, uint32_t, uint8_t*, size_t, size_t);
-  struct KernelStat {
-    std::string name_ = {};
-    uint32_t stream_ = 0;
-    int32_t grid_x_ = 0;
-    int32_t grid_y_ = 0;
-    int32_t grid_z_ = 0;
-    int32_t block_x_ = 0;
-    int32_t block_y_ = 0;
-    int32_t block_z_ = 0;
-    int64_t start_ = 0;
-    int64_t stop_ = 0;
-  };
-  static OrtMutex mutex_;
-  static std::vector<KernelStat> stats_;
-  bool initialized_ = false;
-  TimePoint start_time_;
-  int pid_ = 0;
-  int tid_ = 0;
-  static std::atomic_flag enabled_;
-};
-
-OrtMutex CudaProfiler::mutex_;
-std::vector<CudaProfiler::KernelStat> CudaProfiler::stats_;
-std::atomic_flag CudaProfiler::enabled_;
-
-void CUPTIAPI CudaProfiler::BufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords) {
-  uint8_t* bfr = (uint8_t*)malloc(BUF_SIZE + ALIGN_SIZE);
-  ORT_ENFORCE(bfr, "Failed to allocate memory for cuda kernel profiling.");
-  *size = BUF_SIZE;
-  *buffer = ALIGN_BUFFER(bfr, ALIGN_SIZE);
-  *maxNumRecords = 0;
-}
-
-void CUPTIAPI CudaProfiler::BufferCompleted(CUcontext, uint32_t, uint8_t* buffer, size_t, size_t validSize) {
-  CUptiResult status;
-  CUpti_Activity* record = NULL;
-  if (validSize > 0) {
-    std::unique_lock<OrtMutex> lock(mutex_);
-    do {
-      status = cuptiActivityGetNextRecord(buffer, validSize, &record);
-      if (status == CUPTI_SUCCESS) {
-        if (CUPTI_ACTIVITY_KIND_KERNEL == record->kind) {
-          CUpti_ActivityKernel4* kernel = (CUpti_ActivityKernel4*)record;
-          stats_.push_back({kernel->name, kernel->streamId,
-                            kernel->gridX, kernel->gridY, kernel->gridZ,
-                            kernel->blockX, kernel->blockY, kernel->blockZ,
-                            static_cast<int64_t>(kernel->start),
-                            static_cast<int64_t>(kernel->end)});
-        }
-      } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
-        break;
-      }
-    } while (1);
-  }
-  free(buffer);
-}
-
-void CudaProfiler::StartProfiling(TimePoint start_time, int pid, int tid) {
-  if (!enabled_.test_and_set()) {
-    start_time_ = start_time;
-    pid_ = pid;
-    tid_ = tid;
-    if (cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL) == CUPTI_SUCCESS &&
-        cuptiActivityRegisterCallbacks(BufferRequested, BufferCompleted) == CUPTI_SUCCESS) {
-      initialized_ = true;
-    }
-  }
-}
-
-std::vector<EventRecord> CudaProfiler::EndProfiling() {
-  std::vector<EventRecord> events;
-  if (enabled_.test_and_set()) {
-    if (initialized_) {
-      cuptiActivityFlushAll(1);
-      std::unique_lock<OrtMutex> lock(mutex_);
-      int64_t profiling_start = std::chrono::duration_cast<nanoseconds>(start_time_.time_since_epoch()).count();
-      for (const auto& stat : stats_) {
-        std::initializer_list<std::pair<std::string, std::string>> args = {{"stream", std::to_string(stat.stream_)},
-                                                                           {"grid_x", std::to_string(stat.grid_x_)},
-                                                                           {"grid_y", std::to_string(stat.grid_y_)},
-                                                                           {"grid_z", std::to_string(stat.grid_z_)},
-                                                                           {"block_x", std::to_string(stat.block_x_)},
-                                                                           {"block_y", std::to_string(stat.block_y_)},
-                                                                           {"block_z", std::to_string(stat.block_z_)}};
-        events.push_back({EventCategory::KERNEL_EVENT, pid_, tid_, stat.name_, DUR(profiling_start, stat.stop_), DUR(stat.start_, stat.stop_), {args.begin(), args.end()}});
-      }
-      stats_.clear();
-    } else {
-      std::initializer_list<std::pair<std::string, std::string>> args;
-      events.push_back({EventCategory::KERNEL_EVENT, pid_, tid_, "not_available_due_to_cupti_error", 0, 0, {args.begin(), args.end()}});
-    }
-  }
-  enabled_.clear();
-  return events;
-}
-#endif //USE_CUDA
-
-DeviceProfiler* DeviceProfiler::GetDeviceProfiler() {
-#ifdef USE_CUDA
-  static CudaProfiler cuda_profiler;
-  return &cuda_profiler;
-#else
-  return nullptr;
-#endif
-}
-
 std::atomic<size_t> Profiler::global_max_num_events_{1000 * 1000};

 #ifdef ENABLE_STATIC_PROFILER_INSTANCE
@ -153,11 +16,10 @@ profiling::Profiler::~Profiler() {
  instance_ = nullptr;
 }
 #else
-profiling::Profiler::~Profiler() {
-}
+profiling::Profiler::~Profiler() {}
 #endif

-::onnxruntime::TimePoint profiling::Profiler::Now() const {
+::onnxruntime::TimePoint profiling::Profiler::StartTime() const {
  ORT_ENFORCE(enabled_);
  return std::chrono::high_resolution_clock::now();
 }
@ -180,11 +42,7 @@ void Profiler::StartProfiling(const logging::Logger* custom_logger) {
  enabled_ = true;
  profile_with_logger_ = true;
  custom_logger_ = custom_logger;
-  profiling_start_time_ = Now();
-  DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler();
-  if (device_profiler) {
-    device_profiler->StartProfiling(profiling_start_time_, logging::GetProcessId(), logging::GetThreadId());
-  }
+  profiling_start_time_ = StartTime();
 }

 template <typename T>
@ -192,11 +50,7 @@ void Profiler::StartProfiling(const std::basic_string<T>& file_name) {
  enabled_ = true;
  profile_stream_.open(file_name, std::ios::out | std::ios::trunc);
  profile_stream_file_ = ToMBString(file_name);
-  profiling_start_time_ = Now();
-  DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler();
-  if (device_profiler) {
-    device_profiler->StartProfiling(profiling_start_time_, logging::GetProcessId(), logging::GetThreadId());
-  }
+  profiling_start_time_ = StartTime();
 }

 template void Profiler::StartProfiling<char>(const std::basic_string<char>& file_name);
@ -204,32 +58,16 @@ template void Profiler::StartProfiling<char>(const std::basic_string<char>& file
 template void Profiler::StartProfiling<wchar_t>(const std::basic_string<wchar_t>& file_name);
 #endif

-void Profiler::EndTimeAndRecordEvent(EventCategory category,
-                                     const std::string& event_name,
-                                     const TimePoint& start_time, const TimePoint& end_time,
-                                     const std::initializer_list<std::pair<std::string, std::string>>& event_args,
-                                     bool sync_gpu) {
-  EndTimeAndRecordEvent(category, event_name, TimeDiffMicroSeconds(start_time, end_time),
-                        TimeDiffMicroSeconds(profiling_start_time_, start_time), event_args, sync_gpu);
-}
-
 void Profiler::EndTimeAndRecordEvent(EventCategory category,
                                     const std::string& event_name,
                                     const TimePoint& start_time,
                                     const std::initializer_list<std::pair<std::string, std::string>>& event_args,
-                                     bool sync_gpu) {
-  EndTimeAndRecordEvent(category, event_name, TimeDiffMicroSeconds(start_time),
-                        TimeDiffMicroSeconds(profiling_start_time_, start_time), event_args, sync_gpu);
-}
-
-void Profiler::EndTimeAndRecordEvent(EventCategory category,
-                                     const std::string& event_name,
-                                     long long duration,         //duration of the op
-                                     long long time_from_start,  //time difference between op start time and profiler start time
-                                     const std::initializer_list<std::pair<std::string, std::string>>& event_args,
                                     bool /*sync_gpu*/) {
+  long long dur = TimeDiffMicroSeconds(start_time);
+  long long ts = TimeDiffMicroSeconds(profiling_start_time_, start_time);
+
  EventRecord event(category, logging::GetProcessId(),
-                    logging::GetThreadId(), event_name, time_from_start, duration, {event_args.begin(), event_args.end()});
+                    logging::GetThreadId(), event_name, ts, dur, {event_args.begin(), event_args.end()});
  if (profile_with_logger_) {
    custom_logger_->SendProfileEvent(event);
  } else {
@ -263,12 +101,6 @@ std::string Profiler::EndProfiling() {
  std::lock_guard<OrtMutex> lock(mutex_);
  profile_stream_ << "[\n";

-  DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler();
-  if (device_profiler) {
-    std::vector<EventRecord> device_events = device_profiler->EndProfiling();
-    std::copy(device_events.begin(), device_events.end(), std::back_inserter(events_));
-  }
-
  for (size_t i = 0; i < events_.size(); ++i) {
    auto& rec = events_[i];
    profile_stream_ << R"({"cat" : ")" << event_categor_names_[rec.cat] << "\",";
--- a/onnxruntime/core/common/profiler.h
+++ b/onnxruntime/core/common/profiler.h
@ -51,7 +51,7 @@ class Profiler {
  /*
  Produce current time point for any profiling action.
  */
-  TimePoint Now() const;
+  TimePoint StartTime() const;

  /*
  Whether data collection and output from this profiler is enabled.
@ -78,12 +78,6 @@ class Profiler {
                             const std::initializer_list<std::pair<std::string, std::string>>& event_args = {},
                             bool sync_gpu = false);

-  void EndTimeAndRecordEvent(EventCategory category,
-                             const std::string& event_name,
-                             const TimePoint& start_time, const TimePoint& end_time,
-                             const std::initializer_list<std::pair<std::string, std::string>>& event_args = {},
-                             bool sync_gpu = false);
-
  /*
  Write profile data to the given stream in chrome format defined below.
  https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview#
@ -115,13 +109,6 @@ class Profiler {
  }

 private:
-  void EndTimeAndRecordEvent(EventCategory category,
-                             const std::string& event_name,
-                             long long duration,         //duration of the op
-                             long long time_from_start,  //time difference between op start time and profiler start time
-                             const std::initializer_list<std::pair<std::string, std::string>>& event_args,
-                             bool sync_gpu = false);
-
  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Profiler);

  /**
--- a/onnxruntime/core/framework/orttraining_partial_executor.cc
+++ b/onnxruntime/core/framework/orttraining_partial_executor.cc
@ -144,7 +144,7 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
  size_t total_output_sizes = 0;

  if (is_profiler_enabled) {
-    tp = session_state.Profiler().Now();
+    tp = session_state.Profiler().StartTime();
  }

  ExecutionFrame& frame = state_.GetExecutionFrame(feed_mlvalue_idxs, feeds, fetch_mlvalue_idxs, fetches,
@ -235,7 +235,7 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
    OpKernelContextInternal op_kernel_context(session_state, frame, *p_op_kernel, logger, false);
    // TODO: log kernel outputs?
    if (is_profiler_enabled) {
-      sync_time_begin = session_state.Profiler().Now();
+      sync_time_begin = session_state.Profiler().StartTime();
    }

    // sync before compute
@ -289,7 +289,7 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
      // call compute on the kernel
      VLOGS(logger, 1) << "Computing kernel: " << node_name_for_profiling;

-      kernel_begin_time = session_state.Profiler().Now();
+      kernel_begin_time = session_state.Profiler().StartTime();

      // Calculate total input sizes for this operation.
      CalculateTotalInputSizes(&op_kernel_context, p_op_kernel,
@ -373,7 +373,7 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
                                                          concurrency::ThreadPool::StopProfiling(
                                                              session_state.GetThreadPool())},
                                                     });
-      sync_time_begin = session_state.Profiler().Now();
+      sync_time_begin = session_state.Profiler().StartTime();
    }

    // sync after compute for outputs
@ -493,4 +493,4 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
  return Status::OK();
 }
 }  // namespace onnxruntime
-#endif
+#endif
--- a/onnxruntime/core/framework/parallel_executor.cc
+++ b/onnxruntime/core/framework/parallel_executor.cc
@ -35,7 +35,7 @@ Status ParallelExecutor::Execute(const SessionState& session_state, const std::v
  TimePoint tp;
  const bool is_profiler_enabled = session_state.Profiler().IsEnabled();
  if (is_profiler_enabled) {
-    tp = session_state.Profiler().Now();
+    tp = session_state.Profiler().StartTime();
  }

  root_frame_ = std::make_unique<ExecutionFrame>(feed_mlvalue_idxs, feeds, fetch_mlvalue_idxs, fetches,
@ -118,7 +118,7 @@ Status ParallelExecutor::RunNodeAsync(size_t p_node_index,
  bool keep_running = true;
  const auto& graph_viewer = session_state.GetGraphViewer();
  TimePoint sync_time_begin;
-  TimePoint kernel_begin_time, kernel_end_time;
+  TimePoint kernel_begin_time;
  const bool f_profiler_enabled = session_state.Profiler().IsEnabled();
  const SequentialExecutionPlan& exec_plan = *session_state.GetExecutionPlan();

@ -142,7 +142,7 @@ Status ParallelExecutor::RunNodeAsync(size_t p_node_index,
    OpKernelContextInternal op_kernel_context(session_state, *root_frame_, *p_op_kernel, logger, terminate_flag_);

    if (f_profiler_enabled) {
-      sync_time_begin = session_state.Profiler().Now();
+      sync_time_begin = session_state.Profiler().StartTime();
    }
    // sync before compute
    int queue_id = p_op_kernel->KernelDef().ExecQueueId();
@ -183,7 +183,7 @@ Status ParallelExecutor::RunNodeAsync(size_t p_node_index,
                                                     sync_time_begin,
                                                     {{"op_name", p_op_kernel->KernelDef().OpName()}});
      concurrency::ThreadPool::StartProfiling(session_state.GetThreadPool());
-      kernel_begin_time = session_state.Profiler().Now();
+      kernel_begin_time = session_state.Profiler().StartTime();
    }

    // call compute on the kernel
@ -216,15 +216,14 @@ Status ParallelExecutor::RunNodeAsync(size_t p_node_index,
    }

    if (f_profiler_enabled) {
-      kernel_end_time = session_state.Profiler().Now();
      session_state.Profiler().EndTimeAndRecordEvent(profiling::NODE_EVENT,
                                                     node.Name() + "_kernel_time",
-                                                     kernel_begin_time, kernel_end_time,
+                                                     kernel_begin_time,
                                                     {{"op_name", p_op_kernel->KernelDef().OpName()},
                                                      {"provider", p_op_kernel->KernelDef().Provider()},
                                                      {"thread_scheduling_stats", concurrency::ThreadPool::StopProfiling(session_state.GetThreadPool())}});

-      sync_time_begin = session_state.Profiler().Now();
+      sync_time_begin = session_state.Profiler().StartTime();
    }
    // sync after compute for outputs
    if (exec_plan.NodeHasFence(node_index)) {
--- a/onnxruntime/core/framework/sequential_executor.cc
+++ b/onnxruntime/core/framework/sequential_executor.cc
@ -130,13 +130,13 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
  const bool is_profiler_enabled = session_state.Profiler().IsEnabled();
  TimePoint tp;
  TimePoint sync_time_begin;
-  TimePoint kernel_begin_time, kernel_end_time;
+  TimePoint kernel_begin_time;
  size_t input_activation_sizes = 0;
  size_t input_parameter_sizes = 0;
  size_t total_output_sizes = 0;

  if (is_profiler_enabled) {
-    tp = session_state.Profiler().Now();
+    tp = session_state.Profiler().StartTime();
  }

  ExecutionFrame frame{feed_mlvalue_idxs, feeds, fetch_mlvalue_idxs, fetches, fetch_allocators, session_state};
@ -235,7 +235,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
    OpKernelContextInternal op_kernel_context(session_state, frame, *p_op_kernel, logger, terminate_flag_);
    // TODO: log kernel outputs?
    if (is_profiler_enabled) {
-      sync_time_begin = session_state.Profiler().Now();
+      sync_time_begin = session_state.Profiler().StartTime();
    }

    // sync before compute
@ -289,10 +289,11 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
      // call compute on the kernel
      VLOGS(logger, 1) << "Computing kernel: " << node_name_for_profiling;

+      kernel_begin_time = session_state.Profiler().StartTime();
+
      // Calculate total input sizes for this operation.
      CalculateTotalInputSizes(&op_kernel_context, p_op_kernel,
                               input_activation_sizes, input_parameter_sizes, node_name_for_profiling);
-      kernel_begin_time = session_state.Profiler().Now();
    }

    Status compute_status;
@ -340,7 +341,6 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
    }

    if (is_profiler_enabled) {
-      kernel_end_time = session_state.Profiler().Now();
      // Calculate total output sizes for this operation.
      CalculateTotalOutputSizes(&op_kernel_context, total_output_sizes, node_name_for_profiling);

@ -356,9 +356,10 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
                << " Output_Size=" << total_output_sizes
                << "\n";
 #endif
+
      session_state.Profiler().EndTimeAndRecordEvent(profiling::NODE_EVENT,
                                                     node_name_for_profiling + "_kernel_time",
-                                                     kernel_begin_time, kernel_end_time,
+                                                     kernel_begin_time,
                                                     // Log additional operation args / info.
                                                     {
                                                         {"op_name", p_op_kernel->KernelDef().OpName()},
@ -370,7 +371,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
                                                         {"output_size", std::to_string(total_output_sizes)},
                                                         {"thread_scheduling_stats", concurrency::ThreadPool::StopProfiling(session_state.GetThreadPool())},
                                                     });
-      sync_time_begin = session_state.Profiler().Now();
+      sync_time_begin = session_state.Profiler().StartTime();
    }

    // sync after compute for outputs
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@ -564,7 +564,7 @@ common::Status InferenceSession::Load(std::function<common::Status(std::shared_p
  Status status = Status::OK();
  TimePoint tp;
  if (session_profiler_.IsEnabled()) {
-    tp = session_profiler_.Now();
+    tp = session_profiler_.StartTime();
  }
  ORT_TRY {
    std::lock_guard<onnxruntime::OrtMutex> l(session_mutex_);
@ -1134,7 +1134,7 @@ common::Status InferenceSession::Initialize() {
  Status status = Status::OK();
  TimePoint tp;
  if (session_profiler_.IsEnabled()) {
-    tp = session_profiler_.Now();
+    tp = session_profiler_.StartTime();
  }

  ORT_TRY {
@ -1612,7 +1612,7 @@ Status InferenceSession::Run(const RunOptions& run_options,
                             const std::vector<OrtDevice>* p_fetches_device_info) {
  TimePoint tp;
  if (session_profiler_.IsEnabled()) {
-    tp = session_profiler_.Now();
+    tp = session_profiler_.StartTime();
  }

 #ifdef ONNXRUNTIME_ENABLE_INSTRUMENT
--- a/onnxruntime/test/framework/inference_session_test.cc
+++ b/onnxruntime/test/framework/inference_session_test.cc
@ -649,16 +649,11 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) {
  ASSERT_TRUE(lines[size - 1].find("]") != string::npos);
  std::vector<std::string> tags = {"pid", "dur", "ts", "ph", "X", "name", "args"};

-  bool has_kernel_info = false;
  for (size_t i = 1; i < size - 1; ++i) {
    for (auto& s : tags) {
      ASSERT_TRUE(lines[i].find(s) != string::npos);
-      has_kernel_info = has_kernel_info || (lines[i].find("Kernel") != string::npos);
    }
  }
-#ifdef USE_CUDA
-  ASSERT_TRUE(has_kernel_info);
-#endif
 }

 TEST(InferenceSessionTests, CheckRunProfilerWithStartProfile) {
@ -679,23 +674,24 @@ TEST(InferenceSessionTests, CheckRunProfilerWithStartProfile) {

  std::ifstream profile(profile_file);
  std::string line;
-  std::vector<std::string> lines;

-  while (std::getline(profile, line)) {
-    lines.push_back(line);
-  }
-
-  auto size = lines.size();
-  ASSERT_TRUE(size > 1);
-  ASSERT_TRUE(lines[0].find("[") != string::npos);
-  ASSERT_TRUE(lines[1].find("mul_1_fence_before") != string::npos);
-  ASSERT_TRUE(lines[size - 1].find("]") != string::npos);
  std::vector<std::string> tags = {"pid", "dur", "ts", "ph", "X", "name", "args"};
-
-  for (size_t i = 1; i < size - 1; ++i) {
-    for (auto& s : tags) {
-      ASSERT_TRUE(lines[i].find(s) != string::npos);
+  int count = 0;
+  while (std::getline(profile, line)) {
+    if (count == 0) {
+      ASSERT_TRUE(line.find("[") != string::npos);
+    } else if (count <= 5) {
+      for (auto& s : tags) {
+        ASSERT_TRUE(line.find(s) != string::npos);
+      }
+    } else {
+      ASSERT_TRUE(line.find("]") != string::npos);
    }
+
+    if (count == 1) {
+      ASSERT_TRUE(line.find("mul_1_fence_before") != string::npos);
+    }
+    count++;
  }
 }

--- a/onnxruntime/test/python/onnxruntime_test_python.py
+++ b/onnxruntime/test/python/onnxruntime_test_python.py
@ -595,13 +595,11 @@ class TestInferenceSession(unittest.TestCase):
        tags = ['pid', 'dur', 'ts', 'ph', 'X', 'name', 'args']
        with open(profile_file) as f:
            lines = f.readlines()
-            lines_len = len(lines)
-            self.assertTrue(lines_len > 8)
            self.assertTrue('[' in lines[0])
-            for i in range(1, lines_len-1):
+            for i in range(1, 8):
                for tag in tags:
                    self.assertTrue(tag in lines[i])
-            self.assertTrue(']' in lines[-1])
+            self.assertTrue(']' in lines[8])

    def testProfilerGetStartTimeNs(self):
        def getSingleSessionProfilingStartTime():
--- a/tools/ci_build/github/windows/setup_env_cuda.bat
+++ b/tools/ci_build/github/windows/setup_env_cuda.bat
@ -1,2 +1,2 @@
-set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\extras\CUPTI\lib64;C:\local\cudnn-10.2-windows10-x64-v8.0.3.33\cuda\bin;%PATH%
+set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin;C:\local\cudnn-10.2-windows10-x64-v8.0.3.33\cuda\bin;%PATH%
 set GRADLE_OPTS=-Dorg.gradle.daemon=false
--- a/tools/ci_build/github/windows/setup_env_cuda_11.bat
+++ b/tools/ci_build/github/windows/setup_env_cuda_11.bat
@ -1,2 +1,2 @@
-set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\extras\CUPTI\lib64;C:\local\cudnn-11.0-windows-x64-v8.0.2.39\cuda\bin;%PATH%
+set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin;C:\local\cudnn-11.0-windows-x64-v8.0.2.39\cuda\bin;%PATH%
 set GRADLE_OPTS=-Dorg.gradle.daemon=false
--- a/tools/ci_build/github/windows/setup_env_trt.bat
+++ b/tools/ci_build/github/windows/setup_env_trt.bat
@ -1,2 +1,2 @@
-set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\extras\CUPTI\lib64;C:\local\cudnn-11.1-windows-x64-v8.0.5.39\cuda\bin;%PATH%
+set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin;C:\local\cudnn-11.1-windows-x64-v8.0.5.39\cuda\bin;%PATH%
 set GRADLE_OPTS=-Dorg.gradle.daemon=false