diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake
index 61af21ab36..b572cc89d6 100644
--- a/cmake/onnxruntime_common.cmake
+++ b/cmake/onnxruntime_common.cmake
@@ -90,12 +90,6 @@ source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_common_src})
 
 onnxruntime_add_static_library(onnxruntime_common ${onnxruntime_common_src})
 
-if (onnxruntime_USE_CUDA)
-  target_include_directories(onnxruntime_common PUBLIC ${onnxruntime_CUDA_HOME}/include ${onnxruntime_CUDA_HOME}/extras/CUPTI/include)
-  target_link_directories(onnxruntime_common PUBLIC ${onnxruntime_CUDA_HOME}/extras/CUPTI/lib64)
-  target_link_libraries(onnxruntime_common cupti)
-endif()
-
 if (onnxruntime_USE_TELEMETRY)
   set_target_properties(onnxruntime_common PROPERTIES COMPILE_FLAGS "/FI${ONNXRUNTIME_INCLUDE_DIR}/core/platform/windows/TraceLoggingConfigPrivate.h")
 endif()
diff --git a/include/onnxruntime/core/common/logging/logging.h b/include/onnxruntime/core/common/logging/logging.h
index 95b6a4aa6d..96c07f09c4 100644
--- a/include/onnxruntime/core/common/logging/logging.h
+++ b/include/onnxruntime/core/common/logging/logging.h
@@ -55,7 +55,6 @@ namespace profiling {
 enum EventCategory {
   SESSION_EVENT = 0,
   NODE_EVENT,
-  KERNEL_EVENT,
   EVENT_CATEGORY_MAX
 };
 
@@ -64,8 +63,7 @@ Event descriptions for the above session events.
 */
 static constexpr const char* event_categor_names_[EVENT_CATEGORY_MAX] = {
     "Session",
-    "Node",
-    "Kernel"};
+    "Node"};
 
 /*
 Timing record for all events.
diff --git a/onnxruntime/core/common/profiler.cc b/onnxruntime/core/common/profiler.cc
index 173208c8b8..ee752a6a62 100644
--- a/onnxruntime/core/common/profiler.cc
+++ b/onnxruntime/core/common/profiler.cc
@@ -2,148 +2,11 @@
 // Licensed under the MIT License.
 
 #include "profiler.h"
-#include <cmath>
-
-#ifdef USE_CUDA
-#include <cupti.h>
-#endif
 
 namespace onnxruntime {
 namespace profiling {
 using namespace std::chrono;
 
-class DeviceProfiler {
- public:
-  static DeviceProfiler* GetDeviceProfiler();
-  virtual void StartProfiling(TimePoint start_time, int pid, int tid) = 0;
-  virtual std::vector<EventRecord> EndProfiling() = 0;
-  virtual ~DeviceProfiler() = default;
-};
-
-#ifdef USE_CUDA
-#define BUF_SIZE (32 * 1024)
-#define ALIGN_SIZE (8)
-#define ALIGN_BUFFER(buffer, align) \
-  (((uintptr_t)(buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1))) : (buffer))
-#define DUR(s, e) std::lround(static_cast<double>(e - s) / 1000)
-
-class CudaProfiler final: public DeviceProfiler {
- public:
-  friend class DeviceProfiler;
-  ~CudaProfiler() = default;
-  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CudaProfiler);
-  void StartProfiling(TimePoint start_time, int pid, int tid) override;
-  std::vector<EventRecord> EndProfiling() override;
- private:
-  CudaProfiler() = default;
-  static void CUPTIAPI BufferRequested(uint8_t**, size_t*, size_t*);
-  static void CUPTIAPI BufferCompleted(CUcontext, uint32_t, uint8_t*, size_t, size_t);
-  struct KernelStat {
-    std::string name_ = {};
-    uint32_t stream_ = 0;
-    int32_t grid_x_ = 0;
-    int32_t grid_y_ = 0;
-    int32_t grid_z_ = 0;
-    int32_t block_x_ = 0;
-    int32_t block_y_ = 0;
-    int32_t block_z_ = 0;
-    int64_t start_ = 0;
-    int64_t stop_ = 0;
-  };
-  static OrtMutex mutex_;
-  static std::vector<KernelStat> stats_;
-  bool initialized_ = false;
-  TimePoint start_time_;
-  int pid_ = 0;
-  int tid_ = 0;
-  static std::atomic_flag enabled_;
-};
-
-OrtMutex CudaProfiler::mutex_;
-std::vector<CudaProfiler::KernelStat> CudaProfiler::stats_;
-std::atomic_flag CudaProfiler::enabled_;
-
-void CUPTIAPI CudaProfiler::BufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords) {
-  uint8_t* bfr = (uint8_t*)malloc(BUF_SIZE + ALIGN_SIZE);
-  ORT_ENFORCE(bfr, "Failed to allocate memory for cuda kernel profiling.");
-  *size = BUF_SIZE;
-  *buffer = ALIGN_BUFFER(bfr, ALIGN_SIZE);
-  *maxNumRecords = 0;
-}
-
-void CUPTIAPI CudaProfiler::BufferCompleted(CUcontext, uint32_t, uint8_t* buffer, size_t, size_t validSize) {
-  CUptiResult status;
-  CUpti_Activity* record = NULL;
-  if (validSize > 0) {
-    std::unique_lock<OrtMutex> lock(mutex_);
-    do {
-      status = cuptiActivityGetNextRecord(buffer, validSize, &record);
-      if (status == CUPTI_SUCCESS) {
-        if (CUPTI_ACTIVITY_KIND_KERNEL == record->kind) {
-          CUpti_ActivityKernel4* kernel = (CUpti_ActivityKernel4*)record;
-          stats_.push_back({kernel->name, kernel->streamId,
-                            kernel->gridX, kernel->gridY, kernel->gridZ,
-                            kernel->blockX, kernel->blockY, kernel->blockZ,
-                            static_cast<int64_t>(kernel->start),
-                            static_cast<int64_t>(kernel->end)});
-        }
-      } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
-        break;
-      }
-    } while (1);
-  }
-  free(buffer);
-}
-
-void CudaProfiler::StartProfiling(TimePoint start_time, int pid, int tid) {
-  if (!enabled_.test_and_set()) {
-    start_time_ = start_time;
-    pid_ = pid;
-    tid_ = tid;
-    if (cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL) == CUPTI_SUCCESS &&
-        cuptiActivityRegisterCallbacks(BufferRequested, BufferCompleted) == CUPTI_SUCCESS) {
-      initialized_ = true;
-    }
-  }
-}
-
-std::vector<EventRecord> CudaProfiler::EndProfiling() {
-  std::vector<EventRecord> events;
-  if (enabled_.test_and_set()) {
-    if (initialized_) {
-      cuptiActivityFlushAll(1);
-      std::unique_lock<OrtMutex> lock(mutex_);
-      int64_t profiling_start = std::chrono::duration_cast<nanoseconds>(start_time_.time_since_epoch()).count();
-      for (const auto& stat : stats_) {
-        std::initializer_list<std::pair<std::string, std::string>> args = {{"stream", std::to_string(stat.stream_)},
-                                                                           {"grid_x", std::to_string(stat.grid_x_)},
-                                                                           {"grid_y", std::to_string(stat.grid_y_)},
-                                                                           {"grid_z", std::to_string(stat.grid_z_)},
-                                                                           {"block_x", std::to_string(stat.block_x_)},
-                                                                           {"block_y", std::to_string(stat.block_y_)},
-                                                                           {"block_z", std::to_string(stat.block_z_)}};
-        events.push_back({EventCategory::KERNEL_EVENT, pid_, tid_, stat.name_, DUR(profiling_start, stat.stop_), DUR(stat.start_, stat.stop_), {args.begin(), args.end()}});
-      }
-      stats_.clear();
-    } else {
-      std::initializer_list<std::pair<std::string, std::string>> args;
-      events.push_back({EventCategory::KERNEL_EVENT, pid_, tid_, "not_available_due_to_cupti_error", 0, 0, {args.begin(), args.end()}});
-    }
-  }
-  enabled_.clear();
-  return events;
-}
-#endif //USE_CUDA
-
-DeviceProfiler* DeviceProfiler::GetDeviceProfiler() {
-#ifdef USE_CUDA
-  static CudaProfiler cuda_profiler;
-  return &cuda_profiler;
-#else
-  return nullptr;
-#endif
-}
-
 std::atomic<size_t> Profiler::global_max_num_events_{1000 * 1000};
 
 #ifdef ENABLE_STATIC_PROFILER_INSTANCE
@@ -153,11 +16,10 @@ profiling::Profiler::~Profiler() {
   instance_ = nullptr;
 }
 #else
-profiling::Profiler::~Profiler() {
-}
+profiling::Profiler::~Profiler() {}
 #endif
 
-::onnxruntime::TimePoint profiling::Profiler::Now() const {
+::onnxruntime::TimePoint profiling::Profiler::StartTime() const {
   ORT_ENFORCE(enabled_);
   return std::chrono::high_resolution_clock::now();
 }
@@ -180,11 +42,7 @@ void Profiler::StartProfiling(const logging::Logger* custom_logger) {
   enabled_ = true;
   profile_with_logger_ = true;
   custom_logger_ = custom_logger;
-  profiling_start_time_ = Now();
-  DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler();
-  if (device_profiler) {
-    device_profiler->StartProfiling(profiling_start_time_, logging::GetProcessId(), logging::GetThreadId());
-  }
+  profiling_start_time_ = StartTime();
 }
 
 template <typename T>
@@ -192,11 +50,7 @@ void Profiler::StartProfiling(const std::basic_string<T>& file_name) {
   enabled_ = true;
   profile_stream_.open(file_name, std::ios::out | std::ios::trunc);
   profile_stream_file_ = ToMBString(file_name);
-  profiling_start_time_ = Now();
-  DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler();
-  if (device_profiler) {
-    device_profiler->StartProfiling(profiling_start_time_, logging::GetProcessId(), logging::GetThreadId());
-  }
+  profiling_start_time_ = StartTime();
 }
 
 template void Profiler::StartProfiling<char>(const std::basic_string<char>& file_name);
@@ -204,32 +58,16 @@ template void Profiler::StartProfiling<char>(const std::basic_string<char>& file
 template void Profiler::StartProfiling<wchar_t>(const std::basic_string<wchar_t>& file_name);
 #endif
 
-void Profiler::EndTimeAndRecordEvent(EventCategory category,
-                                     const std::string& event_name,
-                                     const TimePoint& start_time, const TimePoint& end_time,
-                                     const std::initializer_list<std::pair<std::string, std::string>>& event_args,
-                                     bool sync_gpu) {
-  EndTimeAndRecordEvent(category, event_name, TimeDiffMicroSeconds(start_time, end_time),
-                        TimeDiffMicroSeconds(profiling_start_time_, start_time), event_args, sync_gpu);
-}
-
 void Profiler::EndTimeAndRecordEvent(EventCategory category,
                                      const std::string& event_name,
                                      const TimePoint& start_time,
                                      const std::initializer_list<std::pair<std::string, std::string>>& event_args,
-                                     bool sync_gpu) {
-  EndTimeAndRecordEvent(category, event_name, TimeDiffMicroSeconds(start_time),
-                        TimeDiffMicroSeconds(profiling_start_time_, start_time), event_args, sync_gpu);
-}
-
-void Profiler::EndTimeAndRecordEvent(EventCategory category,
-                                     const std::string& event_name,
-                                     long long duration,         //duration of the op
-                                     long long time_from_start,  //time difference between op start time and profiler start time
-                                     const std::initializer_list<std::pair<std::string, std::string>>& event_args,
                                      bool /*sync_gpu*/) {
+  long long dur = TimeDiffMicroSeconds(start_time);
+  long long ts = TimeDiffMicroSeconds(profiling_start_time_, start_time);
+
   EventRecord event(category, logging::GetProcessId(),
-                    logging::GetThreadId(), event_name, time_from_start, duration, {event_args.begin(), event_args.end()});
+                    logging::GetThreadId(), event_name, ts, dur, {event_args.begin(), event_args.end()});
   if (profile_with_logger_) {
     custom_logger_->SendProfileEvent(event);
   } else {
@@ -263,12 +101,6 @@ std::string Profiler::EndProfiling() {
   std::lock_guard<OrtMutex> lock(mutex_);
   profile_stream_ << "[\n";
 
-  DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler();
-  if (device_profiler) {
-    std::vector<EventRecord> device_events = device_profiler->EndProfiling();
-    std::copy(device_events.begin(), device_events.end(), std::back_inserter(events_));
-  }
-
   for (size_t i = 0; i < events_.size(); ++i) {
     auto& rec = events_[i];
     profile_stream_ << R"({"cat" : ")" << event_categor_names_[rec.cat] << "\",";
diff --git a/onnxruntime/core/common/profiler.h b/onnxruntime/core/common/profiler.h
index 8863a6b367..2ef1a8ac65 100644
--- a/onnxruntime/core/common/profiler.h
+++ b/onnxruntime/core/common/profiler.h
@@ -51,7 +51,7 @@ class Profiler {
   /*
   Produce current time point for any profiling action.
   */
-  TimePoint Now() const;
+  TimePoint StartTime() const;
 
   /*
   Whether data collection and output from this profiler is enabled.
@@ -78,12 +78,6 @@ class Profiler {
                              const std::initializer_list<std::pair<std::string, std::string>>& event_args = {},
                              bool sync_gpu = false);
 
-  void EndTimeAndRecordEvent(EventCategory category,
-                             const std::string& event_name,
-                             const TimePoint& start_time, const TimePoint& end_time,
-                             const std::initializer_list<std::pair<std::string, std::string>>& event_args = {},
-                             bool sync_gpu = false);
-
   /*
   Write profile data to the given stream in chrome format defined below.
   https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview#
@@ -115,13 +109,6 @@ class Profiler {
   }
 
  private:
-  void EndTimeAndRecordEvent(EventCategory category,
-                             const std::string& event_name,
-                             long long duration,         //duration of the op
-                             long long time_from_start,  //time difference between op start time and profiler start time
-                             const std::initializer_list<std::pair<std::string, std::string>>& event_args,
-                             bool sync_gpu = false);
-
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Profiler);
 
   /**
diff --git a/onnxruntime/core/framework/orttraining_partial_executor.cc b/onnxruntime/core/framework/orttraining_partial_executor.cc
index 0d9896a1f6..3690f643cc 100644
--- a/onnxruntime/core/framework/orttraining_partial_executor.cc
+++ b/onnxruntime/core/framework/orttraining_partial_executor.cc
@@ -144,7 +144,7 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
   size_t total_output_sizes = 0;
 
   if (is_profiler_enabled) {
-    tp = session_state.Profiler().Now();
+    tp = session_state.Profiler().StartTime();
   }
 
   ExecutionFrame& frame = state_.GetExecutionFrame(feed_mlvalue_idxs, feeds, fetch_mlvalue_idxs, fetches,
@@ -235,7 +235,7 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
     OpKernelContextInternal op_kernel_context(session_state, frame, *p_op_kernel, logger, false);
     // TODO: log kernel outputs?
     if (is_profiler_enabled) {
-      sync_time_begin = session_state.Profiler().Now();
+      sync_time_begin = session_state.Profiler().StartTime();
     }
 
     // sync before compute
@@ -289,7 +289,7 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
       // call compute on the kernel
       VLOGS(logger, 1) << "Computing kernel: " << node_name_for_profiling;
 
-      kernel_begin_time = session_state.Profiler().Now();
+      kernel_begin_time = session_state.Profiler().StartTime();
 
       // Calculate total input sizes for this operation.
       CalculateTotalInputSizes(&op_kernel_context, p_op_kernel,
@@ -373,7 +373,7 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
                                                           concurrency::ThreadPool::StopProfiling(
                                                               session_state.GetThreadPool())},
                                                      });
-      sync_time_begin = session_state.Profiler().Now();
+      sync_time_begin = session_state.Profiler().StartTime();
     }
 
     // sync after compute for outputs
@@ -493,4 +493,4 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
   return Status::OK();
 }
 }  // namespace onnxruntime
-#endif
\ No newline at end of file
+#endif
diff --git a/onnxruntime/core/framework/parallel_executor.cc b/onnxruntime/core/framework/parallel_executor.cc
index 129f9d3945..dc2fcd5162 100644
--- a/onnxruntime/core/framework/parallel_executor.cc
+++ b/onnxruntime/core/framework/parallel_executor.cc
@@ -35,7 +35,7 @@ Status ParallelExecutor::Execute(const SessionState& session_state, const std::v
   TimePoint tp;
   const bool is_profiler_enabled = session_state.Profiler().IsEnabled();
   if (is_profiler_enabled) {
-    tp = session_state.Profiler().Now();
+    tp = session_state.Profiler().StartTime();
   }
 
   root_frame_ = std::make_unique<ExecutionFrame>(feed_mlvalue_idxs, feeds, fetch_mlvalue_idxs, fetches,
@@ -118,7 +118,7 @@ Status ParallelExecutor::RunNodeAsync(size_t p_node_index,
   bool keep_running = true;
   const auto& graph_viewer = session_state.GetGraphViewer();
   TimePoint sync_time_begin;
-  TimePoint kernel_begin_time, kernel_end_time;
+  TimePoint kernel_begin_time;
   const bool f_profiler_enabled = session_state.Profiler().IsEnabled();
   const SequentialExecutionPlan& exec_plan = *session_state.GetExecutionPlan();
 
@@ -142,7 +142,7 @@ Status ParallelExecutor::RunNodeAsync(size_t p_node_index,
     OpKernelContextInternal op_kernel_context(session_state, *root_frame_, *p_op_kernel, logger, terminate_flag_);
 
     if (f_profiler_enabled) {
-      sync_time_begin = session_state.Profiler().Now();
+      sync_time_begin = session_state.Profiler().StartTime();
     }
     // sync before compute
     int queue_id = p_op_kernel->KernelDef().ExecQueueId();
@@ -183,7 +183,7 @@ Status ParallelExecutor::RunNodeAsync(size_t p_node_index,
                                                      sync_time_begin,
                                                      {{"op_name", p_op_kernel->KernelDef().OpName()}});
       concurrency::ThreadPool::StartProfiling(session_state.GetThreadPool());
-      kernel_begin_time = session_state.Profiler().Now();
+      kernel_begin_time = session_state.Profiler().StartTime();
     }
 
     // call compute on the kernel
@@ -216,15 +216,14 @@ Status ParallelExecutor::RunNodeAsync(size_t p_node_index,
     }
 
     if (f_profiler_enabled) {
-      kernel_end_time = session_state.Profiler().Now();
       session_state.Profiler().EndTimeAndRecordEvent(profiling::NODE_EVENT,
                                                      node.Name() + "_kernel_time",
-                                                     kernel_begin_time, kernel_end_time,
+                                                     kernel_begin_time,
                                                      {{"op_name", p_op_kernel->KernelDef().OpName()},
                                                       {"provider", p_op_kernel->KernelDef().Provider()},
                                                       {"thread_scheduling_stats", concurrency::ThreadPool::StopProfiling(session_state.GetThreadPool())}});
 
-      sync_time_begin = session_state.Profiler().Now();
+      sync_time_begin = session_state.Profiler().StartTime();
     }
     // sync after compute for outputs
     if (exec_plan.NodeHasFence(node_index)) {
diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc
index cf0ab6ea3c..9818b937f1 100644
--- a/onnxruntime/core/framework/sequential_executor.cc
+++ b/onnxruntime/core/framework/sequential_executor.cc
@@ -130,13 +130,13 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
   const bool is_profiler_enabled = session_state.Profiler().IsEnabled();
   TimePoint tp;
   TimePoint sync_time_begin;
-  TimePoint kernel_begin_time, kernel_end_time;
+  TimePoint kernel_begin_time;
   size_t input_activation_sizes = 0;
   size_t input_parameter_sizes = 0;
   size_t total_output_sizes = 0;
 
   if (is_profiler_enabled) {
-    tp = session_state.Profiler().Now();
+    tp = session_state.Profiler().StartTime();
   }
 
   ExecutionFrame frame{feed_mlvalue_idxs, feeds, fetch_mlvalue_idxs, fetches, fetch_allocators, session_state};
@@ -235,7 +235,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
     OpKernelContextInternal op_kernel_context(session_state, frame, *p_op_kernel, logger, terminate_flag_);
     // TODO: log kernel outputs?
     if (is_profiler_enabled) {
-      sync_time_begin = session_state.Profiler().Now();
+      sync_time_begin = session_state.Profiler().StartTime();
     }
 
     // sync before compute
@@ -289,10 +289,11 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
       // call compute on the kernel
       VLOGS(logger, 1) << "Computing kernel: " << node_name_for_profiling;
 
+      kernel_begin_time = session_state.Profiler().StartTime();
+
       // Calculate total input sizes for this operation.
       CalculateTotalInputSizes(&op_kernel_context, p_op_kernel,
                                input_activation_sizes, input_parameter_sizes, node_name_for_profiling);
-      kernel_begin_time = session_state.Profiler().Now();
     }
 
     Status compute_status;
@@ -340,7 +341,6 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
     }
 
     if (is_profiler_enabled) {
-      kernel_end_time = session_state.Profiler().Now();
       // Calculate total output sizes for this operation.
       CalculateTotalOutputSizes(&op_kernel_context, total_output_sizes, node_name_for_profiling);
 
@@ -356,9 +356,10 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
                 << " Output_Size=" << total_output_sizes
                 << "\n";
 #endif
+
       session_state.Profiler().EndTimeAndRecordEvent(profiling::NODE_EVENT,
                                                      node_name_for_profiling + "_kernel_time",
-                                                     kernel_begin_time, kernel_end_time,
+                                                     kernel_begin_time,
                                                      // Log additional operation args / info.
                                                      {
                                                          {"op_name", p_op_kernel->KernelDef().OpName()},
@@ -370,7 +371,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
                                                          {"output_size", std::to_string(total_output_sizes)},
                                                          {"thread_scheduling_stats", concurrency::ThreadPool::StopProfiling(session_state.GetThreadPool())},
                                                      });
-      sync_time_begin = session_state.Profiler().Now();
+      sync_time_begin = session_state.Profiler().StartTime();
     }
 
     // sync after compute for outputs
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index bd7632bbab..09bb97781d 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -564,7 +564,7 @@ common::Status InferenceSession::Load(std::function<common::Status(std::shared_p
   Status status = Status::OK();
   TimePoint tp;
   if (session_profiler_.IsEnabled()) {
-    tp = session_profiler_.Now();
+    tp = session_profiler_.StartTime();
   }
   ORT_TRY {
     std::lock_guard<onnxruntime::OrtMutex> l(session_mutex_);
@@ -1134,7 +1134,7 @@ common::Status InferenceSession::Initialize() {
   Status status = Status::OK();
   TimePoint tp;
   if (session_profiler_.IsEnabled()) {
-    tp = session_profiler_.Now();
+    tp = session_profiler_.StartTime();
   }
 
   ORT_TRY {
@@ -1612,7 +1612,7 @@ Status InferenceSession::Run(const RunOptions& run_options,
                              const std::vector<OrtDevice>* p_fetches_device_info) {
   TimePoint tp;
   if (session_profiler_.IsEnabled()) {
-    tp = session_profiler_.Now();
+    tp = session_profiler_.StartTime();
   }
 
 #ifdef ONNXRUNTIME_ENABLE_INSTRUMENT
diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc
index 5ed111d877..9fb9f23b4a 100644
--- a/onnxruntime/test/framework/inference_session_test.cc
+++ b/onnxruntime/test/framework/inference_session_test.cc
@@ -649,16 +649,11 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) {
   ASSERT_TRUE(lines[size - 1].find("]") != string::npos);
   std::vector<std::string> tags = {"pid", "dur", "ts", "ph", "X", "name", "args"};
 
-  bool has_kernel_info = false;
   for (size_t i = 1; i < size - 1; ++i) {
     for (auto& s : tags) {
       ASSERT_TRUE(lines[i].find(s) != string::npos);
-      has_kernel_info = has_kernel_info || (lines[i].find("Kernel") != string::npos);
     }
   }
-#ifdef USE_CUDA
-  ASSERT_TRUE(has_kernel_info);
-#endif
 }
 
 TEST(InferenceSessionTests, CheckRunProfilerWithStartProfile) {
@@ -679,23 +674,24 @@ TEST(InferenceSessionTests, CheckRunProfilerWithStartProfile) {
 
   std::ifstream profile(profile_file);
   std::string line;
-  std::vector<std::string> lines;
 
-  while (std::getline(profile, line)) {
-    lines.push_back(line);
-  }
-
-  auto size = lines.size();
-  ASSERT_TRUE(size > 1);
-  ASSERT_TRUE(lines[0].find("[") != string::npos);
-  ASSERT_TRUE(lines[1].find("mul_1_fence_before") != string::npos);
-  ASSERT_TRUE(lines[size - 1].find("]") != string::npos);
   std::vector<std::string> tags = {"pid", "dur", "ts", "ph", "X", "name", "args"};
-
-  for (size_t i = 1; i < size - 1; ++i) {
-    for (auto& s : tags) {
-      ASSERT_TRUE(lines[i].find(s) != string::npos);
+  int count = 0;
+  while (std::getline(profile, line)) {
+    if (count == 0) {
+      ASSERT_TRUE(line.find("[") != string::npos);
+    } else if (count <= 5) {
+      for (auto& s : tags) {
+        ASSERT_TRUE(line.find(s) != string::npos);
+      }
+    } else {
+      ASSERT_TRUE(line.find("]") != string::npos);
     }
+
+    if (count == 1) {
+      ASSERT_TRUE(line.find("mul_1_fence_before") != string::npos);
+    }
+    count++;
   }
 }
 
diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py
index a9f45f82fa..15751d3166 100644
--- a/onnxruntime/test/python/onnxruntime_test_python.py
+++ b/onnxruntime/test/python/onnxruntime_test_python.py
@@ -595,13 +595,11 @@ class TestInferenceSession(unittest.TestCase):
         tags = ['pid', 'dur', 'ts', 'ph', 'X', 'name', 'args']
         with open(profile_file) as f:
             lines = f.readlines()
-            lines_len = len(lines)
-            self.assertTrue(lines_len > 8)
             self.assertTrue('[' in lines[0])
-            for i in range(1, lines_len-1):
+            for i in range(1, 8):
                 for tag in tags:
                     self.assertTrue(tag in lines[i])
-            self.assertTrue(']' in lines[-1])
+            self.assertTrue(']' in lines[8])
 
     def testProfilerGetStartTimeNs(self):
         def getSingleSessionProfilingStartTime():
diff --git a/tools/ci_build/github/windows/setup_env_cuda.bat b/tools/ci_build/github/windows/setup_env_cuda.bat
index b89dbb16db..9e51d68256 100644
--- a/tools/ci_build/github/windows/setup_env_cuda.bat
+++ b/tools/ci_build/github/windows/setup_env_cuda.bat
@@ -1,2 +1,2 @@
-set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\extras\CUPTI\lib64;C:\local\cudnn-10.2-windows10-x64-v8.0.3.33\cuda\bin;%PATH%
+set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin;C:\local\cudnn-10.2-windows10-x64-v8.0.3.33\cuda\bin;%PATH%
 set GRADLE_OPTS=-Dorg.gradle.daemon=false
diff --git a/tools/ci_build/github/windows/setup_env_cuda_11.bat b/tools/ci_build/github/windows/setup_env_cuda_11.bat
index a10f5d7b68..1ab8e7ee73 100644
--- a/tools/ci_build/github/windows/setup_env_cuda_11.bat
+++ b/tools/ci_build/github/windows/setup_env_cuda_11.bat
@@ -1,2 +1,2 @@
-set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\extras\CUPTI\lib64;C:\local\cudnn-11.0-windows-x64-v8.0.2.39\cuda\bin;%PATH%
+set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin;C:\local\cudnn-11.0-windows-x64-v8.0.2.39\cuda\bin;%PATH%
 set GRADLE_OPTS=-Dorg.gradle.daemon=false
diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat
index 41524a3465..f5ff7efb45 100644
--- a/tools/ci_build/github/windows/setup_env_trt.bat
+++ b/tools/ci_build/github/windows/setup_env_trt.bat
@@ -1,2 +1,2 @@
-set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\extras\CUPTI\lib64;C:\local\cudnn-11.1-windows-x64-v8.0.5.39\cuda\bin;%PATH%
+set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin;C:\local\cudnn-11.1-windows-x64-v8.0.5.39\cuda\bin;%PATH%
 set GRADLE_OPTS=-Dorg.gradle.daemon=false