diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake index 61af21ab36..b572cc89d6 100644 --- a/cmake/onnxruntime_common.cmake +++ b/cmake/onnxruntime_common.cmake @@ -90,12 +90,6 @@ source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_common_src}) onnxruntime_add_static_library(onnxruntime_common ${onnxruntime_common_src}) -if (onnxruntime_USE_CUDA) - target_include_directories(onnxruntime_common PUBLIC ${onnxruntime_CUDA_HOME}/include ${onnxruntime_CUDA_HOME}/extras/CUPTI/include) - target_link_directories(onnxruntime_common PUBLIC ${onnxruntime_CUDA_HOME}/extras/CUPTI/lib64) - target_link_libraries(onnxruntime_common cupti) -endif() - if (onnxruntime_USE_TELEMETRY) set_target_properties(onnxruntime_common PROPERTIES COMPILE_FLAGS "/FI${ONNXRUNTIME_INCLUDE_DIR}/core/platform/windows/TraceLoggingConfigPrivate.h") endif() diff --git a/include/onnxruntime/core/common/logging/logging.h b/include/onnxruntime/core/common/logging/logging.h index 95b6a4aa6d..96c07f09c4 100644 --- a/include/onnxruntime/core/common/logging/logging.h +++ b/include/onnxruntime/core/common/logging/logging.h @@ -55,7 +55,6 @@ namespace profiling { enum EventCategory { SESSION_EVENT = 0, NODE_EVENT, - KERNEL_EVENT, EVENT_CATEGORY_MAX }; @@ -64,8 +63,7 @@ Event descriptions for the above session events. */ static constexpr const char* event_categor_names_[EVENT_CATEGORY_MAX] = { "Session", - "Node", - "Kernel"}; + "Node"}; /* Timing record for all events. diff --git a/onnxruntime/core/common/profiler.cc b/onnxruntime/core/common/profiler.cc index 173208c8b8..ee752a6a62 100644 --- a/onnxruntime/core/common/profiler.cc +++ b/onnxruntime/core/common/profiler.cc @@ -2,148 +2,11 @@ // Licensed under the MIT License. #include "profiler.h" -#include - -#ifdef USE_CUDA -#include -#endif namespace onnxruntime { namespace profiling { using namespace std::chrono; -class DeviceProfiler { - public: - static DeviceProfiler* GetDeviceProfiler(); - virtual void StartProfiling(TimePoint start_time, int pid, int tid) = 0; - virtual std::vector EndProfiling() = 0; - virtual ~DeviceProfiler() = default; -}; - -#ifdef USE_CUDA -#define BUF_SIZE (32 * 1024) -#define ALIGN_SIZE (8) -#define ALIGN_BUFFER(buffer, align) \ - (((uintptr_t)(buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1))) : (buffer)) -#define DUR(s, e) std::lround(static_cast(e - s) / 1000) - -class CudaProfiler final: public DeviceProfiler { - public: - friend class DeviceProfiler; - ~CudaProfiler() = default; - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CudaProfiler); - void StartProfiling(TimePoint start_time, int pid, int tid) override; - std::vector EndProfiling() override; - private: - CudaProfiler() = default; - static void CUPTIAPI BufferRequested(uint8_t**, size_t*, size_t*); - static void CUPTIAPI BufferCompleted(CUcontext, uint32_t, uint8_t*, size_t, size_t); - struct KernelStat { - std::string name_ = {}; - uint32_t stream_ = 0; - int32_t grid_x_ = 0; - int32_t grid_y_ = 0; - int32_t grid_z_ = 0; - int32_t block_x_ = 0; - int32_t block_y_ = 0; - int32_t block_z_ = 0; - int64_t start_ = 0; - int64_t stop_ = 0; - }; - static OrtMutex mutex_; - static std::vector stats_; - bool initialized_ = false; - TimePoint start_time_; - int pid_ = 0; - int tid_ = 0; - static std::atomic_flag enabled_; -}; - -OrtMutex CudaProfiler::mutex_; -std::vector CudaProfiler::stats_; -std::atomic_flag CudaProfiler::enabled_; - -void CUPTIAPI CudaProfiler::BufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords) { - uint8_t* bfr = (uint8_t*)malloc(BUF_SIZE + ALIGN_SIZE); - ORT_ENFORCE(bfr, "Failed to allocate memory for cuda kernel profiling."); - *size = BUF_SIZE; - *buffer = ALIGN_BUFFER(bfr, ALIGN_SIZE); - *maxNumRecords = 0; -} - -void CUPTIAPI CudaProfiler::BufferCompleted(CUcontext, uint32_t, uint8_t* buffer, size_t, size_t validSize) { - CUptiResult status; - CUpti_Activity* record = NULL; - if (validSize > 0) { - std::unique_lock lock(mutex_); - do { - status = cuptiActivityGetNextRecord(buffer, validSize, &record); - if (status == CUPTI_SUCCESS) { - if (CUPTI_ACTIVITY_KIND_KERNEL == record->kind) { - CUpti_ActivityKernel4* kernel = (CUpti_ActivityKernel4*)record; - stats_.push_back({kernel->name, kernel->streamId, - kernel->gridX, kernel->gridY, kernel->gridZ, - kernel->blockX, kernel->blockY, kernel->blockZ, - static_cast(kernel->start), - static_cast(kernel->end)}); - } - } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) { - break; - } - } while (1); - } - free(buffer); -} - -void CudaProfiler::StartProfiling(TimePoint start_time, int pid, int tid) { - if (!enabled_.test_and_set()) { - start_time_ = start_time; - pid_ = pid; - tid_ = tid; - if (cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL) == CUPTI_SUCCESS && - cuptiActivityRegisterCallbacks(BufferRequested, BufferCompleted) == CUPTI_SUCCESS) { - initialized_ = true; - } - } -} - -std::vector CudaProfiler::EndProfiling() { - std::vector events; - if (enabled_.test_and_set()) { - if (initialized_) { - cuptiActivityFlushAll(1); - std::unique_lock lock(mutex_); - int64_t profiling_start = std::chrono::duration_cast(start_time_.time_since_epoch()).count(); - for (const auto& stat : stats_) { - std::initializer_list> args = {{"stream", std::to_string(stat.stream_)}, - {"grid_x", std::to_string(stat.grid_x_)}, - {"grid_y", std::to_string(stat.grid_y_)}, - {"grid_z", std::to_string(stat.grid_z_)}, - {"block_x", std::to_string(stat.block_x_)}, - {"block_y", std::to_string(stat.block_y_)}, - {"block_z", std::to_string(stat.block_z_)}}; - events.push_back({EventCategory::KERNEL_EVENT, pid_, tid_, stat.name_, DUR(profiling_start, stat.stop_), DUR(stat.start_, stat.stop_), {args.begin(), args.end()}}); - } - stats_.clear(); - } else { - std::initializer_list> args; - events.push_back({EventCategory::KERNEL_EVENT, pid_, tid_, "not_available_due_to_cupti_error", 0, 0, {args.begin(), args.end()}}); - } - } - enabled_.clear(); - return events; -} -#endif //USE_CUDA - -DeviceProfiler* DeviceProfiler::GetDeviceProfiler() { -#ifdef USE_CUDA - static CudaProfiler cuda_profiler; - return &cuda_profiler; -#else - return nullptr; -#endif -} - std::atomic Profiler::global_max_num_events_{1000 * 1000}; #ifdef ENABLE_STATIC_PROFILER_INSTANCE @@ -153,11 +16,10 @@ profiling::Profiler::~Profiler() { instance_ = nullptr; } #else -profiling::Profiler::~Profiler() { -} +profiling::Profiler::~Profiler() {} #endif -::onnxruntime::TimePoint profiling::Profiler::Now() const { +::onnxruntime::TimePoint profiling::Profiler::StartTime() const { ORT_ENFORCE(enabled_); return std::chrono::high_resolution_clock::now(); } @@ -180,11 +42,7 @@ void Profiler::StartProfiling(const logging::Logger* custom_logger) { enabled_ = true; profile_with_logger_ = true; custom_logger_ = custom_logger; - profiling_start_time_ = Now(); - DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler(); - if (device_profiler) { - device_profiler->StartProfiling(profiling_start_time_, logging::GetProcessId(), logging::GetThreadId()); - } + profiling_start_time_ = StartTime(); } template @@ -192,11 +50,7 @@ void Profiler::StartProfiling(const std::basic_string& file_name) { enabled_ = true; profile_stream_.open(file_name, std::ios::out | std::ios::trunc); profile_stream_file_ = ToMBString(file_name); - profiling_start_time_ = Now(); - DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler(); - if (device_profiler) { - device_profiler->StartProfiling(profiling_start_time_, logging::GetProcessId(), logging::GetThreadId()); - } + profiling_start_time_ = StartTime(); } template void Profiler::StartProfiling(const std::basic_string& file_name); @@ -204,32 +58,16 @@ template void Profiler::StartProfiling(const std::basic_string& file template void Profiler::StartProfiling(const std::basic_string& file_name); #endif -void Profiler::EndTimeAndRecordEvent(EventCategory category, - const std::string& event_name, - const TimePoint& start_time, const TimePoint& end_time, - const std::initializer_list>& event_args, - bool sync_gpu) { - EndTimeAndRecordEvent(category, event_name, TimeDiffMicroSeconds(start_time, end_time), - TimeDiffMicroSeconds(profiling_start_time_, start_time), event_args, sync_gpu); -} - void Profiler::EndTimeAndRecordEvent(EventCategory category, const std::string& event_name, const TimePoint& start_time, const std::initializer_list>& event_args, - bool sync_gpu) { - EndTimeAndRecordEvent(category, event_name, TimeDiffMicroSeconds(start_time), - TimeDiffMicroSeconds(profiling_start_time_, start_time), event_args, sync_gpu); -} - -void Profiler::EndTimeAndRecordEvent(EventCategory category, - const std::string& event_name, - long long duration, //duration of the op - long long time_from_start, //time difference between op start time and profiler start time - const std::initializer_list>& event_args, bool /*sync_gpu*/) { + long long dur = TimeDiffMicroSeconds(start_time); + long long ts = TimeDiffMicroSeconds(profiling_start_time_, start_time); + EventRecord event(category, logging::GetProcessId(), - logging::GetThreadId(), event_name, time_from_start, duration, {event_args.begin(), event_args.end()}); + logging::GetThreadId(), event_name, ts, dur, {event_args.begin(), event_args.end()}); if (profile_with_logger_) { custom_logger_->SendProfileEvent(event); } else { @@ -263,12 +101,6 @@ std::string Profiler::EndProfiling() { std::lock_guard lock(mutex_); profile_stream_ << "[\n"; - DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler(); - if (device_profiler) { - std::vector device_events = device_profiler->EndProfiling(); - std::copy(device_events.begin(), device_events.end(), std::back_inserter(events_)); - } - for (size_t i = 0; i < events_.size(); ++i) { auto& rec = events_[i]; profile_stream_ << R"({"cat" : ")" << event_categor_names_[rec.cat] << "\","; diff --git a/onnxruntime/core/common/profiler.h b/onnxruntime/core/common/profiler.h index 8863a6b367..2ef1a8ac65 100644 --- a/onnxruntime/core/common/profiler.h +++ b/onnxruntime/core/common/profiler.h @@ -51,7 +51,7 @@ class Profiler { /* Produce current time point for any profiling action. */ - TimePoint Now() const; + TimePoint StartTime() const; /* Whether data collection and output from this profiler is enabled. @@ -78,12 +78,6 @@ class Profiler { const std::initializer_list>& event_args = {}, bool sync_gpu = false); - void EndTimeAndRecordEvent(EventCategory category, - const std::string& event_name, - const TimePoint& start_time, const TimePoint& end_time, - const std::initializer_list>& event_args = {}, - bool sync_gpu = false); - /* Write profile data to the given stream in chrome format defined below. https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview# @@ -115,13 +109,6 @@ class Profiler { } private: - void EndTimeAndRecordEvent(EventCategory category, - const std::string& event_name, - long long duration, //duration of the op - long long time_from_start, //time difference between op start time and profiler start time - const std::initializer_list>& event_args, - bool sync_gpu = false); - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Profiler); /** diff --git a/onnxruntime/core/framework/orttraining_partial_executor.cc b/onnxruntime/core/framework/orttraining_partial_executor.cc index 0d9896a1f6..3690f643cc 100644 --- a/onnxruntime/core/framework/orttraining_partial_executor.cc +++ b/onnxruntime/core/framework/orttraining_partial_executor.cc @@ -144,7 +144,7 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve size_t total_output_sizes = 0; if (is_profiler_enabled) { - tp = session_state.Profiler().Now(); + tp = session_state.Profiler().StartTime(); } ExecutionFrame& frame = state_.GetExecutionFrame(feed_mlvalue_idxs, feeds, fetch_mlvalue_idxs, fetches, @@ -235,7 +235,7 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve OpKernelContextInternal op_kernel_context(session_state, frame, *p_op_kernel, logger, false); // TODO: log kernel outputs? if (is_profiler_enabled) { - sync_time_begin = session_state.Profiler().Now(); + sync_time_begin = session_state.Profiler().StartTime(); } // sync before compute @@ -289,7 +289,7 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve // call compute on the kernel VLOGS(logger, 1) << "Computing kernel: " << node_name_for_profiling; - kernel_begin_time = session_state.Profiler().Now(); + kernel_begin_time = session_state.Profiler().StartTime(); // Calculate total input sizes for this operation. CalculateTotalInputSizes(&op_kernel_context, p_op_kernel, @@ -373,7 +373,7 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve concurrency::ThreadPool::StopProfiling( session_state.GetThreadPool())}, }); - sync_time_begin = session_state.Profiler().Now(); + sync_time_begin = session_state.Profiler().StartTime(); } // sync after compute for outputs @@ -493,4 +493,4 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve return Status::OK(); } } // namespace onnxruntime -#endif \ No newline at end of file +#endif diff --git a/onnxruntime/core/framework/parallel_executor.cc b/onnxruntime/core/framework/parallel_executor.cc index 129f9d3945..dc2fcd5162 100644 --- a/onnxruntime/core/framework/parallel_executor.cc +++ b/onnxruntime/core/framework/parallel_executor.cc @@ -35,7 +35,7 @@ Status ParallelExecutor::Execute(const SessionState& session_state, const std::v TimePoint tp; const bool is_profiler_enabled = session_state.Profiler().IsEnabled(); if (is_profiler_enabled) { - tp = session_state.Profiler().Now(); + tp = session_state.Profiler().StartTime(); } root_frame_ = std::make_unique(feed_mlvalue_idxs, feeds, fetch_mlvalue_idxs, fetches, @@ -118,7 +118,7 @@ Status ParallelExecutor::RunNodeAsync(size_t p_node_index, bool keep_running = true; const auto& graph_viewer = session_state.GetGraphViewer(); TimePoint sync_time_begin; - TimePoint kernel_begin_time, kernel_end_time; + TimePoint kernel_begin_time; const bool f_profiler_enabled = session_state.Profiler().IsEnabled(); const SequentialExecutionPlan& exec_plan = *session_state.GetExecutionPlan(); @@ -142,7 +142,7 @@ Status ParallelExecutor::RunNodeAsync(size_t p_node_index, OpKernelContextInternal op_kernel_context(session_state, *root_frame_, *p_op_kernel, logger, terminate_flag_); if (f_profiler_enabled) { - sync_time_begin = session_state.Profiler().Now(); + sync_time_begin = session_state.Profiler().StartTime(); } // sync before compute int queue_id = p_op_kernel->KernelDef().ExecQueueId(); @@ -183,7 +183,7 @@ Status ParallelExecutor::RunNodeAsync(size_t p_node_index, sync_time_begin, {{"op_name", p_op_kernel->KernelDef().OpName()}}); concurrency::ThreadPool::StartProfiling(session_state.GetThreadPool()); - kernel_begin_time = session_state.Profiler().Now(); + kernel_begin_time = session_state.Profiler().StartTime(); } // call compute on the kernel @@ -216,15 +216,14 @@ Status ParallelExecutor::RunNodeAsync(size_t p_node_index, } if (f_profiler_enabled) { - kernel_end_time = session_state.Profiler().Now(); session_state.Profiler().EndTimeAndRecordEvent(profiling::NODE_EVENT, node.Name() + "_kernel_time", - kernel_begin_time, kernel_end_time, + kernel_begin_time, {{"op_name", p_op_kernel->KernelDef().OpName()}, {"provider", p_op_kernel->KernelDef().Provider()}, {"thread_scheduling_stats", concurrency::ThreadPool::StopProfiling(session_state.GetThreadPool())}}); - sync_time_begin = session_state.Profiler().Now(); + sync_time_begin = session_state.Profiler().StartTime(); } // sync after compute for outputs if (exec_plan.NodeHasFence(node_index)) { diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc index cf0ab6ea3c..9818b937f1 100644 --- a/onnxruntime/core/framework/sequential_executor.cc +++ b/onnxruntime/core/framework/sequential_executor.cc @@ -130,13 +130,13 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std: const bool is_profiler_enabled = session_state.Profiler().IsEnabled(); TimePoint tp; TimePoint sync_time_begin; - TimePoint kernel_begin_time, kernel_end_time; + TimePoint kernel_begin_time; size_t input_activation_sizes = 0; size_t input_parameter_sizes = 0; size_t total_output_sizes = 0; if (is_profiler_enabled) { - tp = session_state.Profiler().Now(); + tp = session_state.Profiler().StartTime(); } ExecutionFrame frame{feed_mlvalue_idxs, feeds, fetch_mlvalue_idxs, fetches, fetch_allocators, session_state}; @@ -235,7 +235,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std: OpKernelContextInternal op_kernel_context(session_state, frame, *p_op_kernel, logger, terminate_flag_); // TODO: log kernel outputs? if (is_profiler_enabled) { - sync_time_begin = session_state.Profiler().Now(); + sync_time_begin = session_state.Profiler().StartTime(); } // sync before compute @@ -289,10 +289,11 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std: // call compute on the kernel VLOGS(logger, 1) << "Computing kernel: " << node_name_for_profiling; + kernel_begin_time = session_state.Profiler().StartTime(); + // Calculate total input sizes for this operation. CalculateTotalInputSizes(&op_kernel_context, p_op_kernel, input_activation_sizes, input_parameter_sizes, node_name_for_profiling); - kernel_begin_time = session_state.Profiler().Now(); } Status compute_status; @@ -340,7 +341,6 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std: } if (is_profiler_enabled) { - kernel_end_time = session_state.Profiler().Now(); // Calculate total output sizes for this operation. CalculateTotalOutputSizes(&op_kernel_context, total_output_sizes, node_name_for_profiling); @@ -356,9 +356,10 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std: << " Output_Size=" << total_output_sizes << "\n"; #endif + session_state.Profiler().EndTimeAndRecordEvent(profiling::NODE_EVENT, node_name_for_profiling + "_kernel_time", - kernel_begin_time, kernel_end_time, + kernel_begin_time, // Log additional operation args / info. { {"op_name", p_op_kernel->KernelDef().OpName()}, @@ -370,7 +371,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std: {"output_size", std::to_string(total_output_sizes)}, {"thread_scheduling_stats", concurrency::ThreadPool::StopProfiling(session_state.GetThreadPool())}, }); - sync_time_begin = session_state.Profiler().Now(); + sync_time_begin = session_state.Profiler().StartTime(); } // sync after compute for outputs diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index bd7632bbab..09bb97781d 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -564,7 +564,7 @@ common::Status InferenceSession::Load(std::function l(session_mutex_); @@ -1134,7 +1134,7 @@ common::Status InferenceSession::Initialize() { Status status = Status::OK(); TimePoint tp; if (session_profiler_.IsEnabled()) { - tp = session_profiler_.Now(); + tp = session_profiler_.StartTime(); } ORT_TRY { @@ -1612,7 +1612,7 @@ Status InferenceSession::Run(const RunOptions& run_options, const std::vector* p_fetches_device_info) { TimePoint tp; if (session_profiler_.IsEnabled()) { - tp = session_profiler_.Now(); + tp = session_profiler_.StartTime(); } #ifdef ONNXRUNTIME_ENABLE_INSTRUMENT diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 5ed111d877..9fb9f23b4a 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -649,16 +649,11 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) { ASSERT_TRUE(lines[size - 1].find("]") != string::npos); std::vector tags = {"pid", "dur", "ts", "ph", "X", "name", "args"}; - bool has_kernel_info = false; for (size_t i = 1; i < size - 1; ++i) { for (auto& s : tags) { ASSERT_TRUE(lines[i].find(s) != string::npos); - has_kernel_info = has_kernel_info || (lines[i].find("Kernel") != string::npos); } } -#ifdef USE_CUDA - ASSERT_TRUE(has_kernel_info); -#endif } TEST(InferenceSessionTests, CheckRunProfilerWithStartProfile) { @@ -679,23 +674,24 @@ TEST(InferenceSessionTests, CheckRunProfilerWithStartProfile) { std::ifstream profile(profile_file); std::string line; - std::vector lines; - while (std::getline(profile, line)) { - lines.push_back(line); - } - - auto size = lines.size(); - ASSERT_TRUE(size > 1); - ASSERT_TRUE(lines[0].find("[") != string::npos); - ASSERT_TRUE(lines[1].find("mul_1_fence_before") != string::npos); - ASSERT_TRUE(lines[size - 1].find("]") != string::npos); std::vector tags = {"pid", "dur", "ts", "ph", "X", "name", "args"}; - - for (size_t i = 1; i < size - 1; ++i) { - for (auto& s : tags) { - ASSERT_TRUE(lines[i].find(s) != string::npos); + int count = 0; + while (std::getline(profile, line)) { + if (count == 0) { + ASSERT_TRUE(line.find("[") != string::npos); + } else if (count <= 5) { + for (auto& s : tags) { + ASSERT_TRUE(line.find(s) != string::npos); + } + } else { + ASSERT_TRUE(line.find("]") != string::npos); } + + if (count == 1) { + ASSERT_TRUE(line.find("mul_1_fence_before") != string::npos); + } + count++; } } diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index a9f45f82fa..15751d3166 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -595,13 +595,11 @@ class TestInferenceSession(unittest.TestCase): tags = ['pid', 'dur', 'ts', 'ph', 'X', 'name', 'args'] with open(profile_file) as f: lines = f.readlines() - lines_len = len(lines) - self.assertTrue(lines_len > 8) self.assertTrue('[' in lines[0]) - for i in range(1, lines_len-1): + for i in range(1, 8): for tag in tags: self.assertTrue(tag in lines[i]) - self.assertTrue(']' in lines[-1]) + self.assertTrue(']' in lines[8]) def testProfilerGetStartTimeNs(self): def getSingleSessionProfilingStartTime(): diff --git a/tools/ci_build/github/windows/setup_env_cuda.bat b/tools/ci_build/github/windows/setup_env_cuda.bat index b89dbb16db..9e51d68256 100644 --- a/tools/ci_build/github/windows/setup_env_cuda.bat +++ b/tools/ci_build/github/windows/setup_env_cuda.bat @@ -1,2 +1,2 @@ -set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\extras\CUPTI\lib64;C:\local\cudnn-10.2-windows10-x64-v8.0.3.33\cuda\bin;%PATH% +set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin;C:\local\cudnn-10.2-windows10-x64-v8.0.3.33\cuda\bin;%PATH% set GRADLE_OPTS=-Dorg.gradle.daemon=false diff --git a/tools/ci_build/github/windows/setup_env_cuda_11.bat b/tools/ci_build/github/windows/setup_env_cuda_11.bat index a10f5d7b68..1ab8e7ee73 100644 --- a/tools/ci_build/github/windows/setup_env_cuda_11.bat +++ b/tools/ci_build/github/windows/setup_env_cuda_11.bat @@ -1,2 +1,2 @@ -set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\extras\CUPTI\lib64;C:\local\cudnn-11.0-windows-x64-v8.0.2.39\cuda\bin;%PATH% +set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin;C:\local\cudnn-11.0-windows-x64-v8.0.2.39\cuda\bin;%PATH% set GRADLE_OPTS=-Dorg.gradle.daemon=false diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat index 41524a3465..f5ff7efb45 100644 --- a/tools/ci_build/github/windows/setup_env_trt.bat +++ b/tools/ci_build/github/windows/setup_env_trt.bat @@ -1,2 +1,2 @@ -set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\extras\CUPTI\lib64;C:\local\cudnn-11.1-windows-x64-v8.0.5.39\cuda\bin;%PATH% +set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin;C:\local\cudnn-11.1-windows-x64-v8.0.5.39\cuda\bin;%PATH% set GRADLE_OPTS=-Dorg.gradle.daemon=false