mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-14 20:48:00 +00:00
Cuda Profiler (#7110)
* implement cuda profiler * add counters * downgrade cupti kernel version * move mutex * add cupti to path * fix win gpu build err * add path for cuda10 * fix linux com err * extend include path * add init flag * fix test case * fix tensorrt pipeline * add UT Co-authored-by: Ubuntu <randysheriff@rashuai-linux-gpu-3.3cfnmjowvu4e5bidlsmcxsmzwg.xx.internal.cloudapp.net>
This commit is contained in:
parent
b22e60bd44
commit
aeca7c2940
8 changed files with 207 additions and 37 deletions
|
|
@ -79,6 +79,12 @@ source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_common_src})
|
|||
|
||||
add_library(onnxruntime_common ${onnxruntime_common_src})
|
||||
|
||||
if (onnxruntime_USE_CUDA)
|
||||
target_include_directories(onnxruntime_common PUBLIC ${onnxruntime_CUDA_HOME}/include ${onnxruntime_CUDA_HOME}/extras/CUPTI/include)
|
||||
target_link_directories(onnxruntime_common PUBLIC ${onnxruntime_CUDA_HOME}/extras/CUPTI/lib64)
|
||||
target_link_libraries(onnxruntime_common cupti)
|
||||
endif()
|
||||
|
||||
if (onnxruntime_USE_TELEMETRY)
|
||||
set_target_properties(onnxruntime_common PROPERTIES COMPILE_FLAGS "/FI${ONNXRUNTIME_INCLUDE_DIR}/core/platform/windows/TraceLoggingConfigPrivate.h")
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -55,6 +55,7 @@ namespace profiling {
|
|||
enum EventCategory {
|
||||
SESSION_EVENT = 0,
|
||||
NODE_EVENT,
|
||||
KERNEL_EVENT,
|
||||
EVENT_CATEGORY_MAX
|
||||
};
|
||||
|
||||
|
|
@ -63,7 +64,8 @@ Event descriptions for the above session events.
|
|||
*/
|
||||
static constexpr const char* event_categor_names_[EVENT_CATEGORY_MAX] = {
|
||||
"Session",
|
||||
"Node"};
|
||||
"Node",
|
||||
"Kernel"};
|
||||
|
||||
/*
|
||||
Timing record for all events.
|
||||
|
|
|
|||
|
|
@ -2,11 +2,148 @@
|
|||
// Licensed under the MIT License.
|
||||
|
||||
#include "profiler.h"
|
||||
#include <cmath>
|
||||
|
||||
#ifdef USE_CUDA
|
||||
#include <cupti.h>
|
||||
#endif
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace profiling {
|
||||
using namespace std::chrono;
|
||||
|
||||
class DeviceProfiler {
|
||||
public:
|
||||
static DeviceProfiler* GetDeviceProfiler();
|
||||
virtual void StartProfiling(TimePoint start_time, int pid, int tid) = 0;
|
||||
virtual std::vector<EventRecord> EndProfiling() = 0;
|
||||
virtual ~DeviceProfiler() = default;
|
||||
};
|
||||
|
||||
#ifdef USE_CUDA
|
||||
#define BUF_SIZE (32 * 1024)
|
||||
#define ALIGN_SIZE (8)
|
||||
#define ALIGN_BUFFER(buffer, align) \
|
||||
(((uintptr_t)(buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1))) : (buffer))
|
||||
#define DUR(s, e) std::lround(static_cast<double>(e - s) / 1000)
|
||||
|
||||
class CudaProfiler final: public DeviceProfiler {
|
||||
public:
|
||||
friend class DeviceProfiler;
|
||||
~CudaProfiler() = default;
|
||||
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CudaProfiler);
|
||||
void StartProfiling(TimePoint start_time, int pid, int tid) override;
|
||||
std::vector<EventRecord> EndProfiling() override;
|
||||
private:
|
||||
CudaProfiler() = default;
|
||||
static void CUPTIAPI BufferRequested(uint8_t**, size_t*, size_t*);
|
||||
static void CUPTIAPI BufferCompleted(CUcontext, uint32_t, uint8_t*, size_t, size_t);
|
||||
struct KernelStat {
|
||||
std::string name_ = {};
|
||||
uint32_t stream_ = 0;
|
||||
int32_t grid_x_ = 0;
|
||||
int32_t grid_y_ = 0;
|
||||
int32_t grid_z_ = 0;
|
||||
int32_t block_x_ = 0;
|
||||
int32_t block_y_ = 0;
|
||||
int32_t block_z_ = 0;
|
||||
int64_t start_ = 0;
|
||||
int64_t stop_ = 0;
|
||||
};
|
||||
static OrtMutex mutex_;
|
||||
static std::vector<KernelStat> stats_;
|
||||
bool initialized_ = false;
|
||||
TimePoint start_time_;
|
||||
int pid_ = 0;
|
||||
int tid_ = 0;
|
||||
static std::atomic_flag enabled_;
|
||||
};
|
||||
|
||||
OrtMutex CudaProfiler::mutex_;
|
||||
std::vector<CudaProfiler::KernelStat> CudaProfiler::stats_;
|
||||
std::atomic_flag CudaProfiler::enabled_;
|
||||
|
||||
void CUPTIAPI CudaProfiler::BufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords) {
|
||||
uint8_t* bfr = (uint8_t*)malloc(BUF_SIZE + ALIGN_SIZE);
|
||||
ORT_ENFORCE(bfr, "Failed to allocate memory for cuda kernel profiling.");
|
||||
*size = BUF_SIZE;
|
||||
*buffer = ALIGN_BUFFER(bfr, ALIGN_SIZE);
|
||||
*maxNumRecords = 0;
|
||||
}
|
||||
|
||||
void CUPTIAPI CudaProfiler::BufferCompleted(CUcontext, uint32_t, uint8_t* buffer, size_t, size_t validSize) {
|
||||
CUptiResult status;
|
||||
CUpti_Activity* record = NULL;
|
||||
if (validSize > 0) {
|
||||
std::unique_lock<OrtMutex> lock(mutex_);
|
||||
do {
|
||||
status = cuptiActivityGetNextRecord(buffer, validSize, &record);
|
||||
if (status == CUPTI_SUCCESS) {
|
||||
if (CUPTI_ACTIVITY_KIND_KERNEL == record->kind) {
|
||||
CUpti_ActivityKernel4* kernel = (CUpti_ActivityKernel4*)record;
|
||||
stats_.push_back({kernel->name, kernel->streamId,
|
||||
kernel->gridX, kernel->gridY, kernel->gridZ,
|
||||
kernel->blockX, kernel->blockY, kernel->blockZ,
|
||||
static_cast<int64_t>(kernel->start),
|
||||
static_cast<int64_t>(kernel->end)});
|
||||
}
|
||||
} else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
|
||||
break;
|
||||
}
|
||||
} while (1);
|
||||
}
|
||||
free(buffer);
|
||||
}
|
||||
|
||||
void CudaProfiler::StartProfiling(TimePoint start_time, int pid, int tid) {
|
||||
if (!enabled_.test_and_set()) {
|
||||
start_time_ = start_time;
|
||||
pid_ = pid;
|
||||
tid_ = tid;
|
||||
if (cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL) == CUPTI_SUCCESS &&
|
||||
cuptiActivityRegisterCallbacks(BufferRequested, BufferCompleted) == CUPTI_SUCCESS) {
|
||||
initialized_ = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<EventRecord> CudaProfiler::EndProfiling() {
|
||||
std::vector<EventRecord> events;
|
||||
if (enabled_.test_and_set()) {
|
||||
if (initialized_) {
|
||||
cuptiActivityFlushAll(1);
|
||||
std::unique_lock<OrtMutex> lock(mutex_);
|
||||
int64_t profiling_start = std::chrono::duration_cast<nanoseconds>(start_time_.time_since_epoch()).count();
|
||||
for (const auto& stat : stats_) {
|
||||
std::initializer_list<std::pair<std::string, std::string>> args = {{"stream", std::to_string(stat.stream_)},
|
||||
{"grid_x", std::to_string(stat.grid_x_)},
|
||||
{"grid_y", std::to_string(stat.grid_y_)},
|
||||
{"grid_z", std::to_string(stat.grid_z_)},
|
||||
{"block_x", std::to_string(stat.block_x_)},
|
||||
{"block_y", std::to_string(stat.block_y_)},
|
||||
{"block_z", std::to_string(stat.block_z_)}};
|
||||
events.push_back({EventCategory::KERNEL_EVENT, pid_, tid_, stat.name_, DUR(profiling_start, stat.stop_), DUR(stat.start_, stat.stop_), {args.begin(), args.end()}});
|
||||
}
|
||||
stats_.clear();
|
||||
} else {
|
||||
std::initializer_list<std::pair<std::string, std::string>> args;
|
||||
events.push_back({EventCategory::KERNEL_EVENT, pid_, tid_, "not_available_due_to_cupti_error", 0, 0, {args.begin(), args.end()}});
|
||||
}
|
||||
}
|
||||
enabled_.clear();
|
||||
return events;
|
||||
}
|
||||
#endif //USE_CUDA
|
||||
|
||||
DeviceProfiler* DeviceProfiler::GetDeviceProfiler() {
|
||||
#ifdef USE_CUDA
|
||||
static CudaProfiler cuda_profiler;
|
||||
return &cuda_profiler;
|
||||
#else
|
||||
return nullptr;
|
||||
#endif
|
||||
}
|
||||
|
||||
std::atomic<size_t> Profiler::global_max_num_events_{1000 * 1000};
|
||||
|
||||
#ifdef ENABLE_STATIC_PROFILER_INSTANCE
|
||||
|
|
@ -16,7 +153,8 @@ profiling::Profiler::~Profiler() {
|
|||
instance_ = nullptr;
|
||||
}
|
||||
#else
|
||||
profiling::Profiler::~Profiler() {}
|
||||
profiling::Profiler::~Profiler() {
|
||||
}
|
||||
#endif
|
||||
|
||||
::onnxruntime::TimePoint profiling::Profiler::StartTime() const {
|
||||
|
|
@ -43,6 +181,10 @@ void Profiler::StartProfiling(const logging::Logger* custom_logger) {
|
|||
profile_with_logger_ = true;
|
||||
custom_logger_ = custom_logger;
|
||||
profiling_start_time_ = StartTime();
|
||||
DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler();
|
||||
if (device_profiler) {
|
||||
device_profiler->StartProfiling(profiling_start_time_, logging::GetProcessId(), logging::GetThreadId());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
|
@ -51,6 +193,10 @@ void Profiler::StartProfiling(const std::basic_string<T>& file_name) {
|
|||
profile_stream_.open(file_name, std::ios::out | std::ios::trunc);
|
||||
profile_stream_file_ = ToMBString(file_name);
|
||||
profiling_start_time_ = StartTime();
|
||||
DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler();
|
||||
if (device_profiler) {
|
||||
device_profiler->StartProfiling(profiling_start_time_, logging::GetProcessId(), logging::GetThreadId());
|
||||
}
|
||||
}
|
||||
|
||||
template void Profiler::StartProfiling<char>(const std::basic_string<char>& file_name);
|
||||
|
|
@ -101,6 +247,12 @@ std::string Profiler::EndProfiling() {
|
|||
std::lock_guard<OrtMutex> lock(mutex_);
|
||||
profile_stream_ << "[\n";
|
||||
|
||||
DeviceProfiler* device_profiler = DeviceProfiler::GetDeviceProfiler();
|
||||
if (device_profiler) {
|
||||
std::vector<EventRecord> device_events = device_profiler->EndProfiling();
|
||||
std::copy(device_events.begin(), device_events.end(), std::back_inserter(events_));
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < events_.size(); ++i) {
|
||||
auto& rec = events_[i];
|
||||
profile_stream_ << R"({"cat" : ")" << event_categor_names_[rec.cat] << "\",";
|
||||
|
|
|
|||
|
|
@ -590,6 +590,11 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) {
|
|||
so.profile_file_prefix = ORT_TSTR("onnxprofile_profile_test");
|
||||
|
||||
InferenceSession session_object(so, GetEnvironment());
|
||||
#ifdef USE_CUDA
|
||||
CUDAExecutionProviderInfo epi;
|
||||
epi.device_id = 0;
|
||||
EXPECT_TRUE(session_object.RegisterExecutionProvider(onnxruntime::make_unique<CUDAExecutionProvider>(epi)).IsOK());
|
||||
#endif
|
||||
ASSERT_STATUS_OK(session_object.Load(MODEL_URI));
|
||||
ASSERT_STATUS_OK(session_object.Initialize());
|
||||
|
||||
|
|
@ -602,25 +607,29 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) {
|
|||
std::ifstream profile(profile_file);
|
||||
ASSERT_TRUE(profile);
|
||||
std::string line;
|
||||
std::vector<std::string> lines;
|
||||
|
||||
std::vector<std::string> tags = {"pid", "dur", "ts", "ph", "X", "name", "args"};
|
||||
int count = 0;
|
||||
while (std::getline(profile, line)) {
|
||||
if (count == 0) {
|
||||
ASSERT_TRUE(line.find("[") != string::npos);
|
||||
} else if (count <= 7) {
|
||||
for (auto& s : tags) {
|
||||
ASSERT_TRUE(line.find(s) != string::npos);
|
||||
}
|
||||
} else {
|
||||
ASSERT_TRUE(line.find("]") != string::npos);
|
||||
}
|
||||
|
||||
if (count == 1) {
|
||||
ASSERT_TRUE(line.find("model_loading_uri") != string::npos);
|
||||
}
|
||||
count++;
|
||||
lines.push_back(line);
|
||||
}
|
||||
|
||||
auto size = lines.size();
|
||||
ASSERT_TRUE(size > 1);
|
||||
ASSERT_TRUE(lines[0].find("[") != string::npos);
|
||||
ASSERT_TRUE(lines[1].find("model_loading_uri") != string::npos);
|
||||
ASSERT_TRUE(lines[size-1].find("]") != string::npos);
|
||||
std::vector<std::string> tags = {"pid", "dur", "ts", "ph", "X", "name", "args"};
|
||||
|
||||
bool has_kernel_info = false;
|
||||
for (size_t i = 1; i < size - 1; ++i) {
|
||||
for (auto& s : tags) {
|
||||
ASSERT_TRUE(lines[i].find(s) != string::npos);
|
||||
has_kernel_info = has_kernel_info || (lines[i].find("Kernel") != string::npos);
|
||||
}
|
||||
}
|
||||
#ifdef USE_CUDA
|
||||
ASSERT_TRUE(has_kernel_info);
|
||||
#endif
|
||||
}
|
||||
|
||||
TEST(InferenceSessionTests, CheckRunProfilerWithStartProfile) {
|
||||
|
|
@ -641,24 +650,23 @@ TEST(InferenceSessionTests, CheckRunProfilerWithStartProfile) {
|
|||
|
||||
std::ifstream profile(profile_file);
|
||||
std::string line;
|
||||
std::vector<std::string> lines;
|
||||
|
||||
std::vector<std::string> tags = {"pid", "dur", "ts", "ph", "X", "name", "args"};
|
||||
int count = 0;
|
||||
while (std::getline(profile, line)) {
|
||||
if (count == 0) {
|
||||
ASSERT_TRUE(line.find("[") != string::npos);
|
||||
} else if (count <= 5) {
|
||||
for (auto& s : tags) {
|
||||
ASSERT_TRUE(line.find(s) != string::npos);
|
||||
}
|
||||
} else {
|
||||
ASSERT_TRUE(line.find("]") != string::npos);
|
||||
}
|
||||
lines.push_back(line);
|
||||
}
|
||||
|
||||
if (count == 1) {
|
||||
ASSERT_TRUE(line.find("mul_1_fence_before") != string::npos);
|
||||
auto size = lines.size();
|
||||
ASSERT_TRUE(size > 1);
|
||||
ASSERT_TRUE(lines[0].find("[") != string::npos);
|
||||
ASSERT_TRUE(lines[1].find("mul_1_fence_before") != string::npos);
|
||||
ASSERT_TRUE(lines[size - 1].find("]") != string::npos);
|
||||
std::vector<std::string> tags = {"pid", "dur", "ts", "ph", "X", "name", "args"};
|
||||
|
||||
for (size_t i = 1; i < size - 1; ++i) {
|
||||
for (auto& s : tags) {
|
||||
ASSERT_TRUE(lines[i].find(s) != string::npos);
|
||||
}
|
||||
count++;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -535,11 +535,13 @@ class TestInferenceSession(unittest.TestCase):
|
|||
tags = ['pid', 'dur', 'ts', 'ph', 'X', 'name', 'args']
|
||||
with open(profile_file) as f:
|
||||
lines = f.readlines()
|
||||
lines_len = len(lines)
|
||||
self.assertTrue(lines_len > 8)
|
||||
self.assertTrue('[' in lines[0])
|
||||
for i in range(1, 8):
|
||||
for i in range(1, lines_len-1):
|
||||
for tag in tags:
|
||||
self.assertTrue(tag in lines[i])
|
||||
self.assertTrue(']' in lines[8])
|
||||
self.assertTrue(']' in lines[-1])
|
||||
|
||||
def testProfilerGetStartTimeNs(self):
|
||||
def getSingleSessionProfilingStartTime():
|
||||
|
|
|
|||
|
|
@ -1,2 +1,2 @@
|
|||
set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin;C:\local\cudnn-10.2-windows10-x64-v8.0.3.33\cuda\bin;%PATH%
|
||||
set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\extras\CUPTI\lib64;C:\local\cudnn-10.2-windows10-x64-v8.0.3.33\cuda\bin;%PATH%
|
||||
set GRADLE_OPTS=-Dorg.gradle.daemon=false
|
||||
|
|
|
|||
|
|
@ -1,2 +1,2 @@
|
|||
set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin;C:\local\cudnn-11.0-windows-x64-v8.0.2.39\cuda\bin;%PATH%
|
||||
set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\extras\CUPTI\lib64;C:\local\cudnn-11.0-windows-x64-v8.0.2.39\cuda\bin;%PATH%
|
||||
set GRADLE_OPTS=-Dorg.gradle.daemon=false
|
||||
|
|
|
|||
|
|
@ -1,2 +1,2 @@
|
|||
set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin;C:\local\cudnn-11.1-windows-x64-v8.0.5.39\cuda\bin;%PATH%
|
||||
set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\extras\CUPTI\lib64;C:\local\cudnn-11.1-windows-x64-v8.0.5.39\cuda\bin;%PATH%
|
||||
set GRADLE_OPTS=-Dorg.gradle.daemon=false
|
||||
|
|
|
|||
Loading…
Reference in a new issue