mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-29 23:06:41 +00:00
Add kernel explorer (#11779)
* Add kernel explorer, a tool to help develop, test, profile, and tune GPU kernels. * clean up with some formatting issues * rename MACRO * macro renaming * improve cmake code * fix python lint errors * fix python lint errors * fix python lint errors * delete white space suggested by lint
This commit is contained in:
parent
6bf6bac1fd
commit
f6d2b629a0
10 changed files with 387 additions and 0 deletions
33
onnxruntime/python/tools/kernel_explorer/CMakeLists.txt
Normal file
33
onnxruntime/python/tools/kernel_explorer/CMakeLists.txt
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
project(kernel_explorer)
|
||||
cmake_minimum_required(VERSION 3.21)
|
||||
|
||||
include(../../../../cmake/external/pybind11.cmake)
|
||||
|
||||
# Memory error with Release build (-o3 -DNDEBUG), AMD is currently investigating
|
||||
set(CMAKE_BUILD_TYPE Debug)
|
||||
set(CMAKE_CXX_COMPILER /opt/rocm/bin/hipcc)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fpic")
|
||||
set(CMAKE_CXX_STANDARD 14)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_CXX_EXTENSIONS OFF)
|
||||
|
||||
list(APPEND CMAKE_PREFIX_PATH ${onnxruntime_ROCM_HOME} ${onnxruntime_ROCM_HOME}/hip)
|
||||
|
||||
find_package(hip)
|
||||
find_package(PythonLibs 3.8 EXACT REQUIRED)
|
||||
|
||||
include_directories(${PYTHON_INCLUDE_DIRS})
|
||||
include_directories(${pybind11_INCLUDE_DIRS})
|
||||
include_directories(.)
|
||||
|
||||
FILE(GLOB kernel_srcs kernels/*.cpp)
|
||||
add_library(kernel_explorer SHARED kernel_explorer.cpp timer.cpp ${kernel_srcs})
|
||||
target_link_libraries(kernel_explorer ${PYTHON_LIBRARIES})
|
||||
set_target_properties(kernel_explorer PROPERTIES PREFIX "")
|
||||
|
||||
enable_testing()
|
||||
find_package(Python COMPONENTS Interpreter REQUIRED)
|
||||
add_test(NAME test_kernels COMMAND ${Python_EXECUTABLE} -m pytest ..)
|
||||
45
onnxruntime/python/tools/kernel_explorer/device_array.h
Normal file
45
onnxruntime/python/tools/kernel_explorer/device_array.h
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
|
||||
#include <pybind11/pybind11.h>
|
||||
#include <pybind11/numpy.h>
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
#define HIP_ASSERT(x) (assert((x)==hipSuccess))
|
||||
|
||||
class DeviceArray {
|
||||
public:
|
||||
DeviceArray(py::array x) {
|
||||
py::buffer_info buf = x.request();
|
||||
size_ = buf.size;
|
||||
itemsize_ = buf.itemsize;
|
||||
HIP_ASSERT(hipMalloc(&x_device_, size_ * itemsize_));
|
||||
x_host_ = x.request().ptr;
|
||||
HIP_ASSERT(hipMemcpy(x_device_, x_host_, size_ * itemsize_, hipMemcpyHostToDevice));
|
||||
}
|
||||
DeviceArray(const DeviceArray&) = delete;
|
||||
DeviceArray& operator=(DeviceArray&) = delete;
|
||||
|
||||
void UpdateHostNumpyArray() {
|
||||
HIP_ASSERT(hipMemcpy(x_host_, x_device_, size_ * itemsize_, hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
void* ptr() const {
|
||||
return x_device_;
|
||||
}
|
||||
|
||||
~DeviceArray() {
|
||||
HIP_ASSERT(hipFree(x_device_));
|
||||
}
|
||||
|
||||
private:
|
||||
void* x_device_;
|
||||
void* x_host_;
|
||||
ssize_t size_;
|
||||
ssize_t itemsize_;
|
||||
};
|
||||
16
onnxruntime/python/tools/kernel_explorer/kernel_explorer.cpp
Normal file
16
onnxruntime/python/tools/kernel_explorer/kernel_explorer.cpp
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include <pybind11/pybind11.h>
|
||||
#include <pybind11/numpy.h>
|
||||
#include "device_array.h"
|
||||
#include "kernels/vector_add.h"
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
PYBIND11_MODULE(kernel_explorer, m) {
|
||||
py::class_<DeviceArray>(m, "DeviceArray")
|
||||
.def(py::init<py::array>())
|
||||
.def("UpdateHostNumpyArray", &DeviceArray::UpdateHostNumpyArray);
|
||||
InitVectorAdd(m);
|
||||
}
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include <pybind11/pybind11.h>
|
||||
#include "hip/hip_fp16.h"
|
||||
#include "vector_add_kernel.h"
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
template <typename T, int ThreadsPerBlock, int VecSize>
|
||||
class VectorAdd: public Operator<T> {
|
||||
public:
|
||||
VectorAdd(DeviceArray& x, DeviceArray& y, DeviceArray& z, int n) :
|
||||
x_(reinterpret_cast<T*>(x.ptr())),
|
||||
y_(reinterpret_cast<T*>(y.ptr())),
|
||||
z_(reinterpret_cast<T*>(z.ptr())),
|
||||
n_(n),
|
||||
Operator<T>() {}
|
||||
|
||||
void Run() {
|
||||
LaunchVectorAdd<T, ThreadsPerBlock, VecSize>(x_, y_, z_, n_);
|
||||
}
|
||||
|
||||
private:
|
||||
T* x_;
|
||||
T* y_;
|
||||
T* z_;
|
||||
int n_;
|
||||
};
|
||||
|
||||
#define REGISTER_OP(name, type, threads_per_block, vec_size) \
|
||||
py::class_<name<type, threads_per_block, vec_size>>(m, #name"_"#type"_"#threads_per_block"_"#vec_size) \
|
||||
.def(py::init<DeviceArray&, DeviceArray&, DeviceArray&, int>()) \
|
||||
.def("SetRepeats", &name<type, threads_per_block, vec_size>::SetRepeats) \
|
||||
.def("Profile", &name<type, threads_per_block, vec_size>::Profile) \
|
||||
.def("Run", &name<type, threads_per_block, vec_size>::Run);
|
||||
|
||||
#define REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, threads_per_block) \
|
||||
REGISTER_OP(name, type, threads_per_block, 1) \
|
||||
REGISTER_OP(name, type, threads_per_block, 2) \
|
||||
REGISTER_OP(name, type, threads_per_block, 4) \
|
||||
REGISTER_OP(name, type, threads_per_block, 8)
|
||||
|
||||
#define REGISTER_OP_FOR_ALL_THREADS_PER_BLOCK(name, type) \
|
||||
REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 64) \
|
||||
REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 128) \
|
||||
REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 192) \
|
||||
REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 256) \
|
||||
REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 320) \
|
||||
REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 384) \
|
||||
REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 448) \
|
||||
REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 512)
|
||||
|
||||
|
||||
void InitVectorAdd(py::module m) {
|
||||
REGISTER_OP_FOR_ALL_THREADS_PER_BLOCK(VectorAdd, half);
|
||||
REGISTER_OP_FOR_ALL_THREADS_PER_BLOCK(VectorAdd, float);
|
||||
}
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <pybind11/pybind11.h>
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
void InitVectorAdd(py::module m);
|
||||
|
|
@ -0,0 +1,56 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "device_array.h"
|
||||
#include "operator.h"
|
||||
|
||||
// aligned vector for vectorized load/store
|
||||
template<typename T, int VecSize>
|
||||
struct alignas(sizeof(T) * VecSize) AlignedVector {
|
||||
T val[VecSize];
|
||||
};
|
||||
|
||||
template <typename T, int VecSize>
|
||||
__global__ void VectorAddKernel(const T* __restrict__ x,
|
||||
const T* __restrict__ y,
|
||||
T* __restrict__ z, int n) {
|
||||
int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
using LoadT = AlignedVector<T, VecSize>;
|
||||
|
||||
if (VecSize * i + VecSize - 1 < n) {
|
||||
T x_vec[VecSize];
|
||||
LoadT* x_load = reinterpret_cast<LoadT*>(&x_vec);
|
||||
*x_load = *reinterpret_cast<const LoadT*>(&x[VecSize * i]);
|
||||
|
||||
T y_vec[VecSize];
|
||||
LoadT* y_load = reinterpret_cast<LoadT*>(&y_vec);
|
||||
*y_load = *reinterpret_cast<const LoadT*>(&y[VecSize * i]);
|
||||
|
||||
T z_vec[VecSize];
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < VecSize; j++) {
|
||||
z_vec[j] = x_vec[j] + y_vec[j];
|
||||
}
|
||||
|
||||
*(reinterpret_cast<LoadT*>(&z[VecSize * i])) = *reinterpret_cast<LoadT*>(&z_vec[0]);
|
||||
}
|
||||
|
||||
if (i == 0) {
|
||||
int tail_size = n % VecSize;
|
||||
for (int j = n - 1; j >= n - tail_size; j--) {
|
||||
z[j] = x[j] + y[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, int ThreadsPerBlock, int VecSize>
|
||||
void LaunchVectorAdd(const T* x, const T* y, T* z, int n) {
|
||||
hipLaunchKernelGGL((VectorAddKernel<T, VecSize>),
|
||||
dim3(ceil(float(n)/(float(ThreadsPerBlock)*VecSize))),
|
||||
dim3(ThreadsPerBlock),
|
||||
0, 0,
|
||||
x, y, z, n);
|
||||
}
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
import sys
|
||||
|
||||
sys.path.append("../build")
|
||||
|
||||
import kernel_explorer as ke
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
|
||||
def dtype_to_bytes(dtype):
|
||||
type_map = {
|
||||
"float16": 2,
|
||||
"float32": 4,
|
||||
}
|
||||
return type_map[dtype]
|
||||
|
||||
|
||||
def dtype_to_funcs(dtype):
|
||||
type_map = {
|
||||
"float16": list(filter(lambda x: "VectorAdd_half" in x, dir(ke))),
|
||||
"float32": list(filter(lambda x: "VectorAdd_float" in x, dir(ke))),
|
||||
}
|
||||
return type_map[dtype]
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="called by test_vector_add_all_sizes")
|
||||
def test_vector_add(size, dtype, func):
|
||||
np.random.seed(0)
|
||||
x = np.random.rand(size).astype(dtype)
|
||||
y = np.random.rand(size).astype(dtype)
|
||||
z = np.random.rand(size).astype(dtype)
|
||||
|
||||
x_d = ke.DeviceArray(x)
|
||||
y_d = ke.DeviceArray(y)
|
||||
z_d = ke.DeviceArray(z)
|
||||
f = getattr(ke, func)
|
||||
va = f(x_d, y_d, z_d, size)
|
||||
va.Run()
|
||||
z_d.UpdateHostNumpyArray()
|
||||
|
||||
z_ref = x + y
|
||||
np.testing.assert_allclose(z_ref, z)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("size", [1, 3, 4, 16, 124, 125, 126, 127, 128, 129, 130, 131, 132, 1024])
|
||||
def test_vector_add_all_sizes(size):
|
||||
dtypes = ["float16", "float32"]
|
||||
for dtype in dtypes:
|
||||
for f in dtype_to_funcs(dtype):
|
||||
test_vector_add(size, dtype, f)
|
||||
|
||||
|
||||
def profile_vector_add_func(size, dtype, func):
|
||||
np.random.seed(0)
|
||||
x = np.random.rand(size).astype(dtype)
|
||||
y = np.random.rand(size).astype(dtype)
|
||||
z = np.random.rand(size).astype(dtype)
|
||||
|
||||
x_d = ke.DeviceArray(x)
|
||||
y_d = ke.DeviceArray(y)
|
||||
z_d = ke.DeviceArray(z)
|
||||
f = getattr(ke, func)
|
||||
va = f(x_d, y_d, z_d, size)
|
||||
t = va.Profile()
|
||||
print(dtype, size, f, f"{t*1000:.2f} us", f"{size*3*(dtype_to_bytes(dtype))*1e3/t/1e9:.2f} GB/s")
|
||||
|
||||
|
||||
def profile():
|
||||
sizes = [10000, 100000, 1000000, 10000000]
|
||||
dtypes = ["float16", "float32"]
|
||||
for dt in dtypes:
|
||||
for s in sizes:
|
||||
for f in dtype_to_funcs(dt):
|
||||
profile_vector_add_func(s, dt, f)
|
||||
print()
|
||||
print()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
profile()
|
||||
35
onnxruntime/python/tools/kernel_explorer/operator.h
Normal file
35
onnxruntime/python/tools/kernel_explorer/operator.h
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "timer.h"
|
||||
|
||||
template <typename T>
|
||||
class Operator {
|
||||
public:
|
||||
Operator() : repeats_(100) {}
|
||||
|
||||
virtual void Run() = 0;
|
||||
|
||||
void SetRepeats(int n) {
|
||||
repeats_ = n;
|
||||
}
|
||||
|
||||
float Profile() {
|
||||
// warm up
|
||||
for (int i = 0; i < 5; i++) {
|
||||
Run();
|
||||
}
|
||||
Timer timer;
|
||||
timer.Start();
|
||||
for (int i = 0; i < repeats_; i++) {
|
||||
Run();
|
||||
}
|
||||
timer.End();
|
||||
return timer.time()/repeats_;
|
||||
}
|
||||
|
||||
private:
|
||||
int repeats_;
|
||||
};
|
||||
31
onnxruntime/python/tools/kernel_explorer/timer.cpp
Normal file
31
onnxruntime/python/tools/kernel_explorer/timer.cpp
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "timer.h"
|
||||
|
||||
Timer::Timer() {
|
||||
hipEventCreate(&start_);
|
||||
hipEventCreate(&end_);
|
||||
}
|
||||
|
||||
void Timer::Start() {
|
||||
hipDeviceSynchronize();
|
||||
hipEventRecord(start_, nullptr);
|
||||
}
|
||||
|
||||
void Timer::End() {
|
||||
hipEventRecord(end_, nullptr);
|
||||
hipEventSynchronize(end_);
|
||||
}
|
||||
|
||||
float Timer::time() {
|
||||
float time;
|
||||
// time is in ms with a resolution of 1 us
|
||||
hipEventElapsedTime(&time, start_, end_);
|
||||
return time;
|
||||
}
|
||||
|
||||
Timer::~Timer() {
|
||||
hipEventDestroy(start_);
|
||||
hipEventDestroy(end_);
|
||||
}
|
||||
18
onnxruntime/python/tools/kernel_explorer/timer.h
Normal file
18
onnxruntime/python/tools/kernel_explorer/timer.h
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
|
||||
class Timer {
|
||||
public:
|
||||
Timer();
|
||||
void Start();
|
||||
void End();
|
||||
float time();
|
||||
~Timer();
|
||||
|
||||
private:
|
||||
hipEvent_t start_, end_;
|
||||
};
|
||||
Loading…
Reference in a new issue