Add kernel explorer (#11779)

* Add kernel explorer, a tool to help develop, test, profile, and tune GPU kernels.

* clean up with some formatting issues

* rename MACRO

* macro renaming

* improve cmake code

* fix python lint errors

* fix python lint errors

* fix python lint errors

* delete white space suggested by lint
This commit is contained in:
zhangyaobit 2022-06-13 20:11:25 -07:00 committed by GitHub
parent 6bf6bac1fd
commit f6d2b629a0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 387 additions and 0 deletions

View file

@ -0,0 +1,33 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
project(kernel_explorer)
cmake_minimum_required(VERSION 3.21)
include(../../../../cmake/external/pybind11.cmake)
# Memory error with Release build (-o3 -DNDEBUG), AMD is currently investigating
set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_CXX_COMPILER /opt/rocm/bin/hipcc)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fpic")
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
list(APPEND CMAKE_PREFIX_PATH ${onnxruntime_ROCM_HOME} ${onnxruntime_ROCM_HOME}/hip)
find_package(hip)
find_package(PythonLibs 3.8 EXACT REQUIRED)
include_directories(${PYTHON_INCLUDE_DIRS})
include_directories(${pybind11_INCLUDE_DIRS})
include_directories(.)
FILE(GLOB kernel_srcs kernels/*.cpp)
add_library(kernel_explorer SHARED kernel_explorer.cpp timer.cpp ${kernel_srcs})
target_link_libraries(kernel_explorer ${PYTHON_LIBRARIES})
set_target_properties(kernel_explorer PROPERTIES PREFIX "")
enable_testing()
find_package(Python COMPONENTS Interpreter REQUIRED)
add_test(NAME test_kernels COMMAND ${Python_EXECUTABLE} -m pytest ..)

View file

@ -0,0 +1,45 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "hip/hip_runtime.h"
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
namespace py = pybind11;
#define HIP_ASSERT(x) (assert((x)==hipSuccess))
class DeviceArray {
public:
DeviceArray(py::array x) {
py::buffer_info buf = x.request();
size_ = buf.size;
itemsize_ = buf.itemsize;
HIP_ASSERT(hipMalloc(&x_device_, size_ * itemsize_));
x_host_ = x.request().ptr;
HIP_ASSERT(hipMemcpy(x_device_, x_host_, size_ * itemsize_, hipMemcpyHostToDevice));
}
DeviceArray(const DeviceArray&) = delete;
DeviceArray& operator=(DeviceArray&) = delete;
void UpdateHostNumpyArray() {
HIP_ASSERT(hipMemcpy(x_host_, x_device_, size_ * itemsize_, hipMemcpyDeviceToHost));
}
void* ptr() const {
return x_device_;
}
~DeviceArray() {
HIP_ASSERT(hipFree(x_device_));
}
private:
void* x_device_;
void* x_host_;
ssize_t size_;
ssize_t itemsize_;
};

View file

@ -0,0 +1,16 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include "device_array.h"
#include "kernels/vector_add.h"
namespace py = pybind11;
PYBIND11_MODULE(kernel_explorer, m) {
py::class_<DeviceArray>(m, "DeviceArray")
.def(py::init<py::array>())
.def("UpdateHostNumpyArray", &DeviceArray::UpdateHostNumpyArray);
InitVectorAdd(m);
}

View file

@ -0,0 +1,58 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include <pybind11/pybind11.h>
#include "hip/hip_fp16.h"
#include "vector_add_kernel.h"
namespace py = pybind11;
template <typename T, int ThreadsPerBlock, int VecSize>
class VectorAdd: public Operator<T> {
public:
VectorAdd(DeviceArray& x, DeviceArray& y, DeviceArray& z, int n) :
x_(reinterpret_cast<T*>(x.ptr())),
y_(reinterpret_cast<T*>(y.ptr())),
z_(reinterpret_cast<T*>(z.ptr())),
n_(n),
Operator<T>() {}
void Run() {
LaunchVectorAdd<T, ThreadsPerBlock, VecSize>(x_, y_, z_, n_);
}
private:
T* x_;
T* y_;
T* z_;
int n_;
};
#define REGISTER_OP(name, type, threads_per_block, vec_size) \
py::class_<name<type, threads_per_block, vec_size>>(m, #name"_"#type"_"#threads_per_block"_"#vec_size) \
.def(py::init<DeviceArray&, DeviceArray&, DeviceArray&, int>()) \
.def("SetRepeats", &name<type, threads_per_block, vec_size>::SetRepeats) \
.def("Profile", &name<type, threads_per_block, vec_size>::Profile) \
.def("Run", &name<type, threads_per_block, vec_size>::Run);
#define REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, threads_per_block) \
REGISTER_OP(name, type, threads_per_block, 1) \
REGISTER_OP(name, type, threads_per_block, 2) \
REGISTER_OP(name, type, threads_per_block, 4) \
REGISTER_OP(name, type, threads_per_block, 8)
#define REGISTER_OP_FOR_ALL_THREADS_PER_BLOCK(name, type) \
REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 64) \
REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 128) \
REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 192) \
REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 256) \
REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 320) \
REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 384) \
REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 448) \
REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 512)
void InitVectorAdd(py::module m) {
REGISTER_OP_FOR_ALL_THREADS_PER_BLOCK(VectorAdd, half);
REGISTER_OP_FOR_ALL_THREADS_PER_BLOCK(VectorAdd, float);
}

View file

@ -0,0 +1,10 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <pybind11/pybind11.h>
namespace py = pybind11;
void InitVectorAdd(py::module m);

View file

@ -0,0 +1,56 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "device_array.h"
#include "operator.h"
// aligned vector for vectorized load/store
template<typename T, int VecSize>
struct alignas(sizeof(T) * VecSize) AlignedVector {
T val[VecSize];
};
template <typename T, int VecSize>
__global__ void VectorAddKernel(const T* __restrict__ x,
const T* __restrict__ y,
T* __restrict__ z, int n) {
int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
using LoadT = AlignedVector<T, VecSize>;
if (VecSize * i + VecSize - 1 < n) {
T x_vec[VecSize];
LoadT* x_load = reinterpret_cast<LoadT*>(&x_vec);
*x_load = *reinterpret_cast<const LoadT*>(&x[VecSize * i]);
T y_vec[VecSize];
LoadT* y_load = reinterpret_cast<LoadT*>(&y_vec);
*y_load = *reinterpret_cast<const LoadT*>(&y[VecSize * i]);
T z_vec[VecSize];
#pragma unroll
for (int j = 0; j < VecSize; j++) {
z_vec[j] = x_vec[j] + y_vec[j];
}
*(reinterpret_cast<LoadT*>(&z[VecSize * i])) = *reinterpret_cast<LoadT*>(&z_vec[0]);
}
if (i == 0) {
int tail_size = n % VecSize;
for (int j = n - 1; j >= n - tail_size; j--) {
z[j] = x[j] + y[j];
}
}
}
template <typename T, int ThreadsPerBlock, int VecSize>
void LaunchVectorAdd(const T* x, const T* y, T* z, int n) {
hipLaunchKernelGGL((VectorAddKernel<T, VecSize>),
dim3(ceil(float(n)/(float(ThreadsPerBlock)*VecSize))),
dim3(ThreadsPerBlock),
0, 0,
x, y, z, n);
}

View file

@ -0,0 +1,85 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
import sys
sys.path.append("../build")
import kernel_explorer as ke
import numpy as np
import pytest
def dtype_to_bytes(dtype):
type_map = {
"float16": 2,
"float32": 4,
}
return type_map[dtype]
def dtype_to_funcs(dtype):
type_map = {
"float16": list(filter(lambda x: "VectorAdd_half" in x, dir(ke))),
"float32": list(filter(lambda x: "VectorAdd_float" in x, dir(ke))),
}
return type_map[dtype]
@pytest.mark.skip(reason="called by test_vector_add_all_sizes")
def test_vector_add(size, dtype, func):
np.random.seed(0)
x = np.random.rand(size).astype(dtype)
y = np.random.rand(size).astype(dtype)
z = np.random.rand(size).astype(dtype)
x_d = ke.DeviceArray(x)
y_d = ke.DeviceArray(y)
z_d = ke.DeviceArray(z)
f = getattr(ke, func)
va = f(x_d, y_d, z_d, size)
va.Run()
z_d.UpdateHostNumpyArray()
z_ref = x + y
np.testing.assert_allclose(z_ref, z)
@pytest.mark.parametrize("size", [1, 3, 4, 16, 124, 125, 126, 127, 128, 129, 130, 131, 132, 1024])
def test_vector_add_all_sizes(size):
dtypes = ["float16", "float32"]
for dtype in dtypes:
for f in dtype_to_funcs(dtype):
test_vector_add(size, dtype, f)
def profile_vector_add_func(size, dtype, func):
np.random.seed(0)
x = np.random.rand(size).astype(dtype)
y = np.random.rand(size).astype(dtype)
z = np.random.rand(size).astype(dtype)
x_d = ke.DeviceArray(x)
y_d = ke.DeviceArray(y)
z_d = ke.DeviceArray(z)
f = getattr(ke, func)
va = f(x_d, y_d, z_d, size)
t = va.Profile()
print(dtype, size, f, f"{t*1000:.2f} us", f"{size*3*(dtype_to_bytes(dtype))*1e3/t/1e9:.2f} GB/s")
def profile():
sizes = [10000, 100000, 1000000, 10000000]
dtypes = ["float16", "float32"]
for dt in dtypes:
for s in sizes:
for f in dtype_to_funcs(dt):
profile_vector_add_func(s, dt, f)
print()
print()
if __name__ == "__main__":
profile()

View file

@ -0,0 +1,35 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "timer.h"
template <typename T>
class Operator {
public:
Operator() : repeats_(100) {}
virtual void Run() = 0;
void SetRepeats(int n) {
repeats_ = n;
}
float Profile() {
// warm up
for (int i = 0; i < 5; i++) {
Run();
}
Timer timer;
timer.Start();
for (int i = 0; i < repeats_; i++) {
Run();
}
timer.End();
return timer.time()/repeats_;
}
private:
int repeats_;
};

View file

@ -0,0 +1,31 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "timer.h"
Timer::Timer() {
hipEventCreate(&start_);
hipEventCreate(&end_);
}
void Timer::Start() {
hipDeviceSynchronize();
hipEventRecord(start_, nullptr);
}
void Timer::End() {
hipEventRecord(end_, nullptr);
hipEventSynchronize(end_);
}
float Timer::time() {
float time;
// time is in ms with a resolution of 1 us
hipEventElapsedTime(&time, start_, end_);
return time;
}
Timer::~Timer() {
hipEventDestroy(start_);
hipEventDestroy(end_);
}

View file

@ -0,0 +1,18 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "hip/hip_runtime.h"
class Timer {
public:
Timer();
void Start();
void End();
float time();
~Timer();
private:
hipEvent_t start_, end_;
};