diff --git a/onnxruntime/python/tools/kernel_explorer/CMakeLists.txt b/onnxruntime/python/tools/kernel_explorer/CMakeLists.txt new file mode 100644 index 0000000000..7ede7cbe38 --- /dev/null +++ b/onnxruntime/python/tools/kernel_explorer/CMakeLists.txt @@ -0,0 +1,33 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +project(kernel_explorer) +cmake_minimum_required(VERSION 3.21) + +include(../../../../cmake/external/pybind11.cmake) + +# Memory error with Release build (-o3 -DNDEBUG), AMD is currently investigating +set(CMAKE_BUILD_TYPE Debug) +set(CMAKE_CXX_COMPILER /opt/rocm/bin/hipcc) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fpic") +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +list(APPEND CMAKE_PREFIX_PATH ${onnxruntime_ROCM_HOME} ${onnxruntime_ROCM_HOME}/hip) + +find_package(hip) +find_package(PythonLibs 3.8 EXACT REQUIRED) + +include_directories(${PYTHON_INCLUDE_DIRS}) +include_directories(${pybind11_INCLUDE_DIRS}) +include_directories(.) + +FILE(GLOB kernel_srcs kernels/*.cpp) +add_library(kernel_explorer SHARED kernel_explorer.cpp timer.cpp ${kernel_srcs}) +target_link_libraries(kernel_explorer ${PYTHON_LIBRARIES}) +set_target_properties(kernel_explorer PROPERTIES PREFIX "") + +enable_testing() +find_package(Python COMPONENTS Interpreter REQUIRED) +add_test(NAME test_kernels COMMAND ${Python_EXECUTABLE} -m pytest ..) diff --git a/onnxruntime/python/tools/kernel_explorer/device_array.h b/onnxruntime/python/tools/kernel_explorer/device_array.h new file mode 100644 index 0000000000..003d5e48e4 --- /dev/null +++ b/onnxruntime/python/tools/kernel_explorer/device_array.h @@ -0,0 +1,45 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "hip/hip_runtime.h" + +#include +#include + +namespace py = pybind11; + +#define HIP_ASSERT(x) (assert((x)==hipSuccess)) + +class DeviceArray { + public: + DeviceArray(py::array x) { + py::buffer_info buf = x.request(); + size_ = buf.size; + itemsize_ = buf.itemsize; + HIP_ASSERT(hipMalloc(&x_device_, size_ * itemsize_)); + x_host_ = x.request().ptr; + HIP_ASSERT(hipMemcpy(x_device_, x_host_, size_ * itemsize_, hipMemcpyHostToDevice)); + } + DeviceArray(const DeviceArray&) = delete; + DeviceArray& operator=(DeviceArray&) = delete; + + void UpdateHostNumpyArray() { + HIP_ASSERT(hipMemcpy(x_host_, x_device_, size_ * itemsize_, hipMemcpyDeviceToHost)); + } + + void* ptr() const { + return x_device_; + } + + ~DeviceArray() { + HIP_ASSERT(hipFree(x_device_)); + } + + private: + void* x_device_; + void* x_host_; + ssize_t size_; + ssize_t itemsize_; +}; diff --git a/onnxruntime/python/tools/kernel_explorer/kernel_explorer.cpp b/onnxruntime/python/tools/kernel_explorer/kernel_explorer.cpp new file mode 100644 index 0000000000..cad5e485f4 --- /dev/null +++ b/onnxruntime/python/tools/kernel_explorer/kernel_explorer.cpp @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include +#include "device_array.h" +#include "kernels/vector_add.h" + +namespace py = pybind11; + +PYBIND11_MODULE(kernel_explorer, m) { + py::class_(m, "DeviceArray") + .def(py::init()) + .def("UpdateHostNumpyArray", &DeviceArray::UpdateHostNumpyArray); + InitVectorAdd(m); +} diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/vector_add.cpp b/onnxruntime/python/tools/kernel_explorer/kernels/vector_add.cpp new file mode 100644 index 0000000000..d2e4a9048a --- /dev/null +++ b/onnxruntime/python/tools/kernel_explorer/kernels/vector_add.cpp @@ -0,0 +1,58 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include "hip/hip_fp16.h" +#include "vector_add_kernel.h" + +namespace py = pybind11; + +template +class VectorAdd: public Operator { + public: + VectorAdd(DeviceArray& x, DeviceArray& y, DeviceArray& z, int n) : + x_(reinterpret_cast(x.ptr())), + y_(reinterpret_cast(y.ptr())), + z_(reinterpret_cast(z.ptr())), + n_(n), + Operator() {} + + void Run() { + LaunchVectorAdd(x_, y_, z_, n_); + } + + private: + T* x_; + T* y_; + T* z_; + int n_; +}; + +#define REGISTER_OP(name, type, threads_per_block, vec_size) \ + py::class_>(m, #name"_"#type"_"#threads_per_block"_"#vec_size) \ + .def(py::init()) \ + .def("SetRepeats", &name::SetRepeats) \ + .def("Profile", &name::Profile) \ + .def("Run", &name::Run); + +#define REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, threads_per_block) \ + REGISTER_OP(name, type, threads_per_block, 1) \ + REGISTER_OP(name, type, threads_per_block, 2) \ + REGISTER_OP(name, type, threads_per_block, 4) \ + REGISTER_OP(name, type, threads_per_block, 8) + +#define REGISTER_OP_FOR_ALL_THREADS_PER_BLOCK(name, type) \ + REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 64) \ + REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 128) \ + REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 192) \ + REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 256) \ + REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 320) \ + REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 384) \ + REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 448) \ + REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 512) + + +void InitVectorAdd(py::module m) { + REGISTER_OP_FOR_ALL_THREADS_PER_BLOCK(VectorAdd, half); + REGISTER_OP_FOR_ALL_THREADS_PER_BLOCK(VectorAdd, float); +} diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/vector_add.h b/onnxruntime/python/tools/kernel_explorer/kernels/vector_add.h new file mode 100644 index 0000000000..1d957f6198 --- /dev/null +++ b/onnxruntime/python/tools/kernel_explorer/kernels/vector_add.h @@ -0,0 +1,10 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include + +namespace py = pybind11; + +void InitVectorAdd(py::module m); diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/vector_add_kernel.h b/onnxruntime/python/tools/kernel_explorer/kernels/vector_add_kernel.h new file mode 100644 index 0000000000..079342c53c --- /dev/null +++ b/onnxruntime/python/tools/kernel_explorer/kernels/vector_add_kernel.h @@ -0,0 +1,56 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "device_array.h" +#include "operator.h" + +// aligned vector for vectorized load/store +template +struct alignas(sizeof(T) * VecSize) AlignedVector { + T val[VecSize]; +}; + +template +__global__ void VectorAddKernel(const T* __restrict__ x, + const T* __restrict__ y, + T* __restrict__ z, int n) { + int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + using LoadT = AlignedVector; + + if (VecSize * i + VecSize - 1 < n) { + T x_vec[VecSize]; + LoadT* x_load = reinterpret_cast(&x_vec); + *x_load = *reinterpret_cast(&x[VecSize * i]); + + T y_vec[VecSize]; + LoadT* y_load = reinterpret_cast(&y_vec); + *y_load = *reinterpret_cast(&y[VecSize * i]); + + T z_vec[VecSize]; + + #pragma unroll + for (int j = 0; j < VecSize; j++) { + z_vec[j] = x_vec[j] + y_vec[j]; + } + + *(reinterpret_cast(&z[VecSize * i])) = *reinterpret_cast(&z_vec[0]); + } + + if (i == 0) { + int tail_size = n % VecSize; + for (int j = n - 1; j >= n - tail_size; j--) { + z[j] = x[j] + y[j]; + } + } +} + +template +void LaunchVectorAdd(const T* x, const T* y, T* z, int n) { + hipLaunchKernelGGL((VectorAddKernel), + dim3(ceil(float(n)/(float(ThreadsPerBlock)*VecSize))), + dim3(ThreadsPerBlock), + 0, 0, + x, y, z, n); +} diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/vector_add_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/vector_add_test.py new file mode 100644 index 0000000000..16a949b67a --- /dev/null +++ b/onnxruntime/python/tools/kernel_explorer/kernels/vector_add_test.py @@ -0,0 +1,85 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +import sys + +sys.path.append("../build") + +import kernel_explorer as ke +import numpy as np +import pytest + + +def dtype_to_bytes(dtype): + type_map = { + "float16": 2, + "float32": 4, + } + return type_map[dtype] + + +def dtype_to_funcs(dtype): + type_map = { + "float16": list(filter(lambda x: "VectorAdd_half" in x, dir(ke))), + "float32": list(filter(lambda x: "VectorAdd_float" in x, dir(ke))), + } + return type_map[dtype] + + +@pytest.mark.skip(reason="called by test_vector_add_all_sizes") +def test_vector_add(size, dtype, func): + np.random.seed(0) + x = np.random.rand(size).astype(dtype) + y = np.random.rand(size).astype(dtype) + z = np.random.rand(size).astype(dtype) + + x_d = ke.DeviceArray(x) + y_d = ke.DeviceArray(y) + z_d = ke.DeviceArray(z) + f = getattr(ke, func) + va = f(x_d, y_d, z_d, size) + va.Run() + z_d.UpdateHostNumpyArray() + + z_ref = x + y + np.testing.assert_allclose(z_ref, z) + + +@pytest.mark.parametrize("size", [1, 3, 4, 16, 124, 125, 126, 127, 128, 129, 130, 131, 132, 1024]) +def test_vector_add_all_sizes(size): + dtypes = ["float16", "float32"] + for dtype in dtypes: + for f in dtype_to_funcs(dtype): + test_vector_add(size, dtype, f) + + +def profile_vector_add_func(size, dtype, func): + np.random.seed(0) + x = np.random.rand(size).astype(dtype) + y = np.random.rand(size).astype(dtype) + z = np.random.rand(size).astype(dtype) + + x_d = ke.DeviceArray(x) + y_d = ke.DeviceArray(y) + z_d = ke.DeviceArray(z) + f = getattr(ke, func) + va = f(x_d, y_d, z_d, size) + t = va.Profile() + print(dtype, size, f, f"{t*1000:.2f} us", f"{size*3*(dtype_to_bytes(dtype))*1e3/t/1e9:.2f} GB/s") + + +def profile(): + sizes = [10000, 100000, 1000000, 10000000] + dtypes = ["float16", "float32"] + for dt in dtypes: + for s in sizes: + for f in dtype_to_funcs(dt): + profile_vector_add_func(s, dt, f) + print() + print() + + +if __name__ == "__main__": + profile() diff --git a/onnxruntime/python/tools/kernel_explorer/operator.h b/onnxruntime/python/tools/kernel_explorer/operator.h new file mode 100644 index 0000000000..2d27cd2714 --- /dev/null +++ b/onnxruntime/python/tools/kernel_explorer/operator.h @@ -0,0 +1,35 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "timer.h" + +template +class Operator { + public: + Operator() : repeats_(100) {} + + virtual void Run() = 0; + + void SetRepeats(int n) { + repeats_ = n; + } + + float Profile() { + // warm up + for (int i = 0; i < 5; i++) { + Run(); + } + Timer timer; + timer.Start(); + for (int i = 0; i < repeats_; i++) { + Run(); + } + timer.End(); + return timer.time()/repeats_; + } + + private: + int repeats_; +}; diff --git a/onnxruntime/python/tools/kernel_explorer/timer.cpp b/onnxruntime/python/tools/kernel_explorer/timer.cpp new file mode 100644 index 0000000000..1a38b6329d --- /dev/null +++ b/onnxruntime/python/tools/kernel_explorer/timer.cpp @@ -0,0 +1,31 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "timer.h" + +Timer::Timer() { + hipEventCreate(&start_); + hipEventCreate(&end_); +} + +void Timer::Start() { + hipDeviceSynchronize(); + hipEventRecord(start_, nullptr); +} + +void Timer::End() { + hipEventRecord(end_, nullptr); + hipEventSynchronize(end_); +} + +float Timer::time() { + float time; + // time is in ms with a resolution of 1 us + hipEventElapsedTime(&time, start_, end_); + return time; +} + +Timer::~Timer() { + hipEventDestroy(start_); + hipEventDestroy(end_); +} diff --git a/onnxruntime/python/tools/kernel_explorer/timer.h b/onnxruntime/python/tools/kernel_explorer/timer.h new file mode 100644 index 0000000000..66bfb5511b --- /dev/null +++ b/onnxruntime/python/tools/kernel_explorer/timer.h @@ -0,0 +1,18 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "hip/hip_runtime.h" + +class Timer { + public: + Timer(); + void Start(); + void End(); + float time(); + ~Timer(); + + private: + hipEvent_t start_, end_; +};