diff --git a/onnxruntime/python/tools/kernel_explorer/CMakeLists.txt b/onnxruntime/python/tools/kernel_explorer/CMakeLists.txt
new file mode 100644
index 0000000000..7ede7cbe38
--- /dev/null
+++ b/onnxruntime/python/tools/kernel_explorer/CMakeLists.txt
@@ -0,0 +1,33 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+project(kernel_explorer)
+cmake_minimum_required(VERSION 3.21)
+
+include(../../../../cmake/external/pybind11.cmake)
+
+# Memory error with Release build (-o3 -DNDEBUG), AMD is currently investigating
+set(CMAKE_BUILD_TYPE Debug)
+set(CMAKE_CXX_COMPILER /opt/rocm/bin/hipcc)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fpic")
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+list(APPEND CMAKE_PREFIX_PATH ${onnxruntime_ROCM_HOME} ${onnxruntime_ROCM_HOME}/hip)
+
+find_package(hip)
+find_package(PythonLibs 3.8 EXACT REQUIRED)
+
+include_directories(${PYTHON_INCLUDE_DIRS})
+include_directories(${pybind11_INCLUDE_DIRS})
+include_directories(.)
+
+FILE(GLOB kernel_srcs kernels/*.cpp)
+add_library(kernel_explorer SHARED kernel_explorer.cpp timer.cpp ${kernel_srcs})
+target_link_libraries(kernel_explorer ${PYTHON_LIBRARIES})
+set_target_properties(kernel_explorer PROPERTIES PREFIX "")
+
+enable_testing()
+find_package(Python COMPONENTS Interpreter REQUIRED)
+add_test(NAME test_kernels COMMAND ${Python_EXECUTABLE} -m pytest ..)
diff --git a/onnxruntime/python/tools/kernel_explorer/device_array.h b/onnxruntime/python/tools/kernel_explorer/device_array.h
new file mode 100644
index 0000000000..003d5e48e4
--- /dev/null
+++ b/onnxruntime/python/tools/kernel_explorer/device_array.h
@@ -0,0 +1,45 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "hip/hip_runtime.h"
+
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+
+namespace py = pybind11;
+
+#define HIP_ASSERT(x) (assert((x)==hipSuccess))
+
+class DeviceArray {
+ public:
+  DeviceArray(py::array x) {
+    py::buffer_info buf = x.request();
+    size_ = buf.size;
+    itemsize_ = buf.itemsize;
+    HIP_ASSERT(hipMalloc(&x_device_, size_ * itemsize_));
+    x_host_ = x.request().ptr;
+    HIP_ASSERT(hipMemcpy(x_device_, x_host_, size_ * itemsize_, hipMemcpyHostToDevice));
+  }
+  DeviceArray(const DeviceArray&) = delete;
+  DeviceArray& operator=(DeviceArray&) = delete;
+
+  void UpdateHostNumpyArray() {
+    HIP_ASSERT(hipMemcpy(x_host_, x_device_, size_ * itemsize_, hipMemcpyDeviceToHost));
+  }
+
+  void* ptr() const {
+    return x_device_;
+  }
+
+  ~DeviceArray() {
+    HIP_ASSERT(hipFree(x_device_));
+  }
+
+ private:
+  void* x_device_;
+  void* x_host_;
+  ssize_t size_;
+  ssize_t itemsize_;
+};
diff --git a/onnxruntime/python/tools/kernel_explorer/kernel_explorer.cpp b/onnxruntime/python/tools/kernel_explorer/kernel_explorer.cpp
new file mode 100644
index 0000000000..cad5e485f4
--- /dev/null
+++ b/onnxruntime/python/tools/kernel_explorer/kernel_explorer.cpp
@@ -0,0 +1,16 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include "device_array.h"
+#include "kernels/vector_add.h"
+
+namespace py = pybind11;
+
+PYBIND11_MODULE(kernel_explorer, m) {
+  py::class_<DeviceArray>(m, "DeviceArray")
+    .def(py::init<py::array>())
+    .def("UpdateHostNumpyArray", &DeviceArray::UpdateHostNumpyArray);
+  InitVectorAdd(m);
+}
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/vector_add.cpp b/onnxruntime/python/tools/kernel_explorer/kernels/vector_add.cpp
new file mode 100644
index 0000000000..d2e4a9048a
--- /dev/null
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/vector_add.cpp
@@ -0,0 +1,58 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include <pybind11/pybind11.h>
+#include "hip/hip_fp16.h"
+#include "vector_add_kernel.h"
+
+namespace py = pybind11;
+
+template <typename T, int ThreadsPerBlock, int VecSize>
+class VectorAdd: public Operator<T> {
+ public:
+  VectorAdd(DeviceArray& x, DeviceArray& y, DeviceArray& z, int n) :
+    x_(reinterpret_cast<T*>(x.ptr())),
+    y_(reinterpret_cast<T*>(y.ptr())),
+    z_(reinterpret_cast<T*>(z.ptr())),
+    n_(n),
+    Operator<T>() {}
+
+  void Run() {
+    LaunchVectorAdd<T, ThreadsPerBlock, VecSize>(x_, y_, z_, n_);
+  }
+
+ private:
+  T* x_;
+  T* y_;
+  T* z_;
+  int n_;
+};
+
+#define REGISTER_OP(name, type, threads_per_block, vec_size)                                              \
+  py::class_<name<type, threads_per_block, vec_size>>(m, #name"_"#type"_"#threads_per_block"_"#vec_size)  \
+    .def(py::init<DeviceArray&, DeviceArray&, DeviceArray&, int>())                                       \
+    .def("SetRepeats", &name<type, threads_per_block, vec_size>::SetRepeats)                              \
+    .def("Profile", &name<type, threads_per_block, vec_size>::Profile)                                    \
+    .def("Run", &name<type, threads_per_block, vec_size>::Run);
+  
+#define REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, threads_per_block)  \
+  REGISTER_OP(name, type, threads_per_block, 1)               \
+  REGISTER_OP(name, type, threads_per_block, 2)               \
+  REGISTER_OP(name, type, threads_per_block, 4)               \
+  REGISTER_OP(name, type, threads_per_block, 8)
+
+#define REGISTER_OP_FOR_ALL_THREADS_PER_BLOCK(name, type)  \
+  REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 64)          \
+  REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 128)         \
+  REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 192)         \
+  REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 256)         \
+  REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 320)         \
+  REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 384)         \
+  REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 448)         \
+  REGISTER_OP_FOR_ALL_VEC_SIZE(name, type, 512)
+  
+
+void InitVectorAdd(py::module m) {
+  REGISTER_OP_FOR_ALL_THREADS_PER_BLOCK(VectorAdd, half);
+  REGISTER_OP_FOR_ALL_THREADS_PER_BLOCK(VectorAdd, float);
+}
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/vector_add.h b/onnxruntime/python/tools/kernel_explorer/kernels/vector_add.h
new file mode 100644
index 0000000000..1d957f6198
--- /dev/null
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/vector_add.h
@@ -0,0 +1,10 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+void InitVectorAdd(py::module m);
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/vector_add_kernel.h b/onnxruntime/python/tools/kernel_explorer/kernels/vector_add_kernel.h
new file mode 100644
index 0000000000..079342c53c
--- /dev/null
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/vector_add_kernel.h
@@ -0,0 +1,56 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "device_array.h"
+#include "operator.h"
+
+// aligned vector for vectorized load/store
+template<typename T, int VecSize>
+struct alignas(sizeof(T) * VecSize) AlignedVector {
+  T val[VecSize];
+};
+
+template <typename T, int VecSize>
+__global__ void VectorAddKernel(const T* __restrict__ x,
+                                  const T* __restrict__ y,
+                                  T* __restrict__ z, int n) {
+  int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+  using LoadT = AlignedVector<T, VecSize>;
+
+  if (VecSize * i + VecSize - 1 < n) {
+    T x_vec[VecSize];
+    LoadT* x_load = reinterpret_cast<LoadT*>(&x_vec);
+    *x_load = *reinterpret_cast<const LoadT*>(&x[VecSize * i]);
+
+    T y_vec[VecSize];
+    LoadT* y_load = reinterpret_cast<LoadT*>(&y_vec);
+    *y_load = *reinterpret_cast<const LoadT*>(&y[VecSize * i]);
+
+    T z_vec[VecSize];
+
+    #pragma unroll
+    for (int j = 0; j < VecSize; j++) {
+      z_vec[j] = x_vec[j] + y_vec[j];
+    }
+
+    *(reinterpret_cast<LoadT*>(&z[VecSize * i])) = *reinterpret_cast<LoadT*>(&z_vec[0]);
+  }
+
+  if (i == 0) {
+    int tail_size = n % VecSize;
+    for (int j = n - 1; j >= n - tail_size; j--) {
+      z[j] = x[j] + y[j];
+    }
+  }
+}
+
+template <typename T, int ThreadsPerBlock, int VecSize>
+void LaunchVectorAdd(const T* x, const T* y, T* z, int n) {
+  hipLaunchKernelGGL((VectorAddKernel<T, VecSize>), 
+                  dim3(ceil(float(n)/(float(ThreadsPerBlock)*VecSize))),
+                  dim3(ThreadsPerBlock),
+                  0, 0,
+                  x, y, z, n);
+}
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/vector_add_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/vector_add_test.py
new file mode 100644
index 0000000000..16a949b67a
--- /dev/null
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/vector_add_test.py
@@ -0,0 +1,85 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+import sys
+
+sys.path.append("../build")
+
+import kernel_explorer as ke
+import numpy as np
+import pytest
+
+
+def dtype_to_bytes(dtype):
+    type_map = {
+        "float16": 2,
+        "float32": 4,
+    }
+    return type_map[dtype]
+
+
+def dtype_to_funcs(dtype):
+    type_map = {
+        "float16": list(filter(lambda x: "VectorAdd_half" in x, dir(ke))),
+        "float32": list(filter(lambda x: "VectorAdd_float" in x, dir(ke))),
+    }
+    return type_map[dtype]
+
+
+@pytest.mark.skip(reason="called by test_vector_add_all_sizes")
+def test_vector_add(size, dtype, func):
+    np.random.seed(0)
+    x = np.random.rand(size).astype(dtype)
+    y = np.random.rand(size).astype(dtype)
+    z = np.random.rand(size).astype(dtype)
+
+    x_d = ke.DeviceArray(x)
+    y_d = ke.DeviceArray(y)
+    z_d = ke.DeviceArray(z)
+    f = getattr(ke, func)
+    va = f(x_d, y_d, z_d, size)
+    va.Run()
+    z_d.UpdateHostNumpyArray()
+
+    z_ref = x + y
+    np.testing.assert_allclose(z_ref, z)
+
+
+@pytest.mark.parametrize("size", [1, 3, 4, 16, 124, 125, 126, 127, 128, 129, 130, 131, 132, 1024])
+def test_vector_add_all_sizes(size):
+    dtypes = ["float16", "float32"]
+    for dtype in dtypes:
+        for f in dtype_to_funcs(dtype):
+            test_vector_add(size, dtype, f)
+
+
+def profile_vector_add_func(size, dtype, func):
+    np.random.seed(0)
+    x = np.random.rand(size).astype(dtype)
+    y = np.random.rand(size).astype(dtype)
+    z = np.random.rand(size).astype(dtype)
+
+    x_d = ke.DeviceArray(x)
+    y_d = ke.DeviceArray(y)
+    z_d = ke.DeviceArray(z)
+    f = getattr(ke, func)
+    va = f(x_d, y_d, z_d, size)
+    t = va.Profile()
+    print(dtype, size, f, f"{t*1000:.2f} us", f"{size*3*(dtype_to_bytes(dtype))*1e3/t/1e9:.2f} GB/s")
+
+
+def profile():
+    sizes = [10000, 100000, 1000000, 10000000]
+    dtypes = ["float16", "float32"]
+    for dt in dtypes:
+        for s in sizes:
+            for f in dtype_to_funcs(dt):
+                profile_vector_add_func(s, dt, f)
+            print()
+        print()
+
+
+if __name__ == "__main__":
+    profile()
diff --git a/onnxruntime/python/tools/kernel_explorer/operator.h b/onnxruntime/python/tools/kernel_explorer/operator.h
new file mode 100644
index 0000000000..2d27cd2714
--- /dev/null
+++ b/onnxruntime/python/tools/kernel_explorer/operator.h
@@ -0,0 +1,35 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "timer.h"
+
+template <typename T>
+class Operator {
+ public:
+  Operator() : repeats_(100) {}
+
+  virtual void Run() = 0;
+
+  void SetRepeats(int n) {
+    repeats_ = n;
+  }
+
+  float Profile() {
+    // warm up
+    for (int i = 0; i < 5; i++) {
+      Run();
+    }
+    Timer timer;
+    timer.Start();
+    for (int i = 0; i < repeats_; i++) {
+      Run();
+    }
+    timer.End();
+    return timer.time()/repeats_;
+  }
+
+ private:
+  int repeats_;
+};
diff --git a/onnxruntime/python/tools/kernel_explorer/timer.cpp b/onnxruntime/python/tools/kernel_explorer/timer.cpp
new file mode 100644
index 0000000000..1a38b6329d
--- /dev/null
+++ b/onnxruntime/python/tools/kernel_explorer/timer.cpp
@@ -0,0 +1,31 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "timer.h"
+
+Timer::Timer() {
+  hipEventCreate(&start_);
+  hipEventCreate(&end_);
+}
+
+void Timer::Start() {
+  hipDeviceSynchronize();
+  hipEventRecord(start_, nullptr);
+}
+
+void Timer::End() {
+  hipEventRecord(end_, nullptr);
+  hipEventSynchronize(end_);
+}
+
+float Timer::time() {
+  float time;
+  // time is in ms with a resolution of 1 us
+  hipEventElapsedTime(&time, start_, end_);
+  return time;
+}
+
+Timer::~Timer() {
+  hipEventDestroy(start_);
+  hipEventDestroy(end_);
+}
diff --git a/onnxruntime/python/tools/kernel_explorer/timer.h b/onnxruntime/python/tools/kernel_explorer/timer.h
new file mode 100644
index 0000000000..66bfb5511b
--- /dev/null
+++ b/onnxruntime/python/tools/kernel_explorer/timer.h
@@ -0,0 +1,18 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "hip/hip_runtime.h"
+
+class Timer {
+ public:
+  Timer();
+  void Start();
+  void End();
+  float time();
+  ~Timer();
+
+ private:
+  hipEvent_t start_, end_; 
+};