# onnxruntime/cmake/onnxruntime_kernel_explorer.cmake

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

include(CheckLanguage)

# Python support is a hard requirement for this file: abort configuration
# immediately when onnxruntime_ENABLE_PYTHON is off.
if(NOT onnxruntime_ENABLE_PYTHON)
  message(
    FATAL_ERROR
    "python is required but is not enabled"
  )
endif()

# Directory that holds the kernel explorer sources.
set(KERNEL_EXPLORER_ROOT "${ONNXRUNTIME_ROOT}/python/tools/kernel_explorer")

# Pick the GPU language (CUDA or HIP) and the matching contrib_ops BERT
# directory. When neither backend is enabled, LANGUAGE and BERT_DIR are left
# untouched.
if(onnxruntime_USE_CUDA)
  # check_language() only probes for a usable compiler; it does not enable
  # the language.
  check_language(CUDA)
  set(LANGUAGE CUDA)
  set(BERT_DIR "${ONNXRUNTIME_ROOT}/contrib_ops/cuda/bert")
elseif(onnxruntime_USE_ROCM)
  check_language(HIP)
  set(LANGUAGE HIP)

  # Optional ROCm-only dependencies, pulled in only when requested.
  if(onnxruntime_USE_COMPOSABLE_KERNEL)
    include(composable_kernel)
  endif()
  if(onnxruntime_USE_HIPBLASLT)
    find_package(hipblaslt REQUIRED)
  endif()

  set(BERT_DIR "${ONNXRUNTIME_ROOT}/contrib_ops/rocm/bert")
endif()

# Sources and headers located directly in the kernel explorer root.
# CONFIGURE_DEPENDS asks the build system to re-check the glob on each build.
file(
  GLOB kernel_explorer_srcs
  CONFIGURE_DEPENDS
    "${KERNEL_EXPLORER_ROOT}/*.cc"
    "${KERNEL_EXPLORER_ROOT}/*.h"
)

# Sources and headers in kernels/ (host and device files alike).
file(
  GLOB kernel_explorer_kernel_srcs
  CONFIGURE_DEPENDS
    "${KERNEL_EXPLORER_ROOT}/kernels/*.cc"
    "${KERNEL_EXPLORER_ROOT}/kernels/*.h"
    "${KERNEL_EXPLORER_ROOT}/kernels/*.cu"
    "${KERNEL_EXPLORER_ROOT}/kernels/*.cuh"
)

# Create the kernel_explorer module from the globbed sources. Its include
# directories, link libraries and compile definitions are borrowed from the
# onnxruntime_pybind11_state target through TARGET_PROPERTY generator
# expressions, so both targets are compiled with the same settings.
onnxruntime_add_shared_library_module(kernel_explorer ${kernel_explorer_srcs} ${kernel_explorer_kernel_srcs})

# The produced binary gets a leading underscore in its file name.
set_target_properties(kernel_explorer PROPERTIES PREFIX "_")

target_include_directories(kernel_explorer PUBLIC
  $<TARGET_PROPERTY:onnxruntime_pybind11_state,INCLUDE_DIRECTORIES>
  ${KERNEL_EXPLORER_ROOT})
target_link_libraries(kernel_explorer PRIVATE $<TARGET_PROPERTY:onnxruntime_pybind11_state,LINK_LIBRARIES>)
target_compile_definitions(kernel_explorer PRIVATE $<TARGET_PROPERTY:onnxruntime_pybind11_state,COMPILE_DEFINITIONS>)

# -Wno-sign-compare is GCC/Clang option syntax that MSVC's cl.exe does not
# understand, so only pass it when the C++ compiler is not MSVC. Builds with
# GCC or Clang receive exactly the same flag as before.
target_compile_options(kernel_explorer PRIVATE
  $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-sign-compare>)

# Backend-specific sources, include paths and preprocessor definitions.
if(onnxruntime_USE_CUDA)
  # CUDA-only kernels live in kernels/cuda.
  file(
    GLOB kernel_explorer_cuda_kernel_srcs
    CONFIGURE_DEPENDS
      "${KERNEL_EXPLORER_ROOT}/kernels/cuda/*.cc"
      "${KERNEL_EXPLORER_ROOT}/kernels/cuda/*.h"
      "${KERNEL_EXPLORER_ROOT}/kernels/cuda/*.cu"
      "${KERNEL_EXPLORER_ROOT}/kernels/cuda/*.cuh"
  )
  target_sources(kernel_explorer PRIVATE ${kernel_explorer_cuda_kernel_srcs})
  target_include_directories(kernel_explorer PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
elseif(onnxruntime_USE_ROCM)
  # ROCm-only kernels live in kernels/rocm.
  file(
    GLOB kernel_explorer_rocm_kernel_srcs
    CONFIGURE_DEPENDS
      "${KERNEL_EXPLORER_ROOT}/kernels/rocm/*.cc"
      "${KERNEL_EXPLORER_ROOT}/kernels/rocm/*.h"
      "${KERNEL_EXPLORER_ROOT}/kernels/rocm/*.cu"
      "${KERNEL_EXPLORER_ROOT}/kernels/rocm/*.cuh"
  )
  # Both the shared kernels and the ROCm-specific ones are handed to the
  # project helper that assigns the HIP language to source files.
  auto_set_source_files_hip_language(${kernel_explorer_kernel_srcs} ${kernel_explorer_rocm_kernel_srcs})
  target_sources(kernel_explorer PRIVATE ${kernel_explorer_rocm_kernel_srcs})

  # Definitions that are always present for a ROCm build.
  target_compile_definitions(kernel_explorer
    PRIVATE
      __HIP_PLATFORM_AMD__=1
      __HIP_PLATFORM_HCC__=1
      HIPBLAS_V2
  )

  # Optional features: each one contributes its own USE_* definition.
  if(onnxruntime_USE_COMPOSABLE_KERNEL)
    target_compile_definitions(kernel_explorer PRIVATE USE_COMPOSABLE_KERNEL)
    if(onnxruntime_USE_COMPOSABLE_KERNEL_CK_TILE)
      target_compile_definitions(kernel_explorer PRIVATE USE_COMPOSABLE_KERNEL_CK_TILE)
    endif()
    target_link_libraries(kernel_explorer PRIVATE onnxruntime_composable_kernel_includes)
  endif()

  if(onnxruntime_USE_TRITON_KERNEL)
    target_compile_definitions(kernel_explorer PRIVATE USE_TRITON_KERNEL)
  endif()

  if(onnxruntime_USE_HIPBLASLT)
    target_compile_definitions(kernel_explorer PRIVATE USE_HIPBLASLT)
  endif()

  if(onnxruntime_USE_ROCBLAS_EXTENSION_API)
    target_compile_definitions(kernel_explorer
      PRIVATE
        USE_ROCBLAS_EXTENSION_API
        ROCBLAS_NO_DEPRECATED_WARNINGS
        ROCBLAS_BETA_FEATURES_API
    )
  endif()
endif()

# Make sure onnxruntime_pybind11_state is built before kernel_explorer.
# This only orders the build; it adds no link dependency.
add_dependencies(
  kernel_explorer
  onnxruntime_pybind11_state
)

# Turn on CTest support and locate a Python interpreter.
enable_testing()
find_package(Python REQUIRED COMPONENTS Interpreter)

# Test registration is currently disabled; the intended command was:
# add_test(NAME test_kernels COMMAND ${Python_EXECUTABLE} -m pytest ..)
Rework cmake for kernel_explorer (#12079) Improve CMake for deep integration with ORT, so that we can easily hook ort function of microbenchmarking purpose. 2022-07-13 07:43:32 +00:00			`# Copyright (c) Microsoft Corporation. All rights reserved.`
			`# Licensed under the MIT License.`

Share TunableOp between CUDA and ROCM EP (#13560) Make TunableOp to support CUDA kernel authoring and add the corresponding supports for kernel explorer 2022-11-11 05:56:44 +00:00			`include(CheckLanguage)`
Rework cmake for kernel_explorer (#12079) Improve CMake for deep integration with ORT, so that we can easily hook ort function of microbenchmarking purpose. 2022-07-13 07:43:32 +00:00
			`if(NOT onnxruntime_ENABLE_PYTHON)`
			`message(FATAL_ERROR "python is required but is not enabled")`
			`endif()`

			`set(KERNEL_EXPLORER_ROOT ${ONNXRUNTIME_ROOT}/python/tools/kernel_explorer)`

Share TunableOp between CUDA and ROCM EP (#13560) Make TunableOp to support CUDA kernel authoring and add the corresponding supports for kernel explorer 2022-11-11 05:56:44 +00:00			`if (onnxruntime_USE_CUDA)`
			`check_language(CUDA)`
			`set(LANGUAGE CUDA)`
			`set(BERT_DIR ${ONNXRUNTIME_ROOT}/contrib_ops/cuda/bert)`
			`elseif(onnxruntime_USE_ROCM)`
			`check_language(HIP)`
			`set(LANGUAGE HIP)`
Make CK an optional dependencies and only built with ck if ROCm >= 5.3 (#14232) Recently, ck dropped ROCm 5.2 support, which is causing packaging pipeline failures. This PR workaround it. 2023-01-12 09:09:40 +00:00			`if (onnxruntime_USE_COMPOSABLE_KERNEL)`
			`include(composable_kernel)`
			`endif()`
[ROCm] add hipblaslt into GemmFastGelu TunableOp (#15945) add hipblaslt into GemmFastGelu TunableOp. 2023-05-23 03:07:09 +00:00			`if (onnxruntime_USE_HIPBLASLT)`
			`find_package(hipblaslt REQUIRED)`
			`endif()`
Share TunableOp between CUDA and ROCM EP (#13560) Make TunableOp to support CUDA kernel authoring and add the corresponding supports for kernel explorer 2022-11-11 05:56:44 +00:00			`set(BERT_DIR ${ONNXRUNTIME_ROOT}/contrib_ops/rocm/bert)`
			`endif()`

			`file(GLOB kernel_explorer_srcs CONFIGURE_DEPENDS`
			`"${KERNEL_EXPLORER_ROOT}/*.cc"`
			`"${KERNEL_EXPLORER_ROOT}/*.h"`
			`)`
Update ROCm CI to use HIP LANGUAGE (#13214) Update for ROCm CI before reland tunable GEMM #12853. This PR also update composable kernel to use CMakes's HIP language support so that we can mix C/C++ compiler with HIP compiler instead of locking to hip-clang 2022-10-05 08:15:16 +00:00
Share TunableOp between CUDA and ROCM EP (#13560) Make TunableOp to support CUDA kernel authoring and add the corresponding supports for kernel explorer 2022-11-11 05:56:44 +00:00			`file(GLOB kernel_explorer_kernel_srcs CONFIGURE_DEPENDS`
			`"${KERNEL_EXPLORER_ROOT}/kernels/*.cc"`
			`"${KERNEL_EXPLORER_ROOT}/kernels/*.h"`
			`"${KERNEL_EXPLORER_ROOT}/kernels/*.cu"`
			`"${KERNEL_EXPLORER_ROOT}/kernels/*.cuh"`
			`)`
Rework cmake for kernel_explorer (#12079) Improve CMake for deep integration with ORT, so that we can easily hook ort function of microbenchmarking purpose. 2022-07-13 07:43:32 +00:00
Share TunableOp between CUDA and ROCM EP (#13560) Make TunableOp to support CUDA kernel authoring and add the corresponding supports for kernel explorer 2022-11-11 05:56:44 +00:00			`onnxruntime_add_shared_library_module(kernel_explorer ${kernel_explorer_srcs} ${kernel_explorer_kernel_srcs})`
Rework cmake for kernel_explorer (#12079) Improve CMake for deep integration with ORT, so that we can easily hook ort function of microbenchmarking purpose. 2022-07-13 07:43:32 +00:00			`set_target_properties(kernel_explorer PROPERTIES PREFIX "_")`
			`target_include_directories(kernel_explorer PUBLIC`
			`$<TARGET_PROPERTY:onnxruntime_pybind11_state,INCLUDE_DIRECTORIES>`
			`${KERNEL_EXPLORER_ROOT})`
Share TunableOp between CUDA and ROCM EP (#13560) Make TunableOp to support CUDA kernel authoring and add the corresponding supports for kernel explorer 2022-11-11 05:56:44 +00:00			`target_link_libraries(kernel_explorer PRIVATE $<TARGET_PROPERTY:onnxruntime_pybind11_state,LINK_LIBRARIES>)`
			`target_compile_definitions(kernel_explorer PRIVATE $<TARGET_PROPERTY:onnxruntime_pybind11_state,COMPILE_DEFINITIONS>)`
			`target_compile_options(kernel_explorer PRIVATE -Wno-sign-compare)`

			`if (onnxruntime_USE_CUDA)`
			`file(GLOB kernel_explorer_cuda_kernel_srcs CONFIGURE_DEPENDS`
			`"${KERNEL_EXPLORER_ROOT}/kernels/cuda/*.cc"`
			`"${KERNEL_EXPLORER_ROOT}/kernels/cuda/*.h"`
			`"${KERNEL_EXPLORER_ROOT}/kernels/cuda/*.cu"`
			`"${KERNEL_EXPLORER_ROOT}/kernels/cuda/*.cuh"`
			`)`
			`target_sources(kernel_explorer PRIVATE ${kernel_explorer_cuda_kernel_srcs})`
Add TuningContext for TunableOp (#14557) This makes the TunableOp tuning results state-free and will allow us to dump and load offline tuning results. 2023-02-10 06:27:43 +00:00			`target_include_directories(kernel_explorer PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})`
Share TunableOp between CUDA and ROCM EP (#13560) Make TunableOp to support CUDA kernel authoring and add the corresponding supports for kernel explorer 2022-11-11 05:56:44 +00:00			`elseif (onnxruntime_USE_ROCM)`
			`file(GLOB kernel_explorer_rocm_kernel_srcs CONFIGURE_DEPENDS`
			`"${KERNEL_EXPLORER_ROOT}/kernels/rocm/*.cc"`
			`"${KERNEL_EXPLORER_ROOT}/kernels/rocm/*.h"`
			`"${KERNEL_EXPLORER_ROOT}/kernels/rocm/*.cu"`
			`"${KERNEL_EXPLORER_ROOT}/kernels/rocm/*.cuh"`
			`)`
			`auto_set_source_files_hip_language(${kernel_explorer_kernel_srcs} ${kernel_explorer_rocm_kernel_srcs})`
			`target_sources(kernel_explorer PRIVATE ${kernel_explorer_rocm_kernel_srcs})`
[ROCm] prefer hip interfaces over roc during hipify (#22394) ### Description Change the hipify step to remove the -roc option to hipify-perl. This will prefer hipblas over rocblas. rocblas can still be called directly such as in TunableOp. ### Motivation and Context hip interfaces are preferred over roc for porting from cuda to hip. Calling roc interfaces is meant for ROCm-specific enhancements or extensions. 2024-10-15 03:34:03 +00:00			`target_compile_definitions(kernel_explorer PRIVATE __HIP_PLATFORM_AMD__=1 __HIP_PLATFORM_HCC__=1 HIPBLAS_V2)`
Make CK an optional dependencies and only built with ck if ROCm >= 5.3 (#14232) Recently, ck dropped ROCm 5.2 support, which is causing packaging pipeline failures. This PR workaround it. 2023-01-12 09:09:40 +00:00			`if (onnxruntime_USE_COMPOSABLE_KERNEL)`
			`target_compile_definitions(kernel_explorer PRIVATE USE_COMPOSABLE_KERNEL)`
[ROCm] Update ck to use ck_tile (#21030) 2024-06-19 06:06:10 +00:00			`if (onnxruntime_USE_COMPOSABLE_KERNEL_CK_TILE)`
			`target_compile_definitions(kernel_explorer PRIVATE USE_COMPOSABLE_KERNEL_CK_TILE)`
			`endif()`
Make CK an optional dependencies and only built with ck if ROCm >= 5.3 (#14232) Recently, ck dropped ROCm 5.2 support, which is causing packaging pipeline failures. This PR workaround it. 2023-01-12 09:09:40 +00:00			`target_link_libraries(kernel_explorer PRIVATE onnxruntime_composable_kernel_includes)`
			`endif()`
integrate triton into ort (#15862) ### Description In some scenarios, the triton written kernels are more performant than CK or other handwritten kernels, so we implement a framework that onnxruntime can use these triton written kernels. This PR is to integrate triton into ort, so that ort can use kernels that written and compiled by triton. The main change focus on two part: 1. a build part to compile triton written kernel and combine these kernels into libonnxruntime_providers_rocm.so 2. a loader and launcher in c++, for loading and launch triton written kernels. #### Build To compile triton written kernel, add a script `tools/ci_build/compile_triton.py`. This script will dynamic load all kernel files, compile them, and generate `triton_kernel_infos.a` and `triton_kernel_infos.h`. `triton_kernel_infos.a` contains all compiled kernel instructions, this file will be combined into libonnxruntime_providers_rocm.so, using --whole-archive flag. `triton_kernel_infos.h` defines a const array that contains all the metadata for each compiled kernel. These metadata will be used for load and launch. So this header file is included by 'triton_kernel.cu' which defines load and launch functions. Add a build flag in build.py and CMakeList.txt, when building rocm provider, it will call triton_kernel build command, and generate all necessary files. #### C++ Load and Launch On c++ part, we implement load and launch functions in triton_kernel.cu and triton_kernel.h. These two files located in `providers/cuda`, and when compiling rocm, they will be hipified. so this part supports both cuda and rocm. But currently we only call triton kernel in rocm. We also implement a softmax triton op for example. Because there will generate many kernels for different input shape of softmax, we use TunableOp to select the best one. ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. 
--> 2023-05-17 01:35:28 +00:00			`if (onnxruntime_USE_TRITON_KERNEL)`
			`target_compile_definitions(kernel_explorer PRIVATE USE_TRITON_KERNEL)`
			`endif()`
[ROCm] add hipblaslt into GemmFastGelu TunableOp (#15945) add hipblaslt into GemmFastGelu TunableOp. 2023-05-23 03:07:09 +00:00			`if (onnxruntime_USE_HIPBLASLT)`
			`target_compile_definitions(kernel_explorer PRIVATE USE_HIPBLASLT)`
			`endif()`
[ROCm] TunableOp: Update rocBLAS get_solutions API (since ROCm5.6) (#16657) ### Description - Update existing rocBLAS get_solutions API using `*_get_solutions_by_type` (supported from ROCm5.6); remove the original nested TunableOp logic. - Update kernel_explorer. 2023-07-13 03:20:26 +00:00			`if (onnxruntime_USE_ROCBLAS_EXTENSION_API)`
			`target_compile_definitions(kernel_explorer PRIVATE USE_ROCBLAS_EXTENSION_API)`
			`target_compile_definitions(kernel_explorer PRIVATE ROCBLAS_NO_DEPRECATED_WARNINGS)`
			`target_compile_definitions(kernel_explorer PRIVATE ROCBLAS_BETA_FEATURES_API)`
			`endif()`
Share TunableOp between CUDA and ROCM EP (#13560) Make TunableOp to support CUDA kernel authoring and add the corresponding supports for kernel explorer 2022-11-11 05:56:44 +00:00			`endif()`
Add baseline gemm for kernel explorer (#12050) Use rocblasGemmHelper gemm wrapper from ORT and profile for bert param size only. 2022-07-20 05:49:26 +00:00
Rework cmake for kernel_explorer (#12079) Improve CMake for deep integration with ORT, so that we can easily hook ort function of microbenchmarking purpose. 2022-07-13 07:43:32 +00:00			`add_dependencies(kernel_explorer onnxruntime_pybind11_state)`

			`enable_testing()`
			`find_package(Python COMPONENTS Interpreter REQUIRED)`
Add Linux ROCm CI Pipeline (#21798) ### Description * Add new ROCm CI pipeline (`Linux ROCm CI Pipeline`) focusing on inference. * Resolve test errors; disable flaky tests. based on test PR #21614. 2024-08-30 06:50:32 +00:00			`# add_test(NAME test_kernels COMMAND ${Python_EXECUTABLE} -m pytest ..)`