mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-16 21:00:14 +00:00
### Description <!-- Describe your changes. --> Add a GemmFastGelu CK implementation. TODO: 1. The performance of CK GemmFastGelu in ORT is not as good as using CK directly; we still need to investigate the reason and improve CK in ORT. `GemmFastGeluUnfused float16 NN m=49152 n=3072 k=768 2298.8064 us 100.89 tflops` `withbias DeviceGemmMultipleD_Xdl_CShuffle<256, 256, 128, 32, 8, 8, Default> LoopScheduler: Default, PipelineVersion: v1 float16 NN m=49152 n=3072 k=768 2401.9799 us 96.56 tflops` ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. --> Co-authored-by: peixuanzuo <peixuanzuo@linmif39a000004.zvflicr54joexhdgnhvmxrxygg.phxx.internal.cloudapp.net>
69 lines
2.7 KiB
CMake
69 lines
2.7 KiB
CMake
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
|
|
|
|
include(CheckLanguage)
|
|
|
|
# Abort configuration unless the Python build (onnxruntime_ENABLE_PYTHON) is on.
if(NOT onnxruntime_ENABLE_PYTHON)
  message(FATAL_ERROR "python is required but is not enabled")
endif()

# Root directory of the kernel explorer sources, relative to ONNXRUNTIME_ROOT.
# The source globs and the include path further down are built from it.
set(KERNEL_EXPLORER_ROOT "${ONNXRUNTIME_ROOT}/python/tools/kernel_explorer")
|
|
|
|
# Pick the GPU language and the matching BERT contrib-op directory for the
# enabled backend. CUDA wins when both options are on; when neither is on,
# LANGUAGE and BERT_DIR are left untouched.
if(onnxruntime_USE_CUDA)
  check_language(CUDA)
  set(LANGUAGE CUDA)
  set(BERT_DIR "${ONNXRUNTIME_ROOT}/contrib_ops/cuda/bert")
elseif(onnxruntime_USE_ROCM)
  check_language(HIP)
  set(LANGUAGE HIP)
  # Loads the project's composable_kernel module; presumably this is what
  # provides onnxruntime_composable_kernel_includes -- confirm in that module.
  include(composable_kernel)
  set(BERT_DIR "${ONNXRUNTIME_ROOT}/contrib_ops/rocm/bert")
endif()
|
|
|
|
# Top-level kernel explorer sources and headers. CONFIGURE_DEPENDS asks the
# build system to re-run the glob so added or removed files are picked up.
file(
  GLOB kernel_explorer_srcs
  CONFIGURE_DEPENDS
    "${KERNEL_EXPLORER_ROOT}/*.cc"
    "${KERNEL_EXPLORER_ROOT}/*.h"
)

# Sources located directly in kernels/ (host and device files alike); the
# backend-specific subdirectories are globbed separately.
file(
  GLOB kernel_explorer_kernel_srcs
  CONFIGURE_DEPENDS
    "${KERNEL_EXPLORER_ROOT}/kernels/*.cc"
    "${KERNEL_EXPLORER_ROOT}/kernels/*.h"
    "${KERNEL_EXPLORER_ROOT}/kernels/*.cu"
    "${KERNEL_EXPLORER_ROOT}/kernels/*.cuh"
)
|
|
|
|
# Define the kernel_explorer module from the globbed source lists.
onnxruntime_add_shared_library_module(
  kernel_explorer
  ${kernel_explorer_srcs}
  ${kernel_explorer_kernel_srcs}
)

# Output file name gets a leading underscore: _kernel_explorer.<ext>.
set_property(TARGET kernel_explorer PROPERTY PREFIX "_")

# Reuse the include directories, link libraries and compile definitions of
# onnxruntime_pybind11_state by reading its target properties at generate time.
target_include_directories(
  kernel_explorer
  PUBLIC
    $<TARGET_PROPERTY:onnxruntime_pybind11_state,INCLUDE_DIRECTORIES>
    ${KERNEL_EXPLORER_ROOT}
)
target_link_libraries(
  kernel_explorer
  PRIVATE
    $<TARGET_PROPERTY:onnxruntime_pybind11_state,LINK_LIBRARIES>
)
target_compile_definitions(
  kernel_explorer
  PRIVATE
    $<TARGET_PROPERTY:onnxruntime_pybind11_state,COMPILE_DEFINITIONS>
)

# NOTE(review): this flag is passed unconditionally, for every compiler and
# every compile language of the target -- confirm that is intended.
target_compile_options(kernel_explorer PRIVATE -Wno-sign-compare)
|
|
|
|
# Add the backend-specific kernel sources. CUDA takes precedence over ROCm
# when both options are enabled.
if(onnxruntime_USE_CUDA)
  file(
    GLOB kernel_explorer_cuda_kernel_srcs
    CONFIGURE_DEPENDS
      "${KERNEL_EXPLORER_ROOT}/kernels/cuda/*.cc"
      "${KERNEL_EXPLORER_ROOT}/kernels/cuda/*.h"
      "${KERNEL_EXPLORER_ROOT}/kernels/cuda/*.cu"
      "${KERNEL_EXPLORER_ROOT}/kernels/cuda/*.cuh"
  )
  target_sources(kernel_explorer PRIVATE ${kernel_explorer_cuda_kernel_srcs})
elseif(onnxruntime_USE_ROCM)
  file(
    GLOB kernel_explorer_rocm_kernel_srcs
    CONFIGURE_DEPENDS
      "${KERNEL_EXPLORER_ROOT}/kernels/rocm/*.cc"
      "${KERNEL_EXPLORER_ROOT}/kernels/rocm/*.h"
      "${KERNEL_EXPLORER_ROOT}/kernels/rocm/*.cu"
      "${KERNEL_EXPLORER_ROOT}/kernels/rocm/*.cuh"
  )

  # Project helper defined elsewhere; presumably marks the shared and the
  # ROCm-specific kernel sources as HIP -- confirm in its definition.
  auto_set_source_files_hip_language(
    ${kernel_explorer_kernel_srcs}
    ${kernel_explorer_rocm_kernel_srcs}
  )
  target_sources(kernel_explorer PRIVATE ${kernel_explorer_rocm_kernel_srcs})

  target_compile_definitions(
    kernel_explorer
    PRIVATE
      __HIP_PLATFORM_AMD__=1
      __HIP_PLATFORM_HCC__=1
  )
  target_link_libraries(
    kernel_explorer
    PRIVATE
      onnxruntime_composable_kernel_includes
  )
endif()
|
|
|
|
# Build-order edge only: onnxruntime_pybind11_state is built before
# kernel_explorer. This call adds no link dependency by itself.
add_dependencies(kernel_explorer onnxruntime_pybind11_state)

# Register a CTest entry that runs pytest with the located Python interpreter.
enable_testing()
find_package(Python COMPONENTS Interpreter REQUIRED)

# The working directory is spelled out; it is the same directory add_test()
# uses when none is given, so ".." is the parent of the current binary dir.
# NOTE(review): confirm that ".." is really where the kernel tests live.
add_test(
  NAME test_kernels
  COMMAND ${Python_EXECUTABLE} -m pytest ..
  WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
)
|