mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-17 21:10:43 +00:00
### Description <!-- Describe your changes. --> Add GemmFastGelu CK implementation. TODO: 1. The performance of CK GemmFastGelu in ORT is not as good as using CK directly; we still need to investigate the reason and improve the CK integration in ORT. `GemmFastGeluUnfused float16 NN m=49152 n=3072 k=768 2298.8064 us 100.89 tflops` `withbias DeviceGemmMultipleD_Xdl_CShuffle<256, 256, 128, 32, 8, 8, Default> LoopScheduler: Default, PipelineVersion: v1 float16 NN m=49152 n=3072 k=768 2401.9799 us 96.56 tflops` ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. --> Co-authored-by: peixuanzuo <peixuanzuo@linmif39a000004.zvflicr54joexhdgnhvmxrxygg.phxx.internal.cloudapp.net>
86 lines
3.2 KiB
Diff
86 lines
3.2 KiB
Diff
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
|
index f861e302..f0b6bcea 100644
|
|
--- a/CMakeLists.txt
|
|
+++ b/CMakeLists.txt
|
|
@@ -1,7 +1,7 @@
|
|
cmake_minimum_required(VERSION 3.14)
|
|
|
|
# Check support for CUDA/HIP in Cmake
|
|
-project(composable_kernel)
|
|
+project(composable_kernel LANGUAGES CXX HIP)
|
|
|
|
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
|
|
|
|
@@ -41,27 +41,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
|
set(CMAKE_CXX_EXTENSIONS OFF)
|
|
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
|
|
|
|
-## OpenMP
|
|
-if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
|
|
- # workaround issue hipcc in rocm3.5 cannot find openmp
|
|
- set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
|
|
- set(OpenMP_CXX_FLAGS "-fopenmp=libomp -Wno-unused-command-line-argument")
|
|
- set(OpenMP_CXX_LIB_NAMES "libomp" "libgomp" "libiomp5")
|
|
- set(OpenMP_libomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
|
|
- set(OpenMP_libgomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
|
|
- set(OpenMP_libiomp5_LIBRARY ${OpenMP_CXX_LIB_NAMES})
|
|
-else()
|
|
- find_package(OpenMP REQUIRED)
|
|
-endif()
|
|
-
|
|
-message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
|
|
-message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
|
|
-message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
|
|
-message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
|
|
-
|
|
-link_libraries(${OpenMP_gomp_LIBRARY})
|
|
-link_libraries(${OpenMP_pthread_LIBRARY})
|
|
-
|
|
## HIP
|
|
find_package(HIP REQUIRED)
|
|
# Override HIP version in config.h, if necessary.
|
|
@@ -83,8 +62,6 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
|
|
message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}")
|
|
endif()
|
|
message(STATUS "Build with HIP ${HIP_VERSION}")
|
|
-link_libraries(hip::device)
|
|
-add_compile_definitions(__HIP_PLATFORM_HCC__=1)
|
|
|
|
## tidy
|
|
include(EnableCompilerWarnings)
|
|
@@ -273,9 +250,6 @@ rocm_package_setup_component(profiler
|
|
)
|
|
|
|
add_subdirectory(library)
|
|
-add_subdirectory(example)
|
|
-add_subdirectory(test)
|
|
-add_subdirectory(profiler)
|
|
|
|
#Create an interface target for the include only files and call it "composablekernels"
|
|
include(CMakePackageConfigHelpers)
|
|
@@ -301,11 +275,3 @@ rocm_install(FILES
|
|
|
|
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
|
|
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
|
|
-
|
|
-rocm_create_package(
|
|
- NAME composablekernel
|
|
- DESCRIPTION "High Performance Composable Kernel for AMD GPUs"
|
|
- MAINTAINER "MIOpen Kernels Dev Team <dl.MIOpen@amd.com>"
|
|
- LDCONFIG
|
|
- HEADER_ONLY
|
|
-)
|
|
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
|
|
index c206c4dc..e45fac9d 100644
|
|
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
|
|
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
|
|
@@ -1,7 +1,9 @@
|
|
function(add_instance_library INSTANCE_NAME)
|
|
message("adding instance ${INSTANCE_NAME}")
|
|
+ set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
|
|
add_library(${INSTANCE_NAME} OBJECT ${ARGN})
|
|
target_compile_features(${INSTANCE_NAME} PUBLIC)
|
|
+ target_compile_definitions(${INSTANCE_NAME} PRIVATE "__HIP_PLATFORM_AMD__=1" "__HIP_PLATFORM_HCC__=1")
|
|
set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
|
clang_tidy_check(${INSTANCE_NAME})
|
|
endfunction(add_instance_library INSTANCE_NAME)
|