onnxruntime/cmake/patches/composable_kernel/Fix_Clang_Build.patch
cloudhan 2de883c592
Update CK and fix performance issue on dev machine (#13531)
1. Update CK to its latest develop branch
2. `-mllvm -amdgpu-early-inline-all=true` is critical to CK's
performance, ensure it is properly configured.
- The flags are propagated from target `hip-lang::device`'s
`INTERFACE_COMPILE_OPTIONS`, we must not manually add the flags.
- Instead, we must ensure this target is properly configured by checking
_CMAKE_HIP_DEVICE_RUNTIME_TARGET is set.

TL,DR

`hip-lang::device` sometime will be not be properly configured if our
`CMAKE_PREFIX_PATH` is not configured carefully. In the CI docker, the
configuration is in good state, but on dev machine it is not, which then
silently result poor performance for kernels. We fixed it in this PR and
add a guard to avoid unsuccessful future editing and to prevent
convoluted debugging process.

`_CMAKE_HIP_DEVICE_RUNTIME_TARGET ` is shared in
`/opt/rocm/lib/cmake/hip-lang/hip-lang-config.cmake` and it is internal
to
[CMake](https://gitlab.kitware.com/cmake/cmake/-/merge_requests/6121/diffs),
the variable name will not be changed in the foreseeable future.
2022-11-03 19:32:30 +08:00

100 lines
3.7 KiB
Diff

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5655ba17..1252d1ea 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.14)
# Check support for CUDA/HIP in Cmake
-project(composable_kernel)
+project(composable_kernel LANGUAGES CXX HIP)
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
@@ -41,27 +41,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
-## OpenMP
-if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
- # workaround issue hipcc in rocm3.5 cannot find openmp
- set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
- set(OpenMP_CXX_FLAGS "-fopenmp=libomp -Wno-unused-command-line-argument")
- set(OpenMP_CXX_LIB_NAMES "libomp" "libgomp" "libiomp5")
- set(OpenMP_libomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
- set(OpenMP_libgomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
- set(OpenMP_libiomp5_LIBRARY ${OpenMP_CXX_LIB_NAMES})
-else()
- find_package(OpenMP REQUIRED)
-endif()
-
-message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
-message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
-message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
-message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
-
-link_libraries(${OpenMP_gomp_LIBRARY})
-link_libraries(${OpenMP_pthread_LIBRARY})
-
## HIP
find_package(HIP REQUIRED)
# Override HIP version in config.h, if necessary.
@@ -83,8 +62,6 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}")
endif()
message(STATUS "Build with HIP ${HIP_VERSION}")
-link_libraries(hip::device)
-add_compile_definitions(__HIP_PLATFORM_HCC__=1)
## tidy
include(EnableCompilerWarnings)
@@ -263,9 +240,6 @@ rocm_package_setup_component(tests
)
add_subdirectory(library)
-add_subdirectory(example)
-add_subdirectory(test)
-add_subdirectory(profiler)
#Create an interface target for the include only files and call it "composablekernels"
include(CMakePackageConfigHelpers)
@@ -291,11 +265,3 @@ rocm_install(FILES
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
-
-rocm_create_package(
- NAME composablekernel
- DESCRIPTION "High Performance Composable Kernel for AMD GPUs"
- MAINTAINER "MIOpen Kernels Dev Team <dl.MIOpen@amd.com>"
- LDCONFIG
- HEADER_ONLY
-)
diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index 92018aac..2ada620c 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -126,7 +126,9 @@
#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
// experimental feature: optimize for inter-wave scheduling policy
+#ifndef CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING 0
+#endif
#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS 1
// hack: have underlying assumption that need to be satsified, otherwise it's a bug
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
index c206c4dc..e45fac9d 100644
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -1,7 +1,9 @@
function(add_instance_library INSTANCE_NAME)
message("adding instance ${INSTANCE_NAME}")
+ set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
add_library(${INSTANCE_NAME} OBJECT ${ARGN})
target_compile_features(${INSTANCE_NAME} PUBLIC)
+ target_compile_definitions(${INSTANCE_NAME} PRIVATE "__HIP_PLATFORM_AMD__=1" "__HIP_PLATFORM_HCC__=1")
set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
clang_tidy_check(${INSTANCE_NAME})
endfunction(add_instance_library INSTANCE_NAME)