mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-18 21:21:17 +00:00
1. Update CK to its latest develop branch 2. `-mllvm -amdgpu-early-inline-all=true` is critical to CK's performance, ensure it is properly configured. - The flags are propagated from target `hip-lang::device`'s `INTERFACE_COMPILE_OPTIONS`, we must not manually add the flags. - Instead, we must ensure this target is properly configured by checking _CMAKE_HIP_DEVICE_RUNTIME_TARGET is set. TL,DR `hip-lang::device` sometime will be not be properly configured if our `CMAKE_PREFIX_PATH` is not configured carefully. In the CI docker, the configuration is in good state, but on dev machine it is not, which then silently result poor performance for kernels. We fixed it in this PR and add a guard to avoid unsuccessful future editing and to prevent convoluted debugging process. `_CMAKE_HIP_DEVICE_RUNTIME_TARGET ` is shared in `/opt/rocm/lib/cmake/hip-lang/hip-lang-config.cmake` and it is internal to [CMake](https://gitlab.kitware.com/cmake/cmake/-/merge_requests/6121/diffs), the variable name will not be changed in the foreseeable future.
100 lines
3.7 KiB
Diff
100 lines
3.7 KiB
Diff
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
|
index 5655ba17..1252d1ea 100644
|
|
--- a/CMakeLists.txt
|
|
+++ b/CMakeLists.txt
|
|
@@ -1,7 +1,7 @@
|
|
cmake_minimum_required(VERSION 3.14)
|
|
|
|
# Check support for CUDA/HIP in Cmake
|
|
-project(composable_kernel)
|
|
+project(composable_kernel LANGUAGES CXX HIP)
|
|
|
|
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
|
|
|
|
@@ -41,27 +41,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
|
set(CMAKE_CXX_EXTENSIONS OFF)
|
|
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
|
|
|
|
-## OpenMP
|
|
-if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
|
|
- # workaround issue hipcc in rocm3.5 cannot find openmp
|
|
- set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
|
|
- set(OpenMP_CXX_FLAGS "-fopenmp=libomp -Wno-unused-command-line-argument")
|
|
- set(OpenMP_CXX_LIB_NAMES "libomp" "libgomp" "libiomp5")
|
|
- set(OpenMP_libomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
|
|
- set(OpenMP_libgomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
|
|
- set(OpenMP_libiomp5_LIBRARY ${OpenMP_CXX_LIB_NAMES})
|
|
-else()
|
|
- find_package(OpenMP REQUIRED)
|
|
-endif()
|
|
-
|
|
-message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
|
|
-message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
|
|
-message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
|
|
-message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
|
|
-
|
|
-link_libraries(${OpenMP_gomp_LIBRARY})
|
|
-link_libraries(${OpenMP_pthread_LIBRARY})
|
|
-
|
|
## HIP
|
|
find_package(HIP REQUIRED)
|
|
# Override HIP version in config.h, if necessary.
|
|
@@ -83,8 +62,6 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
|
|
message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}")
|
|
endif()
|
|
message(STATUS "Build with HIP ${HIP_VERSION}")
|
|
-link_libraries(hip::device)
|
|
-add_compile_definitions(__HIP_PLATFORM_HCC__=1)
|
|
|
|
## tidy
|
|
include(EnableCompilerWarnings)
|
|
@@ -263,9 +240,6 @@ rocm_package_setup_component(tests
|
|
)
|
|
|
|
add_subdirectory(library)
|
|
-add_subdirectory(example)
|
|
-add_subdirectory(test)
|
|
-add_subdirectory(profiler)
|
|
|
|
#Create an interface target for the include only files and call it "composablekernels"
|
|
include(CMakePackageConfigHelpers)
|
|
@@ -291,11 +265,3 @@ rocm_install(FILES
|
|
|
|
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
|
|
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
|
|
-
|
|
-rocm_create_package(
|
|
- NAME composablekernel
|
|
- DESCRIPTION "High Performance Composable Kernel for AMD GPUs"
|
|
- MAINTAINER "MIOpen Kernels Dev Team <dl.MIOpen@amd.com>"
|
|
- LDCONFIG
|
|
- HEADER_ONLY
|
|
-)
|
|
diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
|
|
index 92018aac..2ada620c 100644
|
|
--- a/include/ck/ck.hpp
|
|
+++ b/include/ck/ck.hpp
|
|
@@ -126,7 +126,9 @@
|
|
#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
|
|
|
|
// experimental feature: optimize for inter-wave scheduling policy
|
|
+#ifndef CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
|
|
#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING 0
|
|
+#endif
|
|
#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS 1
|
|
|
|
// hack: have underlying assumption that need to be satsified, otherwise it's a bug
|
|
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
|
|
index c206c4dc..e45fac9d 100644
|
|
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
|
|
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
|
|
@@ -1,7 +1,9 @@
|
|
function(add_instance_library INSTANCE_NAME)
|
|
message("adding instance ${INSTANCE_NAME}")
|
|
+ set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
|
|
add_library(${INSTANCE_NAME} OBJECT ${ARGN})
|
|
target_compile_features(${INSTANCE_NAME} PUBLIC)
|
|
+ target_compile_definitions(${INSTANCE_NAME} PRIVATE "__HIP_PLATFORM_AMD__=1" "__HIP_PLATFORM_HCC__=1")
|
|
set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
|
clang_tidy_check(${INSTANCE_NAME})
|
|
endfunction(add_instance_library INSTANCE_NAME)
|