Update CK and fix performance issue on dev machine (#13531)

1. Update CK to its latest develop branch
2. `-mllvm -amdgpu-early-inline-all=true` is critical to CK's
performance, ensure it is properly configured.
- The flags are propagated from target `hip-lang::device`'s
`INTERFACE_COMPILE_OPTIONS`, we must not manually add the flags.
- Instead, we must ensure this target is properly configured by checking
_CMAKE_HIP_DEVICE_RUNTIME_TARGET is set.

TL,DR

`hip-lang::device` sometime will be not be properly configured if our
`CMAKE_PREFIX_PATH` is not configured carefully. In the CI docker, the
configuration is in good state, but on dev machine it is not, which then
silently result poor performance for kernels. We fixed it in this PR and
add a guard to avoid unsuccessful future editing and to prevent
convoluted debugging process.

`_CMAKE_HIP_DEVICE_RUNTIME_TARGET ` is shared in
`/opt/rocm/lib/cmake/hip-lang/hip-lang-config.cmake` and it is internal
to
[CMake](https://gitlab.kitware.com/cmake/cmake/-/merge_requests/6121/diffs),
the variable name will not be changed in the foreseeable future.
This commit is contained in:
cloudhan 2022-11-03 19:32:30 +08:00 committed by GitHub
parent 7c3a23c186
commit 2de883c592
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 49 additions and 63 deletions

View file

@ -1869,6 +1869,9 @@ if (onnxruntime_USE_ROCM)
message(FATAL_ERROR "ROCM does not support build with CUDA!")
endif()
# NOTE: HIP language is added in 3.21 and there are bugs before 3.23.1
cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
set(ROCM_PATH ${onnxruntime_ROCM_HOME})
if (NOT CMAKE_HIP_COMPILER)
@ -1879,15 +1882,24 @@ if (onnxruntime_USE_ROCM)
set(CMAKE_HIP_ARCHITECTURES "gfx906;gfx908;gfx90a;gfx1030")
endif()
file(GLOB rocm_cmake_components ${onnxruntime_ROCM_HOME}/lib/cmake/*)
list(APPEND CMAKE_PREFIX_PATH ${rocm_cmake_components})
enable_language(HIP)
# NOTE: Flags -mllvm -amdgpu-early-inline-all=true are critical for gpu kernel code performance. -mllvm passes the
# next flag to underlying LLVM instead of clang and -amdgpu-early-inline-all=true is the optimization flag for LLVM.
# With CMake's enable_language(HIP), additional flags including the proceeding one are propagated from
# hip-lang::device library. But in some weird cases, the hip-lang::device target may not be properly configured, for
# example, the CMAKE_PREFIX_PATH might be improperly configured.
if(NOT DEFINED _CMAKE_HIP_DEVICE_RUNTIME_TARGET)
message(FATAL_ERROR "HIP Language is not properly configured.")
endif()
add_compile_options("$<$<COMPILE_LANGUAGE:HIP>:SHELL:-x hip>")
if (NOT onnxruntime_HIPIFY_PERL)
set(onnxruntime_HIPIFY_PERL ${onnxruntime_ROCM_HOME}/hip/bin/hipify-perl)
endif()
# NOTE: HIP language is added in 3.21 and there are bugs before 3.23.1
cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
find_package(HIP)
enable_language(HIP)
# replicate strategy used by pytorch to get ROCM_VERSION
# https://github.com/pytorch/pytorch/blob/8eb21488fdcdb8b0e6fa2e46179b5fa6c42e75af/cmake/public/LoadHIP.cmake#L153-L173
file(READ "${ROCM_PATH}/.info/version-dev" ROCM_VERSION_DEV_RAW)

View file

@ -1,5 +1,5 @@
set(composable_kernel_URL https://github.com/ROCmSoftwarePlatform/composable_kernel.git)
set(composable_kernel_TAG e1a3fff67510be2af023b31587e411230b994631) # 2022-08-25 07:43:43 +0800
set(composable_kernel_TAG 8ee36118be9b19b15c2471bffeeeb624afb14044) # 2022-11-01 00:24:25 +0800
set(PATCH ${PROJECT_SOURCE_DIR}/patches/composable_kernel/Fix_Clang_Build.patch)

View file

@ -40,12 +40,11 @@ target_link_libraries(kernel_explorer
# Currently we shall not use composablekernels::device_operations, the target includes all conv dependencies, which
# are extremely slow to compile. Instead, we only link all gemm related objects. See the following link on updating.
# https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/85978e0201/library/src/tensor_operation_instance/gpu/CMakeLists.txt#L33-L54
device_gemm_instance
${HIP_LIB})
device_gemm_instance)
target_compile_definitions(kernel_explorer
PUBLIC ROCM_USE_FLOAT16
PRIVATE $<TARGET_PROPERTY:onnxruntime_pybind11_state,COMPILE_DEFINITIONS>)
target_compile_options(kernel_explorer PRIVATE -Wno-sign-compare -D__HIP_PLATFORM_HCC__=1)
target_compile_options(kernel_explorer PRIVATE -Wno-sign-compare -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1)
add_dependencies(kernel_explorer onnxruntime_pybind11_state)

View file

@ -1264,18 +1264,15 @@ if (onnxruntime_USE_ROCM)
add_definitions(-DUSE_ROCM=1)
include(onnxruntime_rocm_hipify.cmake)
# Add search paths for default hip installation
list(APPEND CMAKE_PREFIX_PATH ${onnxruntime_ROCM_HOME} ${onnxruntime_ROCM_HOME}/hip ${onnxruntime_ROCM_HOME}/hcc ${onnxruntime_ROCM_HOME}/miopen ${onnxruntime_ROCM_HOME}/hiprand ${onnxruntime_ROCM_HOME}/rocrand)
list(APPEND CMAKE_PREFIX_PATH ${onnxruntime_ROCM_HOME}/rccl ${onnxruntime_ROCM_HOME}/roctracer)
set(CMAKE_MODULE_PATH "${onnxruntime_ROCM_HOME}/hip/cmake" ${CMAKE_MODULE_PATH})
find_package(HIP)
find_package(hiprand REQUIRED)
find_library(HIP_LIB amdhip64 REQUIRED)
find_library(ROC_BLAS rocblas REQUIRED)
find_library(MIOPEN_LIB MIOpen REQUIRED)
find_package(rocblas REQUIRED)
find_package(MIOpen REQUIRED)
find_library(RCCL_LIB rccl REQUIRED)
find_library(ROCTRACER_LIB roctracer64 REQUIRED)
set(ONNXRUNTIME_ROCM_LIBS ${HIP_LIB} ${ROC_BLAS} ${MIOPEN_LIB} ${RCCL_LIB} ${ROCTRACER_LIB})
set(ONNXRUNTIME_ROCM_LIBS roc::rocblas MIOpen ${RCCL_LIB} ${ROCTRACER_LIB})
file(GLOB_RECURSE onnxruntime_providers_rocm_cc_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/core/providers/rocm/*.h"
@ -1342,30 +1339,12 @@ if (onnxruntime_USE_ROCM)
list(APPEND onnxruntime_providers_rocm_src ${onnxruntime_rocm_generated_training_ops_cc_srcs} ${onnxruntime_rocm_generated_training_ops_cu_srcs})
endif()
set(HIP_CXX_FLAGS -fPIC)
list(APPEND HIP_CXX_FLAGS -std=c++17)
if(CMAKE_BUILD_TYPE MATCHES Debug)
list(APPEND HIP_CXX_FLAGS -g)
#list(APPEND HIP_CXX_FLAGS -O0)
endif(CMAKE_BUILD_TYPE MATCHES Debug)
list(APPEND HIP_CLANG_FLAGS ${HIP_CXX_FLAGS})
list(APPEND HIP_CLANG_FLAGS ${CMAKE_HIP_FLAGS})
# Generate GPU code during compilation
list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc)
# Generate GPU code
foreach(HIP_ARCH ${CMAKE_HIP_ARCHITECTURES})
list(APPEND HIP_CLANG_FLAGS --offload-arch=${HIP_ARCH})
endforeach()
auto_set_source_files_hip_language(${onnxruntime_providers_rocm_src})
onnxruntime_add_shared_library_module(onnxruntime_providers_rocm ${onnxruntime_providers_rocm_src})
target_compile_options(onnxruntime_providers_rocm PRIVATE -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1)
if(NOT MSVC)
target_compile_options(onnxruntime_providers_rocm PRIVATE -Wno-sign-compare -D__HIP_PLATFORM_HCC__=1)
target_compile_options(onnxruntime_providers_rocm PRIVATE -Wno-sign-compare)
target_compile_options(onnxruntime_providers_rocm PRIVATE -Wno-unused-parameter)
target_compile_options(onnxruntime_providers_rocm PRIVATE -Wno-undefined-var-template)
endif()

View file

@ -117,7 +117,7 @@ if(onnxruntime_USE_CANN)
target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_CANN_HOME}/include)
endif()
if(onnxruntime_USE_ROCM)
target_compile_options(onnxruntime_pybind11_state PUBLIC -D__HIP_PLATFORM_HCC__=1)
target_compile_options(onnxruntime_pybind11_state PUBLIC -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1)
target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_ROCM_HOME}/hipfft/include ${onnxruntime_ROCM_HOME}/include ${onnxruntime_ROCM_HOME}/hiprand/include ${onnxruntime_ROCM_HOME}/rocrand/include ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining)
endif()
if (onnxruntime_USE_NCCL)

View file

@ -47,7 +47,7 @@ if (onnxruntime_USE_CUDA)
target_include_directories(onnxruntime_session PRIVATE ${onnxruntime_CUDNN_HOME}/include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
endif()
if (onnxruntime_USE_ROCM)
target_compile_options(onnxruntime_session PRIVATE -Wno-sign-compare -D__HIP_PLATFORM_HCC__=1)
target_compile_options(onnxruntime_session PRIVATE -Wno-sign-compare -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1)
target_include_directories(onnxruntime_session PRIVATE ${onnxruntime_ROCM_HOME}/hipfft/include ${onnxruntime_ROCM_HOME}/include ${onnxruntime_ROCM_HOME}/hipcub/include ${onnxruntime_ROCM_HOME}/hiprand/include ${onnxruntime_ROCM_HOME}/rocrand/include)
# ROCM provider sources are generated, need to add include directory for generated headers
target_include_directories(onnxruntime_session PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining)

View file

@ -100,7 +100,7 @@ if (onnxruntime_BUILD_UNIT_TESTS)
endif()
if (onnxruntime_USE_ROCM)
target_compile_options(onnxruntime_training_runner PUBLIC -D__HIP_PLATFORM_HCC__=1)
target_compile_options(onnxruntime_training_runner PUBLIC -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1)
endif()
set_target_properties(onnxruntime_training_runner PROPERTIES FOLDER "ONNXRuntimeTest")

View file

@ -586,12 +586,6 @@ if(onnxruntime_USE_XNNPACK)
list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_xnnpack)
endif()
if(onnxruntime_USE_ROCM)
find_library(HIP_LIB amdhip64 REQUIRED)
list(APPEND onnxruntime_test_providers_libs ${HIP_LIB})
endif()
if(WIN32)
if (onnxruntime_USE_TVM)
list(APPEND disabled_warnings ${DISABLED_WARNINGS_FOR_TVM})
@ -754,7 +748,7 @@ if (onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS)
target_link_libraries(onnxruntime_test_all PRIVATE onnxruntime_language_interop onnxruntime_pyop)
endif()
if (onnxruntime_USE_ROCM)
target_compile_options(onnxruntime_test_all PRIVATE -D__HIP_PLATFORM_HCC__=1)
target_compile_options(onnxruntime_test_all PRIVATE -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1)
target_include_directories(onnxruntime_test_all PRIVATE ${onnxruntime_ROCM_HOME}/hipfft/include ${onnxruntime_ROCM_HOME}/include ${onnxruntime_ROCM_HOME}/hiprand/include ${onnxruntime_ROCM_HOME}/rocrand/include ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining)
endif()
if (onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)

View file

@ -1,5 +1,5 @@
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3e1174ec..65648cb7 100644
index 5655ba17..1252d1ea 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
@ -11,7 +11,7 @@ index 3e1174ec..65648cb7 100644
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
@@ -36,27 +36,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -41,27 +41,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
@ -39,7 +39,16 @@ index 3e1174ec..65648cb7 100644
## HIP
find_package(HIP REQUIRED)
# Override HIP version in config.h, if necessary.
@@ -245,9 +224,6 @@ rocm_package_setup_component(tests
@@ -83,8 +62,6 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}")
endif()
message(STATUS "Build with HIP ${HIP_VERSION}")
-link_libraries(hip::device)
-add_compile_definitions(__HIP_PLATFORM_HCC__=1)
## tidy
include(EnableCompilerWarnings)
@@ -263,9 +240,6 @@ rocm_package_setup_component(tests
)
add_subdirectory(library)
@ -49,7 +58,7 @@ index 3e1174ec..65648cb7 100644
#Create an interface target for the include only files and call it "composablekernels"
include(CMakePackageConfigHelpers)
@@ -273,11 +249,3 @@ rocm_install(FILES
@@ -291,11 +265,3 @@ rocm_install(FILES
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
@ -62,7 +71,7 @@ index 3e1174ec..65648cb7 100644
- HEADER_ONLY
-)
diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index fcaec592..8ea06421 100644
index 92018aac..2ada620c 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -126,7 +126,9 @@
@ -76,23 +85,16 @@ index fcaec592..8ea06421 100644
// hack: have underlying assumption that need to be satsified, otherwise it's a bug
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
index 6f3f900b..594d983d 100644
index c206c4dc..e45fac9d 100644
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -1,5 +1,6 @@
@@ -1,7 +1,9 @@
function(add_instance_library INSTANCE_NAME)
message("adding instance ${INSTANCE_NAME}")
+ set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
add_library(${INSTANCE_NAME} OBJECT ${ARGN})
target_compile_features(${INSTANCE_NAME} PUBLIC)
+ target_compile_definitions(${INSTANCE_NAME} PRIVATE "__HIP_PLATFORM_AMD__=1" "__HIP_PLATFORM_HCC__=1")
set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
index 5dc20332..78eedca5 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
@@ -1,4 +1,4 @@
-add_library(device_grouped_conv3d_fwd_instance
+add_instance_library(device_grouped_conv3d_fwd_instance
device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
clang_tidy_check(${INSTANCE_NAME})
endfunction(add_instance_library INSTANCE_NAME)