mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-26 03:00:54 +00:00
Update CK and fix performance issue on dev machine (#13531)
1. Update CK to its latest develop branch 2. `-mllvm -amdgpu-early-inline-all=true` is critical to CK's performance, ensure it is properly configured. - The flags are propagated from target `hip-lang::device`'s `INTERFACE_COMPILE_OPTIONS`, we must not manually add the flags. - Instead, we must ensure this target is properly configured by checking _CMAKE_HIP_DEVICE_RUNTIME_TARGET is set. TL,DR `hip-lang::device` sometime will be not be properly configured if our `CMAKE_PREFIX_PATH` is not configured carefully. In the CI docker, the configuration is in good state, but on dev machine it is not, which then silently result poor performance for kernels. We fixed it in this PR and add a guard to avoid unsuccessful future editing and to prevent convoluted debugging process. `_CMAKE_HIP_DEVICE_RUNTIME_TARGET ` is shared in `/opt/rocm/lib/cmake/hip-lang/hip-lang-config.cmake` and it is internal to [CMake](https://gitlab.kitware.com/cmake/cmake/-/merge_requests/6121/diffs), the variable name will not be changed in the foreseeable future.
This commit is contained in:
parent
7c3a23c186
commit
2de883c592
9 changed files with 49 additions and 63 deletions
|
|
@ -1869,6 +1869,9 @@ if (onnxruntime_USE_ROCM)
|
|||
message(FATAL_ERROR "ROCM does not support build with CUDA!")
|
||||
endif()
|
||||
|
||||
# NOTE: HIP language is added in 3.21 and there are bugs before 3.23.1
|
||||
cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
|
||||
|
||||
set(ROCM_PATH ${onnxruntime_ROCM_HOME})
|
||||
|
||||
if (NOT CMAKE_HIP_COMPILER)
|
||||
|
|
@ -1879,15 +1882,24 @@ if (onnxruntime_USE_ROCM)
|
|||
set(CMAKE_HIP_ARCHITECTURES "gfx906;gfx908;gfx90a;gfx1030")
|
||||
endif()
|
||||
|
||||
file(GLOB rocm_cmake_components ${onnxruntime_ROCM_HOME}/lib/cmake/*)
|
||||
list(APPEND CMAKE_PREFIX_PATH ${rocm_cmake_components})
|
||||
|
||||
enable_language(HIP)
|
||||
# NOTE: Flags -mllvm -amdgpu-early-inline-all=true are critical for gpu kernel code performance. -mllvm passes the
|
||||
# next flag to underlying LLVM instead of clang and -amdgpu-early-inline-all=true is the optimization flag for LLVM.
|
||||
# With CMake's enable_language(HIP), additional flags including the proceeding one are propagated from
|
||||
# hip-lang::device library. But in some weird cases, the hip-lang::device target may not be properly configured, for
|
||||
# example, the CMAKE_PREFIX_PATH might be improperly configured.
|
||||
if(NOT DEFINED _CMAKE_HIP_DEVICE_RUNTIME_TARGET)
|
||||
message(FATAL_ERROR "HIP Language is not properly configured.")
|
||||
endif()
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:HIP>:SHELL:-x hip>")
|
||||
|
||||
if (NOT onnxruntime_HIPIFY_PERL)
|
||||
set(onnxruntime_HIPIFY_PERL ${onnxruntime_ROCM_HOME}/hip/bin/hipify-perl)
|
||||
endif()
|
||||
|
||||
# NOTE: HIP language is added in 3.21 and there are bugs before 3.23.1
|
||||
cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
|
||||
find_package(HIP)
|
||||
enable_language(HIP)
|
||||
|
||||
# replicate strategy used by pytorch to get ROCM_VERSION
|
||||
# https://github.com/pytorch/pytorch/blob/8eb21488fdcdb8b0e6fa2e46179b5fa6c42e75af/cmake/public/LoadHIP.cmake#L153-L173
|
||||
file(READ "${ROCM_PATH}/.info/version-dev" ROCM_VERSION_DEV_RAW)
|
||||
|
|
|
|||
2
cmake/external/composable_kernel.cmake
vendored
2
cmake/external/composable_kernel.cmake
vendored
|
|
@ -1,5 +1,5 @@
|
|||
set(composable_kernel_URL https://github.com/ROCmSoftwarePlatform/composable_kernel.git)
|
||||
set(composable_kernel_TAG e1a3fff67510be2af023b31587e411230b994631) # 2022-08-25 07:43:43 +0800
|
||||
set(composable_kernel_TAG 8ee36118be9b19b15c2471bffeeeb624afb14044) # 2022-11-01 00:24:25 +0800
|
||||
|
||||
set(PATCH ${PROJECT_SOURCE_DIR}/patches/composable_kernel/Fix_Clang_Build.patch)
|
||||
|
||||
|
|
|
|||
|
|
@ -40,12 +40,11 @@ target_link_libraries(kernel_explorer
|
|||
# Currently we shall not use composablekernels::device_operations, the target includes all conv dependencies, which
|
||||
# are extremely slow to compile. Instead, we only link all gemm related objects. See the following link on updating.
|
||||
# https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/85978e0201/library/src/tensor_operation_instance/gpu/CMakeLists.txt#L33-L54
|
||||
device_gemm_instance
|
||||
${HIP_LIB})
|
||||
device_gemm_instance)
|
||||
target_compile_definitions(kernel_explorer
|
||||
PUBLIC ROCM_USE_FLOAT16
|
||||
PRIVATE $<TARGET_PROPERTY:onnxruntime_pybind11_state,COMPILE_DEFINITIONS>)
|
||||
target_compile_options(kernel_explorer PRIVATE -Wno-sign-compare -D__HIP_PLATFORM_HCC__=1)
|
||||
target_compile_options(kernel_explorer PRIVATE -Wno-sign-compare -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1)
|
||||
|
||||
add_dependencies(kernel_explorer onnxruntime_pybind11_state)
|
||||
|
||||
|
|
|
|||
|
|
@ -1264,18 +1264,15 @@ if (onnxruntime_USE_ROCM)
|
|||
add_definitions(-DUSE_ROCM=1)
|
||||
include(onnxruntime_rocm_hipify.cmake)
|
||||
|
||||
# Add search paths for default hip installation
|
||||
list(APPEND CMAKE_PREFIX_PATH ${onnxruntime_ROCM_HOME} ${onnxruntime_ROCM_HOME}/hip ${onnxruntime_ROCM_HOME}/hcc ${onnxruntime_ROCM_HOME}/miopen ${onnxruntime_ROCM_HOME}/hiprand ${onnxruntime_ROCM_HOME}/rocrand)
|
||||
list(APPEND CMAKE_PREFIX_PATH ${onnxruntime_ROCM_HOME}/rccl ${onnxruntime_ROCM_HOME}/roctracer)
|
||||
|
||||
set(CMAKE_MODULE_PATH "${onnxruntime_ROCM_HOME}/hip/cmake" ${CMAKE_MODULE_PATH})
|
||||
find_package(HIP)
|
||||
find_package(hiprand REQUIRED)
|
||||
find_library(HIP_LIB amdhip64 REQUIRED)
|
||||
find_library(ROC_BLAS rocblas REQUIRED)
|
||||
find_library(MIOPEN_LIB MIOpen REQUIRED)
|
||||
find_package(rocblas REQUIRED)
|
||||
find_package(MIOpen REQUIRED)
|
||||
find_library(RCCL_LIB rccl REQUIRED)
|
||||
find_library(ROCTRACER_LIB roctracer64 REQUIRED)
|
||||
set(ONNXRUNTIME_ROCM_LIBS ${HIP_LIB} ${ROC_BLAS} ${MIOPEN_LIB} ${RCCL_LIB} ${ROCTRACER_LIB})
|
||||
set(ONNXRUNTIME_ROCM_LIBS roc::rocblas MIOpen ${RCCL_LIB} ${ROCTRACER_LIB})
|
||||
|
||||
file(GLOB_RECURSE onnxruntime_providers_rocm_cc_srcs CONFIGURE_DEPENDS
|
||||
"${ONNXRUNTIME_ROOT}/core/providers/rocm/*.h"
|
||||
|
|
@ -1342,30 +1339,12 @@ if (onnxruntime_USE_ROCM)
|
|||
list(APPEND onnxruntime_providers_rocm_src ${onnxruntime_rocm_generated_training_ops_cc_srcs} ${onnxruntime_rocm_generated_training_ops_cu_srcs})
|
||||
endif()
|
||||
|
||||
set(HIP_CXX_FLAGS -fPIC)
|
||||
list(APPEND HIP_CXX_FLAGS -std=c++17)
|
||||
|
||||
if(CMAKE_BUILD_TYPE MATCHES Debug)
|
||||
list(APPEND HIP_CXX_FLAGS -g)
|
||||
#list(APPEND HIP_CXX_FLAGS -O0)
|
||||
endif(CMAKE_BUILD_TYPE MATCHES Debug)
|
||||
|
||||
list(APPEND HIP_CLANG_FLAGS ${HIP_CXX_FLAGS})
|
||||
list(APPEND HIP_CLANG_FLAGS ${CMAKE_HIP_FLAGS})
|
||||
|
||||
# Generate GPU code during compilation
|
||||
list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc)
|
||||
|
||||
# Generate GPU code
|
||||
foreach(HIP_ARCH ${CMAKE_HIP_ARCHITECTURES})
|
||||
list(APPEND HIP_CLANG_FLAGS --offload-arch=${HIP_ARCH})
|
||||
endforeach()
|
||||
|
||||
auto_set_source_files_hip_language(${onnxruntime_providers_rocm_src})
|
||||
onnxruntime_add_shared_library_module(onnxruntime_providers_rocm ${onnxruntime_providers_rocm_src})
|
||||
target_compile_options(onnxruntime_providers_rocm PRIVATE -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1)
|
||||
|
||||
if(NOT MSVC)
|
||||
target_compile_options(onnxruntime_providers_rocm PRIVATE -Wno-sign-compare -D__HIP_PLATFORM_HCC__=1)
|
||||
target_compile_options(onnxruntime_providers_rocm PRIVATE -Wno-sign-compare)
|
||||
target_compile_options(onnxruntime_providers_rocm PRIVATE -Wno-unused-parameter)
|
||||
target_compile_options(onnxruntime_providers_rocm PRIVATE -Wno-undefined-var-template)
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -117,7 +117,7 @@ if(onnxruntime_USE_CANN)
|
|||
target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_CANN_HOME}/include)
|
||||
endif()
|
||||
if(onnxruntime_USE_ROCM)
|
||||
target_compile_options(onnxruntime_pybind11_state PUBLIC -D__HIP_PLATFORM_HCC__=1)
|
||||
target_compile_options(onnxruntime_pybind11_state PUBLIC -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1)
|
||||
target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_ROCM_HOME}/hipfft/include ${onnxruntime_ROCM_HOME}/include ${onnxruntime_ROCM_HOME}/hiprand/include ${onnxruntime_ROCM_HOME}/rocrand/include ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining)
|
||||
endif()
|
||||
if (onnxruntime_USE_NCCL)
|
||||
|
|
|
|||
|
|
@ -47,7 +47,7 @@ if (onnxruntime_USE_CUDA)
|
|||
target_include_directories(onnxruntime_session PRIVATE ${onnxruntime_CUDNN_HOME}/include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
|
||||
endif()
|
||||
if (onnxruntime_USE_ROCM)
|
||||
target_compile_options(onnxruntime_session PRIVATE -Wno-sign-compare -D__HIP_PLATFORM_HCC__=1)
|
||||
target_compile_options(onnxruntime_session PRIVATE -Wno-sign-compare -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1)
|
||||
target_include_directories(onnxruntime_session PRIVATE ${onnxruntime_ROCM_HOME}/hipfft/include ${onnxruntime_ROCM_HOME}/include ${onnxruntime_ROCM_HOME}/hipcub/include ${onnxruntime_ROCM_HOME}/hiprand/include ${onnxruntime_ROCM_HOME}/rocrand/include)
|
||||
# ROCM provider sources are generated, need to add include directory for generated headers
|
||||
target_include_directories(onnxruntime_session PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining)
|
||||
|
|
|
|||
|
|
@ -100,7 +100,7 @@ if (onnxruntime_BUILD_UNIT_TESTS)
|
|||
endif()
|
||||
|
||||
if (onnxruntime_USE_ROCM)
|
||||
target_compile_options(onnxruntime_training_runner PUBLIC -D__HIP_PLATFORM_HCC__=1)
|
||||
target_compile_options(onnxruntime_training_runner PUBLIC -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1)
|
||||
endif()
|
||||
|
||||
set_target_properties(onnxruntime_training_runner PROPERTIES FOLDER "ONNXRuntimeTest")
|
||||
|
|
|
|||
|
|
@ -586,12 +586,6 @@ if(onnxruntime_USE_XNNPACK)
|
|||
list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_xnnpack)
|
||||
endif()
|
||||
|
||||
if(onnxruntime_USE_ROCM)
|
||||
find_library(HIP_LIB amdhip64 REQUIRED)
|
||||
list(APPEND onnxruntime_test_providers_libs ${HIP_LIB})
|
||||
endif()
|
||||
|
||||
|
||||
if(WIN32)
|
||||
if (onnxruntime_USE_TVM)
|
||||
list(APPEND disabled_warnings ${DISABLED_WARNINGS_FOR_TVM})
|
||||
|
|
@ -754,7 +748,7 @@ if (onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS)
|
|||
target_link_libraries(onnxruntime_test_all PRIVATE onnxruntime_language_interop onnxruntime_pyop)
|
||||
endif()
|
||||
if (onnxruntime_USE_ROCM)
|
||||
target_compile_options(onnxruntime_test_all PRIVATE -D__HIP_PLATFORM_HCC__=1)
|
||||
target_compile_options(onnxruntime_test_all PRIVATE -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1)
|
||||
target_include_directories(onnxruntime_test_all PRIVATE ${onnxruntime_ROCM_HOME}/hipfft/include ${onnxruntime_ROCM_HOME}/include ${onnxruntime_ROCM_HOME}/hiprand/include ${onnxruntime_ROCM_HOME}/rocrand/include ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining)
|
||||
endif()
|
||||
if (onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
||||
index 3e1174ec..65648cb7 100644
|
||||
index 5655ba17..1252d1ea 100644
|
||||
--- a/CMakeLists.txt
|
||||
+++ b/CMakeLists.txt
|
||||
@@ -1,7 +1,7 @@
|
||||
|
|
@ -11,7 +11,7 @@ index 3e1174ec..65648cb7 100644
|
|||
|
||||
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
|
||||
|
||||
@@ -36,27 +36,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
@@ -41,27 +41,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_CXX_EXTENSIONS OFF)
|
||||
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
|
||||
|
||||
|
|
@ -39,7 +39,16 @@ index 3e1174ec..65648cb7 100644
|
|||
## HIP
|
||||
find_package(HIP REQUIRED)
|
||||
# Override HIP version in config.h, if necessary.
|
||||
@@ -245,9 +224,6 @@ rocm_package_setup_component(tests
|
||||
@@ -83,8 +62,6 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
|
||||
message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}")
|
||||
endif()
|
||||
message(STATUS "Build with HIP ${HIP_VERSION}")
|
||||
-link_libraries(hip::device)
|
||||
-add_compile_definitions(__HIP_PLATFORM_HCC__=1)
|
||||
|
||||
## tidy
|
||||
include(EnableCompilerWarnings)
|
||||
@@ -263,9 +240,6 @@ rocm_package_setup_component(tests
|
||||
)
|
||||
|
||||
add_subdirectory(library)
|
||||
|
|
@ -49,7 +58,7 @@ index 3e1174ec..65648cb7 100644
|
|||
|
||||
#Create an interface target for the include only files and call it "composablekernels"
|
||||
include(CMakePackageConfigHelpers)
|
||||
@@ -273,11 +249,3 @@ rocm_install(FILES
|
||||
@@ -291,11 +265,3 @@ rocm_install(FILES
|
||||
|
||||
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
|
||||
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
|
||||
|
|
@ -62,7 +71,7 @@ index 3e1174ec..65648cb7 100644
|
|||
- HEADER_ONLY
|
||||
-)
|
||||
diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
|
||||
index fcaec592..8ea06421 100644
|
||||
index 92018aac..2ada620c 100644
|
||||
--- a/include/ck/ck.hpp
|
||||
+++ b/include/ck/ck.hpp
|
||||
@@ -126,7 +126,9 @@
|
||||
|
|
@ -76,23 +85,16 @@ index fcaec592..8ea06421 100644
|
|||
|
||||
// hack: have underlying assumption that need to be satsified, otherwise it's a bug
|
||||
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
|
||||
index 6f3f900b..594d983d 100644
|
||||
index c206c4dc..e45fac9d 100644
|
||||
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
|
||||
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
|
||||
@@ -1,5 +1,6 @@
|
||||
@@ -1,7 +1,9 @@
|
||||
function(add_instance_library INSTANCE_NAME)
|
||||
message("adding instance ${INSTANCE_NAME}")
|
||||
+ set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
|
||||
add_library(${INSTANCE_NAME} OBJECT ${ARGN})
|
||||
target_compile_features(${INSTANCE_NAME} PUBLIC)
|
||||
+ target_compile_definitions(${INSTANCE_NAME} PRIVATE "__HIP_PLATFORM_AMD__=1" "__HIP_PLATFORM_HCC__=1")
|
||||
set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
|
||||
index 5dc20332..78eedca5 100644
|
||||
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
|
||||
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
|
||||
@@ -1,4 +1,4 @@
|
||||
-add_library(device_grouped_conv3d_fwd_instance
|
||||
+add_instance_library(device_grouped_conv3d_fwd_instance
|
||||
device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
|
||||
device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
|
||||
device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
|
||||
clang_tidy_check(${INSTANCE_NAME})
|
||||
endfunction(add_instance_library INSTANCE_NAME)
|
||||
|
|
|
|||
Loading…
Reference in a new issue