diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index b06d5533e4..3a46e14b38 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1869,6 +1869,9 @@ if (onnxruntime_USE_ROCM) message(FATAL_ERROR "ROCM does not support build with CUDA!") endif() + # NOTE: HIP language is added in 3.21 and there are bugs before 3.23.1 + cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) + set(ROCM_PATH ${onnxruntime_ROCM_HOME}) if (NOT CMAKE_HIP_COMPILER) @@ -1879,15 +1882,24 @@ if (onnxruntime_USE_ROCM) set(CMAKE_HIP_ARCHITECTURES "gfx906;gfx908;gfx90a;gfx1030") endif() + file(GLOB rocm_cmake_components ${onnxruntime_ROCM_HOME}/lib/cmake/*) + list(APPEND CMAKE_PREFIX_PATH ${rocm_cmake_components}) + + enable_language(HIP) + # NOTE: Flags -mllvm -amdgpu-early-inline-all=true are critical for gpu kernel code performance. -mllvm passes the + # next flag to underlying LLVM instead of clang and -amdgpu-early-inline-all=true is the optimization flag for LLVM. + # With CMake's enable_language(HIP), additional flags including the proceeding one are propagated from + # hip-lang::device library. But in some weird cases, the hip-lang::device target may not be properly configured, for + # example, the CMAKE_PREFIX_PATH might be improperly configured. + if(NOT DEFINED _CMAKE_HIP_DEVICE_RUNTIME_TARGET) + message(FATAL_ERROR "HIP Language is not properly configured.") + endif() + add_compile_options("$<$:SHELL:-x hip>") + if (NOT onnxruntime_HIPIFY_PERL) set(onnxruntime_HIPIFY_PERL ${onnxruntime_ROCM_HOME}/hip/bin/hipify-perl) endif() - # NOTE: HIP language is added in 3.21 and there are bugs before 3.23.1 - cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) - find_package(HIP) - enable_language(HIP) - # replicate strategy used by pytorch to get ROCM_VERSION # https://github.com/pytorch/pytorch/blob/8eb21488fdcdb8b0e6fa2e46179b5fa6c42e75af/cmake/public/LoadHIP.cmake#L153-L173 file(READ "${ROCM_PATH}/.info/version-dev" ROCM_VERSION_DEV_RAW) diff --git a/cmake/external/composable_kernel.cmake b/cmake/external/composable_kernel.cmake index defebb6ae4..fc20520b83 100644 --- a/cmake/external/composable_kernel.cmake +++ b/cmake/external/composable_kernel.cmake @@ -1,5 +1,5 @@ set(composable_kernel_URL https://github.com/ROCmSoftwarePlatform/composable_kernel.git) -set(composable_kernel_TAG e1a3fff67510be2af023b31587e411230b994631) # 2022-08-25 07:43:43 +0800 +set(composable_kernel_TAG 8ee36118be9b19b15c2471bffeeeb624afb14044) # 2022-11-01 00:24:25 +0800 set(PATCH ${PROJECT_SOURCE_DIR}/patches/composable_kernel/Fix_Clang_Build.patch) diff --git a/cmake/onnxruntime_kernel_explorer.cmake b/cmake/onnxruntime_kernel_explorer.cmake index 87fff40c79..35fe476792 100644 --- a/cmake/onnxruntime_kernel_explorer.cmake +++ b/cmake/onnxruntime_kernel_explorer.cmake @@ -40,12 +40,11 @@ target_link_libraries(kernel_explorer # Currently we shall not use composablekernels::device_operations, the target includes all conv dependencies, which # are extremely slow to compile. Instead, we only link all gemm related objects. See the following link on updating. # https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/85978e0201/library/src/tensor_operation_instance/gpu/CMakeLists.txt#L33-L54 - device_gemm_instance - ${HIP_LIB}) + device_gemm_instance) target_compile_definitions(kernel_explorer PUBLIC ROCM_USE_FLOAT16 PRIVATE $) -target_compile_options(kernel_explorer PRIVATE -Wno-sign-compare -D__HIP_PLATFORM_HCC__=1) +target_compile_options(kernel_explorer PRIVATE -Wno-sign-compare -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1) add_dependencies(kernel_explorer onnxruntime_pybind11_state) diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index d6088d2215..4c123912dd 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -1264,18 +1264,15 @@ if (onnxruntime_USE_ROCM) add_definitions(-DUSE_ROCM=1) include(onnxruntime_rocm_hipify.cmake) - # Add search paths for default hip installation - list(APPEND CMAKE_PREFIX_PATH ${onnxruntime_ROCM_HOME} ${onnxruntime_ROCM_HOME}/hip ${onnxruntime_ROCM_HOME}/hcc ${onnxruntime_ROCM_HOME}/miopen ${onnxruntime_ROCM_HOME}/hiprand ${onnxruntime_ROCM_HOME}/rocrand) + list(APPEND CMAKE_PREFIX_PATH ${onnxruntime_ROCM_HOME}/rccl ${onnxruntime_ROCM_HOME}/roctracer) - set(CMAKE_MODULE_PATH "${onnxruntime_ROCM_HOME}/hip/cmake" ${CMAKE_MODULE_PATH}) find_package(HIP) find_package(hiprand REQUIRED) - find_library(HIP_LIB amdhip64 REQUIRED) - find_library(ROC_BLAS rocblas REQUIRED) - find_library(MIOPEN_LIB MIOpen REQUIRED) + find_package(rocblas REQUIRED) + find_package(MIOpen REQUIRED) find_library(RCCL_LIB rccl REQUIRED) find_library(ROCTRACER_LIB roctracer64 REQUIRED) - set(ONNXRUNTIME_ROCM_LIBS ${HIP_LIB} ${ROC_BLAS} ${MIOPEN_LIB} ${RCCL_LIB} ${ROCTRACER_LIB}) + set(ONNXRUNTIME_ROCM_LIBS roc::rocblas MIOpen ${RCCL_LIB} ${ROCTRACER_LIB}) file(GLOB_RECURSE onnxruntime_providers_rocm_cc_srcs CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/core/providers/rocm/*.h" @@ -1342,30 +1339,12 @@ if (onnxruntime_USE_ROCM) list(APPEND onnxruntime_providers_rocm_src ${onnxruntime_rocm_generated_training_ops_cc_srcs} ${onnxruntime_rocm_generated_training_ops_cu_srcs}) endif() - set(HIP_CXX_FLAGS -fPIC) - list(APPEND HIP_CXX_FLAGS -std=c++17) - - if(CMAKE_BUILD_TYPE MATCHES Debug) - list(APPEND HIP_CXX_FLAGS -g) - #list(APPEND HIP_CXX_FLAGS -O0) - endif(CMAKE_BUILD_TYPE MATCHES Debug) - - list(APPEND HIP_CLANG_FLAGS ${HIP_CXX_FLAGS}) - list(APPEND HIP_CLANG_FLAGS ${CMAKE_HIP_FLAGS}) - - # Generate GPU code during compilation - list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc) - - # Generate GPU code - foreach(HIP_ARCH ${CMAKE_HIP_ARCHITECTURES}) - list(APPEND HIP_CLANG_FLAGS --offload-arch=${HIP_ARCH}) - endforeach() - auto_set_source_files_hip_language(${onnxruntime_providers_rocm_src}) onnxruntime_add_shared_library_module(onnxruntime_providers_rocm ${onnxruntime_providers_rocm_src}) + target_compile_options(onnxruntime_providers_rocm PRIVATE -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1) if(NOT MSVC) - target_compile_options(onnxruntime_providers_rocm PRIVATE -Wno-sign-compare -D__HIP_PLATFORM_HCC__=1) + target_compile_options(onnxruntime_providers_rocm PRIVATE -Wno-sign-compare) target_compile_options(onnxruntime_providers_rocm PRIVATE -Wno-unused-parameter) target_compile_options(onnxruntime_providers_rocm PRIVATE -Wno-undefined-var-template) endif() diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index 4a77e462f9..ee41f7fe46 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -117,7 +117,7 @@ if(onnxruntime_USE_CANN) target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_CANN_HOME}/include) endif() if(onnxruntime_USE_ROCM) - target_compile_options(onnxruntime_pybind11_state PUBLIC -D__HIP_PLATFORM_HCC__=1) + target_compile_options(onnxruntime_pybind11_state PUBLIC -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1) target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_ROCM_HOME}/hipfft/include ${onnxruntime_ROCM_HOME}/include ${onnxruntime_ROCM_HOME}/hiprand/include ${onnxruntime_ROCM_HOME}/rocrand/include ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining) endif() if (onnxruntime_USE_NCCL) diff --git a/cmake/onnxruntime_session.cmake b/cmake/onnxruntime_session.cmake index 83129616c8..de285dc8e3 100644 --- a/cmake/onnxruntime_session.cmake +++ b/cmake/onnxruntime_session.cmake @@ -47,7 +47,7 @@ if (onnxruntime_USE_CUDA) target_include_directories(onnxruntime_session PRIVATE ${onnxruntime_CUDNN_HOME}/include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) endif() if (onnxruntime_USE_ROCM) - target_compile_options(onnxruntime_session PRIVATE -Wno-sign-compare -D__HIP_PLATFORM_HCC__=1) + target_compile_options(onnxruntime_session PRIVATE -Wno-sign-compare -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1) target_include_directories(onnxruntime_session PRIVATE ${onnxruntime_ROCM_HOME}/hipfft/include ${onnxruntime_ROCM_HOME}/include ${onnxruntime_ROCM_HOME}/hipcub/include ${onnxruntime_ROCM_HOME}/hiprand/include ${onnxruntime_ROCM_HOME}/rocrand/include) # ROCM provider sources are generated, need to add include directory for generated headers target_include_directories(onnxruntime_session PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining) diff --git a/cmake/onnxruntime_training.cmake b/cmake/onnxruntime_training.cmake index 2a43f9656f..7b61779d6a 100644 --- a/cmake/onnxruntime_training.cmake +++ b/cmake/onnxruntime_training.cmake @@ -100,7 +100,7 @@ if (onnxruntime_BUILD_UNIT_TESTS) endif() if (onnxruntime_USE_ROCM) - target_compile_options(onnxruntime_training_runner PUBLIC -D__HIP_PLATFORM_HCC__=1) + target_compile_options(onnxruntime_training_runner PUBLIC -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1) endif() set_target_properties(onnxruntime_training_runner PROPERTIES FOLDER "ONNXRuntimeTest") diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 3f82ba9df8..fb52144142 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -586,12 +586,6 @@ if(onnxruntime_USE_XNNPACK) list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_xnnpack) endif() -if(onnxruntime_USE_ROCM) - find_library(HIP_LIB amdhip64 REQUIRED) - list(APPEND onnxruntime_test_providers_libs ${HIP_LIB}) -endif() - - if(WIN32) if (onnxruntime_USE_TVM) list(APPEND disabled_warnings ${DISABLED_WARNINGS_FOR_TVM}) @@ -754,7 +748,7 @@ if (onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS) target_link_libraries(onnxruntime_test_all PRIVATE onnxruntime_language_interop onnxruntime_pyop) endif() if (onnxruntime_USE_ROCM) - target_compile_options(onnxruntime_test_all PRIVATE -D__HIP_PLATFORM_HCC__=1) + target_compile_options(onnxruntime_test_all PRIVATE -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1) target_include_directories(onnxruntime_test_all PRIVATE ${onnxruntime_ROCM_HOME}/hipfft/include ${onnxruntime_ROCM_HOME}/include ${onnxruntime_ROCM_HOME}/hiprand/include ${onnxruntime_ROCM_HOME}/rocrand/include ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining) endif() if (onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) diff --git a/cmake/patches/composable_kernel/Fix_Clang_Build.patch b/cmake/patches/composable_kernel/Fix_Clang_Build.patch index d4f0b3e996..937a739c7b 100644 --- a/cmake/patches/composable_kernel/Fix_Clang_Build.patch +++ b/cmake/patches/composable_kernel/Fix_Clang_Build.patch @@ -1,5 +1,5 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index 3e1174ec..65648cb7 100644 +index 5655ba17..1252d1ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ @@ -11,7 +11,7 @@ index 3e1174ec..65648cb7 100644 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") -@@ -36,27 +36,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) +@@ -41,27 +41,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}") @@ -39,7 +39,16 @@ index 3e1174ec..65648cb7 100644 ## HIP find_package(HIP REQUIRED) # Override HIP version in config.h, if necessary. -@@ -245,9 +224,6 @@ rocm_package_setup_component(tests +@@ -83,8 +62,6 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH ) + message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}") + endif() + message(STATUS "Build with HIP ${HIP_VERSION}") +-link_libraries(hip::device) +-add_compile_definitions(__HIP_PLATFORM_HCC__=1) + + ## tidy + include(EnableCompilerWarnings) +@@ -263,9 +240,6 @@ rocm_package_setup_component(tests ) add_subdirectory(library) @@ -49,7 +58,7 @@ index 3e1174ec..65648cb7 100644 #Create an interface target for the include only files and call it "composablekernels" include(CMakePackageConfigHelpers) -@@ -273,11 +249,3 @@ rocm_install(FILES +@@ -291,11 +265,3 @@ rocm_install(FILES set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") set(CPACK_RPM_PACKAGE_LICENSE "MIT") @@ -62,7 +71,7 @@ index 3e1174ec..65648cb7 100644 - HEADER_ONLY -) diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp -index fcaec592..8ea06421 100644 +index 92018aac..2ada620c 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -126,7 +126,9 @@ @@ -76,23 +85,16 @@ index fcaec592..8ea06421 100644 // hack: have underlying assumption that need to be satsified, otherwise it's a bug diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt -index 6f3f900b..594d983d 100644 +index c206c4dc..e45fac9d 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt -@@ -1,5 +1,6 @@ +@@ -1,7 +1,9 @@ function(add_instance_library INSTANCE_NAME) message("adding instance ${INSTANCE_NAME}") + set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP) add_library(${INSTANCE_NAME} OBJECT ${ARGN}) target_compile_features(${INSTANCE_NAME} PUBLIC) ++ target_compile_definitions(${INSTANCE_NAME} PRIVATE "__HIP_PLATFORM_AMD__=1" "__HIP_PLATFORM_HCC__=1") set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) -diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt -index 5dc20332..78eedca5 100644 ---- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt -+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt -@@ -1,4 +1,4 @@ --add_library(device_grouped_conv3d_fwd_instance -+add_instance_library(device_grouped_conv3d_fwd_instance - device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp - device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp - device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp + clang_tidy_check(${INSTANCE_NAME}) + endfunction(add_instance_library INSTANCE_NAME)