onnxruntime/cmake/onnxruntime_providers_rocm.cmake
Changming Sun 88676e62b9
Remove nsync (#20413)
### Description
1. Remove the onnxruntime::OrtMutex class and replace it with
~absl::Mutex~ std::mutex.
2. After this change, most source files will not include <Windows.h>
indirectly.


### Motivation and Context
To reduce the number of deps we have, and address some GitHub issues
that are related to building ONNX Runtime from source.
In PR #3000 , I added a custom implementation of std::mutex . It was
mainly because at that time std::mutex's default constructor was not
trivial on Windows. If you had such a mutex as a global var, it could
not be initialized at compile time. Then VC++ team fixed this issue.
Therefore we don't need this custom implementation anymore.

This PR also removes nsync. I ran several model tests on Linux. I
didn't see any perf difference.
This PR also reverts PR #21005 , which is no longer needed since conda
has updated its msvc runtime DLL.

This PR unblocks #22173 and resolves #22092 . We have a lot of open
issues with nsync. This PR can resolve all of them.
2024-10-21 15:32:14 -07:00

236 lines
12 KiB
CMake

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# Define USE_ROCM=1 for everything compiled from this directory scope.
add_definitions(-DUSE_ROCM=1)

# NOTE(review): hipify() and auto_set_source_files_hip_language(), both used
# further down, are presumably provided by this include -- confirm.
include(onnxruntime_rocm_hipify.cmake)

# Let find_package() look inside the ROCm installation root.
list(APPEND CMAKE_PREFIX_PATH ${onnxruntime_ROCM_HOME})

# HIP is looked up without REQUIRED; the remaining ROCm packages are mandatory
# and abort configuration when missing.
find_package(HIP)
foreach(rocm_required_package hiprand hipblas MIOpen hipfft)
  find_package(${rocm_required_package} REQUIRED)
endforeach()
# MIOpen version
# Locate MIOpen's version.h, parse its MAJOR/MINOR/PATCH macros and publish the
# combined value (major*10000 + minor*100 + patch) as the MIOPEN_VERSION
# preprocessor definition.
# The MIOPEN_PATH environment variable, when defined, replaces
# onnxruntime_ROCM_HOME as the search root.
if(NOT DEFINED ENV{MIOPEN_PATH})
  set(MIOPEN_PATH ${onnxruntime_ROCM_HOME})
else()
  set(MIOPEN_PATH $ENV{MIOPEN_PATH})
endif()

find_path(MIOPEN_VERSION_H_PATH
  NAMES version.h
  HINTS
    ${MIOPEN_PATH}/include/miopen
    ${MIOPEN_PATH}/miopen/include
)
# find_path() stores "<var>-NOTFOUND" in the result variable on failure, so the
# variable itself has to be tested. The literal token
# `MIOPEN_VERSION_H_PATH-NOTFOUND` is a constant that if() always treats as
# false, which would make this check unreachable.
if(NOT MIOPEN_VERSION_H_PATH)
  message(FATAL_ERROR "miopen version.h not found")
endif()
message(STATUS "Found miopen version.h at ${MIOPEN_VERSION_H_PATH}")

file(READ "${MIOPEN_VERSION_H_PATH}/version.h" MIOPEN_HEADER_CONTENTS)

# Extract MIOPEN_VERSION_MAJOR / _MINOR / _PATCH. The first capture group of the
# regex holds the numeric value of the matching #define.
foreach(miopen_version_part MAJOR MINOR PATCH)
  string(REGEX MATCH "define MIOPEN_VERSION_${miopen_version_part} * +([0-9]+)"
    miopen_version_define "${MIOPEN_HEADER_CONTENTS}")
  set(MIOPEN_VERSION_${miopen_version_part} "${CMAKE_MATCH_1}")
  # Fail with a clear message instead of letting math(EXPR) choke on an empty operand.
  if(miopen_version_define STREQUAL "")
    message(FATAL_ERROR
      "Could not parse MIOPEN_VERSION_${miopen_version_part} from ${MIOPEN_VERSION_H_PATH}/version.h")
  endif()
endforeach()

set(MIOPEN_VERSION_DEV "${MIOPEN_VERSION_MAJOR}.${MIOPEN_VERSION_MINOR}.${MIOPEN_VERSION_PATCH}")
math(EXPR MIOPEN_VERSION_DEV_INT "(${MIOPEN_VERSION_MAJOR}*10000) + (${MIOPEN_VERSION_MINOR}*100) + ${MIOPEN_VERSION_PATCH}")
message("MIOPEN_VERSION_DEV: ${MIOPEN_VERSION_DEV}")
message("MIOPEN_VERSION_DEV_INT: ${MIOPEN_VERSION_DEV_INT}")
add_definitions(-DMIOPEN_VERSION=${MIOPEN_VERSION_DEV_INT})
# Remaining mandatory ROCm pieces: RCCL and roctracer are located as plain
# libraries, rocm_smi through its package.
find_library(RCCL_LIB rccl REQUIRED)
find_library(ROCTRACER_LIB roctracer64 REQUIRED)
find_package(rocm_smi REQUIRED)

# Full set of ROCm libraries that onnxruntime_providers_rocm links against.
set(ONNXRUNTIME_ROCM_LIBS
  roc::hipblas
  MIOpen
  hip::hipfft
  ${ROCM_SMI_LIBRARY}
  ${RCCL_LIB}
  ${ROCTRACER_LIB}
)

# rocm_smi is consumed through variables, so its include and library
# directories are added at directory scope.
include_directories(${ROCM_SMI_INCLUDE_DIR})
link_directories(${ROCM_SMI_LIB_DIR})
# Hand-written ROCm provider headers and C++ sources.
file(GLOB_RECURSE onnxruntime_providers_rocm_cc_srcs CONFIGURE_DEPENDS
  "${ONNXRUNTIME_ROOT}/core/providers/rocm/*.h"
  "${ONNXRUNTIME_ROOT}/core/providers/rocm/*.cc"
)

# The shared_library files are in a separate list since they use precompiled headers, and the above files have them disabled.
file(GLOB_RECURSE onnxruntime_providers_rocm_shared_srcs CONFIGURE_DEPENDS
  "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h"
  "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc"
)

# Hand-written ROCm device sources.
file(GLOB_RECURSE onnxruntime_providers_rocm_cu_srcs CONFIGURE_DEPENDS
  "${ONNXRUNTIME_ROOT}/core/providers/rocm/*.cu"
  "${ONNXRUNTIME_ROOT}/core/providers/rocm/*.cuh"
)

# NOTE(review): hipify() is defined outside this file; it appears to fill the
# last two arguments with the lists of generated .cc / .cu files -- confirm.
hipify(
  "onnxruntime/core/providers"
  provider_excluded_files
  onnxruntime_providers_rocm_generated_cc_srcs
  onnxruntime_providers_rocm_generated_cu_srcs
)

# IDE folder layout for the hand-written sources.
source_group(TREE ${ONNXRUNTIME_ROOT}/core
  FILES
    ${onnxruntime_providers_rocm_cc_srcs}
    ${onnxruntime_providers_rocm_shared_srcs}
    ${onnxruntime_providers_rocm_cu_srcs}
)

# Master source list of the provider: hand-written files first, generated ones after.
set(onnxruntime_providers_rocm_src
  ${onnxruntime_providers_rocm_cc_srcs}
  ${onnxruntime_providers_rocm_shared_srcs}
  ${onnxruntime_providers_rocm_cu_srcs}
  ${onnxruntime_providers_rocm_generated_cc_srcs}
  ${onnxruntime_providers_rocm_generated_cu_srcs}
)
# disable contrib ops conditionally
if(NOT onnxruntime_DISABLE_CONTRIB_OPS)
  hipify(
    "onnxruntime/contrib_ops"
    contrib_ops_excluded_files
    onnxruntime_rocm_generated_contrib_ops_cc_srcs
    onnxruntime_rocm_generated_contrib_ops_cu_srcs
  )

  # add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio
  # NOTE(review): onnxruntime_rocm_contrib_ops_cc_srcs / _cu_srcs are not assigned
  # in the visible part of this file -- confirm whether they are populated
  # elsewhere or are intentionally empty.
  source_group(TREE ${ONNXRUNTIME_ROOT}
    FILES
      ${onnxruntime_rocm_contrib_ops_cc_srcs}
      ${onnxruntime_rocm_contrib_ops_cu_srcs}
  )

  # Hand-written contrib sources first, hipify-generated ones after.
  list(APPEND onnxruntime_providers_rocm_src
    ${onnxruntime_rocm_contrib_ops_cc_srcs}
    ${onnxruntime_rocm_contrib_ops_cu_srcs}
    ${onnxruntime_rocm_generated_contrib_ops_cc_srcs}
    ${onnxruntime_rocm_generated_contrib_ops_cu_srcs}
  )
endif()
# Training ops are only added to the provider when they are enabled.
if(onnxruntime_ENABLE_TRAINING_OPS)
  # Hand-written ROCm training op headers and C++ sources.
  file(GLOB_RECURSE onnxruntime_rocm_training_ops_cc_srcs CONFIGURE_DEPENDS
    "${ORTTRAINING_SOURCE_DIR}/training_ops/rocm/*.h"
    "${ORTTRAINING_SOURCE_DIR}/training_ops/rocm/*.cc"
  )

  # Hand-written ROCm training op device sources.
  file(GLOB_RECURSE onnxruntime_rocm_training_ops_cu_srcs CONFIGURE_DEPENDS
    "${ORTTRAINING_SOURCE_DIR}/training_ops/rocm/*.cu"
    "${ORTTRAINING_SOURCE_DIR}/training_ops/rocm/*.cuh"
  )

  hipify(
    "orttraining/orttraining/training_ops"
    training_ops_excluded_files
    onnxruntime_rocm_generated_training_ops_cc_srcs
    onnxruntime_rocm_generated_training_ops_cu_srcs
  )

  # NCCL is not supported in Windows builds, so the generated collective
  # sources are dropped there and whenever NCCL is switched off.
  if(WIN32 OR NOT onnxruntime_USE_NCCL)
    list(REMOVE_ITEM onnxruntime_rocm_generated_training_ops_cc_srcs
      "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining/orttraining/training_ops/rocm/collective/nccl_common.cc"
      "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining/orttraining/training_ops/rocm/collective/nccl_kernels.cc"
      "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining/orttraining/training_ops/rocm/collective/megatron.cc"
    )
  endif()

  # IDE folder layout for the hand-written training sources.
  source_group(TREE ${ORTTRAINING_ROOT}
    FILES
      ${onnxruntime_rocm_training_ops_cc_srcs}
      ${onnxruntime_rocm_training_ops_cu_srcs}
  )

  # Hand-written training sources first, hipify-generated ones after.
  list(APPEND onnxruntime_providers_rocm_src
    ${onnxruntime_rocm_training_ops_cc_srcs}
    ${onnxruntime_rocm_training_ops_cu_srcs}
    ${onnxruntime_rocm_generated_training_ops_cc_srcs}
    ${onnxruntime_rocm_generated_training_ops_cu_srcs}
  )
endif()
# NOTE(review): judging by its name, this helper marks the collected sources as
# HIP-language files -- it is defined outside this file, confirm.
auto_set_source_files_hip_language(${onnxruntime_providers_rocm_src})

# Create the provider module from the collected source list.
onnxruntime_add_shared_library_module(onnxruntime_providers_rocm ${onnxruntime_providers_rocm_src})

# HIP platform selection macros, passed as raw compiler options.
target_compile_options(onnxruntime_providers_rocm PRIVATE
  -D__HIP_PLATFORM_AMD__=1
  -D__HIP_PLATFORM_HCC__=1
)

# Pass the hip_fatbin_insert file from the repository's cmake directory to the linker via -T.
target_link_options(onnxruntime_providers_rocm PRIVATE -T ${REPO_ROOT}/cmake/hip_fatbin_insert)

# Warnings silenced for this target on non-MSVC compilers.
if(NOT MSVC)
  target_compile_options(onnxruntime_providers_rocm PRIVATE
    -Wno-sign-compare
    -Wno-unused-parameter
    -Wno-undefined-var-template
  )
endif()

onnxruntime_add_include_to_target(onnxruntime_providers_rocm
  onnxruntime_common
  onnxruntime_framework
  onnx
  onnx_proto
  ${PROTOBUF_LIB}
  flatbuffers::flatbuffers
  Boost::mp11
  safeint_interface
)

# Training ops additionally need the training library (includes and link).
if(onnxruntime_ENABLE_TRAINING_OPS)
  onnxruntime_add_include_to_target(onnxruntime_providers_rocm onnxruntime_training)
  target_link_libraries(onnxruntime_providers_rocm PRIVATE onnxruntime_training)
  if(onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
    onnxruntime_add_include_to_target(onnxruntime_providers_rocm Python::Module)
  endif()
endif()
# Umbrella target that depends on every hipify-generated source list, so the
# generated files exist before the provider is built.
add_custom_target(generate_hipified_files
  DEPENDS
    ${onnxruntime_providers_rocm_generated_cc_srcs}
    ${onnxruntime_providers_rocm_generated_cu_srcs}
    ${onnxruntime_rocm_generated_contrib_ops_cc_srcs}
    ${onnxruntime_rocm_generated_contrib_ops_cu_srcs}
    ${onnxruntime_rocm_generated_training_ops_cc_srcs}
    ${onnxruntime_rocm_generated_training_ops_cu_srcs}
)

# Build-order dependencies only; linking is handled separately below.
add_dependencies(onnxruntime_providers_rocm
  generate_hipified_files
  onnxruntime_providers_shared
  ${onnxruntime_EXTERNAL_DEPENDENCIES}
)

target_link_libraries(onnxruntime_providers_rocm
  PRIVATE
    ${ONNXRUNTIME_ROCM_LIBS}
    ${ONNXRUNTIME_PROVIDERS_SHARED}
    ${ABSEIL_LIBS}
)
# All directories below are added as SYSTEM includes. The ROCm installation
# headers are PUBLIC, the source/build tree directories are PRIVATE.
target_include_directories(onnxruntime_providers_rocm SYSTEM
  PRIVATE
    ${ONNXRUNTIME_ROOT}
    ${CMAKE_CURRENT_BINARY_DIR}
    ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime
    ${eigen_INCLUDE_DIRS}
  PUBLIC
    ${onnxruntime_ROCM_HOME}/include
    ${onnxruntime_ROCM_HOME}/include/roctracer
)

# Link with the C++ linker driver and group the target under "ONNXRuntime" in IDEs.
set_target_properties(onnxruntime_providers_rocm PROPERTIES
  LINKER_LANGUAGE CXX
  FOLDER "ONNXRuntime"
)

target_compile_definitions(onnxruntime_providers_rocm PRIVATE HIPBLAS_V2)
# Extra include directories (and optionally MPI) for training builds.
if(onnxruntime_ENABLE_TRAINING)
  target_include_directories(onnxruntime_providers_rocm
    PRIVATE
      ${ORTTRAINING_ROOT}
      ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining
      ${MPI_CXX_INCLUDE_DIRS}
  )

  if(onnxruntime_USE_MPI)
    target_link_libraries(onnxruntime_providers_rocm
      PRIVATE
        ${MPI_LIBRARIES}
        ${MPI_CXX_LINK_FLAGS}
    )
  endif()

  # RCCL is enabled by default for ROCM builds, so there is no
  # onnxruntime_USE_NCCL-specific include/link setup (NCCL_INCLUDE_DIRS /
  # NCCL_LIBRARIES) here.
endif()
# Optional rocBLAS extension API: only adds preprocessor definitions.
if(onnxruntime_USE_ROCBLAS_EXTENSION_API)
  target_compile_definitions(onnxruntime_providers_rocm
    PRIVATE
      USE_ROCBLAS_EXTENSION_API
      ROCBLAS_NO_DEPRECATED_WARNINGS
      ROCBLAS_BETA_FEATURES_API
  )
endif()

# Optional hipBLASLt: the package becomes mandatory once the option is on.
if(onnxruntime_USE_HIPBLASLT)
  find_package(hipblaslt REQUIRED)
  target_link_libraries(onnxruntime_providers_rocm PRIVATE roc::hipblaslt)
  target_compile_definitions(onnxruntime_providers_rocm PRIVATE USE_HIPBLASLT)
endif()
# Optional Triton kernels.
if(onnxruntime_USE_TRITON_KERNEL)
  # compile triton kernel, generate .a and .h files
  include(onnxruntime_compile_triton_kernel.cmake)
  compile_triton_kernel(triton_kernel_obj_file triton_kernel_header_dir)

  # Make sure the kernels are built before the provider.
  add_dependencies(onnxruntime_providers_rocm onnxruntime_triton_kernel)

  target_compile_definitions(onnxruntime_providers_rocm PRIVATE USE_TRITON_KERNEL)
  target_include_directories(onnxruntime_providers_rocm PRIVATE ${triton_kernel_header_dir})

  # The kernel archive is wrapped in --whole-archive / --no-whole-archive so the
  # linker keeps all of its members.
  target_link_libraries(onnxruntime_providers_rocm
    PUBLIC
      -Wl,--whole-archive
      ${triton_kernel_obj_file}
      -Wl,--no-whole-archive
  )
endif()
# Optional composable_kernel (CK) support.
if(onnxruntime_USE_COMPOSABLE_KERNEL)
  include(composable_kernel)

  # Currently we shall not use composablekernels::device_operations, the target includes all conv dependencies, which
  # are extremely slow to compile. Instead, we only link all gemm related objects. See the following directory on
  # updating.
  # https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/library/src/tensor_operation_instance/gpu
  target_link_libraries(onnxruntime_providers_rocm
    PRIVATE
      onnxruntime_composable_kernel_includes
      device_gemm_instance
      device_gemm_add_fastgelu_instance
      device_gemm_fastgelu_instance
      device_gemm_splitk_instance
      device_gemm_streamk_instance
      device_batched_gemm_instance
      device_softmax_instance
  )
  target_compile_definitions(onnxruntime_providers_rocm PRIVATE USE_COMPOSABLE_KERNEL)

  # CK-tile support is an additional opt-in on top of composable_kernel.
  if(onnxruntime_USE_COMPOSABLE_KERNEL_CK_TILE)
    target_link_libraries(onnxruntime_providers_rocm PUBLIC onnxruntime_composable_kernel_fmha)
    target_compile_definitions(onnxruntime_providers_rocm PRIVATE USE_COMPOSABLE_KERNEL_CK_TILE)
  endif()
endif()
# Shared library exports: on UNIX the provider is linked with the ROCm version
# script plus --gc-sections; any other platform is rejected because no export
# specification exists for it.
if(UNIX)
  # APPEND_STRING concatenates without inserting a separator, so the appended
  # text starts with a space to keep it from fusing with any LINK_FLAGS value
  # that is already set on the target.
  set_property(TARGET onnxruntime_providers_rocm APPEND_STRING PROPERTY LINK_FLAGS " -Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/rocm/version_script.lds -Xlinker --gc-sections")
else()
  message(FATAL_ERROR "onnxruntime_providers_rocm unknown platform, need to specify shared library exports for it")
endif()
# Expose the ATen option to the provider's sources as the ENABLE_ATEN macro.
if(onnxruntime_ENABLE_ATEN)
  target_compile_definitions(onnxruntime_providers_rocm
    PRIVATE
      ENABLE_ATEN
  )
endif()
# Public headers of the ROCm provider; attached to the target so that
# install(TARGETS ... PUBLIC_HEADER) copies them.
file(GLOB ONNXRUNTIME_ROCM_PROVIDER_PUBLIC_HEADERS CONFIGURE_DEPENDS
  "${REPO_ROOT}/include/onnxruntime/core/providers/rocm/*.h"
)
set_target_properties(onnxruntime_providers_rocm
  PROPERTIES
    PUBLIC_HEADER "${ONNXRUNTIME_ROCM_PROVIDER_PUBLIC_HEADERS}"
)

# Install the provider binary and its public headers.
install(
  TARGETS onnxruntime_providers_rocm
  PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers/rocm
  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
)