2022-11-23 23:23:24 +00:00
|
|
|
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)

# cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW)

# CMP0025: report Apple's compiler ID as "AppleClang" instead of "Clang" under
# Xcode. Without it the C compiler is sometimes detected as "Clang" while the
# C++ one is detected as "AppleClang", which breaks compiler-ID checks.
# CMP0010: treat malformed variable references as hard errors.
cmake_policy(SET CMP0010 NEW)
cmake_policy(SET CMP0025 NEW)

# CMP0069: honor INTERPROCEDURAL_OPTIMIZATION (LTO) on compilers other than
# Intel.
cmake_policy(SET CMP0069 NEW)

# Enable the policy for CMake subprojects. protobuf currently causes issues
# set(CMAKE_POLICY_DEFAULT_CMP0069 NEW)

# CMP0092: do not inject /W3 into the default MSVC flags, so warning levels
# can be chosen per target. Optional, but harmless and possible on our
# Windows configs.
cmake_policy(SET CMP0092 NEW)
|
2019-08-30 14:09:30 +00:00
|
|
|
|
2024-03-18 21:48:14 +00:00
|
|
|
# Prohibit in-source builds: configuring inside the source tree would litter
# it with generated files. Quote both operands so the comparison is a plain
# string compare even if either variable were empty or matched another
# variable name.
if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}")
  message(FATAL_ERROR "In-source builds are not supported")
endif()
|
|
|
|
|
|
2017-09-26 15:45:37 +00:00
|
|
|
# ---[ Project and semantic versioning.
# Only C++ and C are declared here; other languages (e.g. CUDA) are enabled
# later, conditionally.
project(Torch LANGUAGES CXX C)
|
2016-12-05 00:42:00 +00:00
|
|
|
|
2020-03-25 20:43:00 +00:00
|
|
|
# CMake does not define a LINUX variable out of the box; derive one from
# CMAKE_SYSTEM_NAME so later option defaults can key off of it. Test the
# variable by name (not `${...}`) — expanding it inside if() would let the
# expanded value be dereferenced again under CMP0054's legacy behavior.
if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
  set(LINUX TRUE)
else()
  set(LINUX FALSE)
endif()

# Keep `make install` quiet; the installed file list is enormous.
set(CMAKE_INSTALL_MESSAGE NEVER)
|
|
|
|
|
|
2021-01-15 15:11:04 +00:00
|
|
|
# check and set CMAKE_CXX_STANDARD
# Warn if the user smuggled a -std=c++ flag in via CXXFLAGS; the standard must
# be controlled through CMAKE_CXX_STANDARD below.
string(FIND "${CMAKE_CXX_FLAGS}" "-std=c++" env_cxx_standard)
if(env_cxx_standard GREATER -1)
  # NOTE: message() concatenates its arguments with no separator, so the
  # first fragment needs a trailing space (previously the output ran the two
  # sentences together as "variable.PyTorch").
  message(
    WARNING
      "C++ standard version definition detected in environment variable. "
      "PyTorch requires -std=c++17. Please remove -std=c++ settings in your environment."
  )
endif()
set(CMAKE_CXX_STANDARD
    17
    CACHE STRING
          "The C++ standard whose features are requested to build this target.")
set(CMAKE_C_STANDARD
    11
    CACHE STRING
          "The C standard whose features are requested to build this target.")
|
2021-01-15 15:11:04 +00:00
|
|
|
|
remove abi uncertainty and potential abi conflict (#94306)
Currently there is a potential conflict for `GLIBCXX_USE_CXX11_ABI` configuration if users don't explicitly set this variable.
In `caffe2/CMakeLists.txt`, if the variable is not set, an `abi checker` will be used to retrieve the ABI configuration from compiler.
https://github.com/pytorch/pytorch/blob/master/caffe2/CMakeLists.txt#L1165-L1183
However, in 'torch/csrc/Module.cpp`, if the variable is not set, it will be set to `0`. The conflict happens when the default ABI of the compiler is `1`.
https://github.com/pytorch/pytorch/blob/master/torch/csrc/Module.cpp#L1612
This PR eliminate this uncertainty and potential conflict.
The ABI will be checked and set in `CMakeLists.txt`, and pass the value to `caffe2/CMakeLists.txt`. Meanwhile, in case the `caffe2/CMakeLists.txt` is directly invoked from a `cmake` command, The original GLIBC check logic is kept in this file.
If users doesn't explicitly assign a value to `GLIBCXX_USE_CXX11_ABI`, the `abi checker` will be executed and set the value accordingly. If the `abi checker` failed to compile or execute, the value will be set to `0`. If users explicitly assigned a value, then the provided value will be used.
Moreover, if `GLIBCXX_USE_CXX11_ABI` is set to `0`, the '-DGLIBCXX_USE_CXX11_ABI=0' flag won't be appended to `CMAKE_CXX_FLAGS`. Thus, whether to use ABI=0 or ABI=1 fully depends on compiler's default configuration. It could cause an issue that even users explicitly set `GLIBCXX_USE_CXX11_ABI` to `0`, the compiler still builds the binaries with ABI=1.
https://github.com/pytorch/pytorch/blob/master/CMakeLists.txt#L44-L51
Pull Request resolved: https://github.com/pytorch/pytorch/pull/94306
Approved by: https://github.com/malfet
2023-02-09 09:54:04 +00:00
|
|
|
# ---[ Utils
# Shared helper functions/macros (e.g. append_cxx_flag_if_supported) used
# throughout the rest of the build.
include(cmake/public/utils.cmake)
|
|
|
|
|
|
2024-02-23 20:31:17 +00:00
|
|
|
# --- [ Check that minimal gcc version is 9.3+
if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.3)
  message(
    FATAL_ERROR
      "GCC-9.3 or newer is required to compile PyTorch, but found ${CMAKE_CXX_COMPILER_VERSION}"
  )
endif()
|
|
|
|
|
|
2024-05-28 22:33:53 +00:00
|
|
|
# This define is needed to preserve behavior given anticipated changes to
# cccl/thrust
# https://nvidia.github.io/libcudacxx/standard_api/numerics_library/complex.html
string(APPEND CMAKE_CUDA_FLAGS
       " -DLIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS")
|
2024-01-30 06:11:26 +00:00
|
|
|
|
2023-11-06 17:19:53 +00:00
|
|
|
if(LINUX)
  # CheckAbi.cmake determines GLIBCXX_USE_CXX11_ABI (probing the compiler's
  # default when the user did not set it explicitly), then we pin the ABI on
  # every translation unit so host C++ and CUDA code agree.
  include(cmake/CheckAbi.cmake)
  string(APPEND CMAKE_CXX_FLAGS
         " -D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}")
  string(APPEND CMAKE_CUDA_FLAGS
         " -D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}")
  # Quote the expansion: if the variable were ever empty, the unquoted form
  # would collapse to `if( EQUAL 1)` and fail to parse.
  if("${GLIBCXX_USE_CXX11_ABI}" EQUAL 1)
    set(CXX_STANDARD_REQUIRED ON)
  else()
    # Please note this is required in order to ensure compatibility between
    # gcc 9 and gcc 7. This could be removed when all Linux PyTorch binary
    # builds are compiled by the same toolchain again.
    append_cxx_flag_if_supported("-fabi-version=11" CMAKE_CXX_FLAGS)
  endif()
endif()
|
2018-09-24 18:02:46 +00:00
|
|
|
|
2018-10-04 00:14:19 +00:00
|
|
|
# Emit compile_commands.json for tooling, and ask the linker to report
# libraries that are linked but unused.
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_LINK_WHAT_YOU_USE TRUE)

# One variable that determines whether the current cmake process is being run
# with the main Caffe2 library. This is useful for building modules - if
# modules are built with the main Caffe2 library then one does not need to do
# find caffe2 in the cmake script. One can usually guard it in some way like
# if(NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) find_package(Caffe2 REQUIRED)
# endif()
set(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO ON)

# Googletest's cmake files are going to set it on once they are processed.
# Set it at the very beginning so that the entire build is deterministic.
set(THREADS_PREFER_PTHREAD_FLAG ON)
|
|
|
|
|
|
2018-05-30 18:44:23 +00:00
|
|
|
# Record whether the BLAS implementation was chosen explicitly by the user
# (via -DBLAS=...) so later detection logic can distinguish "user choice"
# from "auto-detected fallback". Cached so reconfigures see the same answer.
if(NOT DEFINED BLAS_SET_BY_USER)
  if(DEFINED BLAS)
    set(BLAS_SET_BY_USER TRUE)
  else()
    message(STATUS "Not forcing any particular BLAS to be found")
    set(BLAS_SET_BY_USER FALSE)
  endif()
  set(BLAS_SET_BY_USER
      ${BLAS_SET_BY_USER}
      CACHE STRING
            "Marks whether BLAS was manually set by user or auto-detected")
endif()
|
2018-05-30 18:44:23 +00:00
|
|
|
|
2018-07-10 01:04:25 +00:00
|
|
|
# Apple specific
if(APPLE)
  # These lines are an attempt to make find_package(cuda) pick up
  # libcuda.dylib, and not cuda.framework. It doesn't work all the time, but
  # it seems to help for some users. TODO: replace this with a more robust fix
  set(CMAKE_FIND_FRAMEWORK LAST)
  set(CMAKE_FIND_APPBUNDLE LAST)

  # Get clang version on macOS by parsing `<compiler> --version` output.
  execute_process(COMMAND ${CMAKE_CXX_COMPILER} --version
                  OUTPUT_VARIABLE clang_full_version_string)
  string(REGEX REPLACE "Apple (.*) version ([0-9]+\\.[0-9]+).*" "\\2"
                       CLANG_VERSION_STRING ${clang_full_version_string})
  message(STATUS "CLANG_VERSION_STRING: " ${CLANG_VERSION_STRING})

  # RPATH stuff
  set(CMAKE_MACOSX_RPATH ON)

  if(NOT IOS)
    # Determine if we can link against MPSGraph
    set(MPS_FOUND OFF)
    execute_process(
      COMMAND bash -c "xcrun --sdk macosx --show-sdk-version"
      RESULT_VARIABLE _exit_code
      OUTPUT_VARIABLE _macosx_sdk_version
      OUTPUT_STRIP_TRAILING_WHITESPACE)
    if(_exit_code EQUAL 0)
      # MPS needs macOS SDK 12.3 or newer.
      set(_MPS_supported_os_version OFF)
      if(_macosx_sdk_version VERSION_GREATER_EQUAL 12.3)
        set(_MPS_supported_os_version ON)
      endif()
      message(
        STATUS
          "sdk version: ${_macosx_sdk_version}, mps supported: ${_MPS_supported_os_version}"
      )
      execute_process(
        COMMAND bash -c "xcrun --sdk macosx --show-sdk-path"
        OUTPUT_VARIABLE _macosx_sdk_path
        OUTPUT_STRIP_TRAILING_WHITESPACE)
      set(_SDK_SEARCH_PATH "${_macosx_sdk_path}/System/Library/Frameworks/")
      set(_FRAMEWORK_SEARCH_PATH "/System/Library/Frameworks/")

      # The framework must be present both in the system location and inside
      # the selected SDK for linking to work.
      find_library(
        _MPS_fwrk_path_
        NAMES MetalPerformanceShadersGraph MetalPerformanceShaders
        PATHS ${_FRAMEWORK_SEARCH_PATH}
        NO_DEFAULT_PATH)
      find_library(
        _MPS_sdk_path_
        NAMES MetalPerformanceShadersGraph MetalPerformanceShaders
        PATHS ${_SDK_SEARCH_PATH}
        NO_DEFAULT_PATH)

      if(_MPS_supported_os_version
         AND _MPS_fwrk_path_
         AND _MPS_sdk_path_)
        set(MPS_FOUND ON)
        message(STATUS "MPSGraph framework found")
      else()
        message(STATUS "MPSGraph framework not found")
      endif()
    else()
      message(STATUS "MPS: unable to get MacOS sdk version")
      message(STATUS "MPSGraph framework not found")
    endif()
  endif()
endif()
|
|
|
|
|
|
2021-01-13 16:40:11 +00:00
|
|
|
# Classify the host CPU family; later configuration keys off these flags.
set(CPU_AARCH64 OFF)
set(CPU_INTEL OFF)

if(CMAKE_SYSTEM_PROCESSOR MATCHES "(AMD64|x86_64)")
  set(CPU_INTEL ON)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64)")
  set(CPU_AARCH64 ON)
endif()
|
2019-06-27 17:17:55 +00:00
|
|
|
|
2024-05-28 22:33:53 +00:00
|
|
|
# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
# tested and likely won't work without additional changes.
if(NOT LINUX AND NOT WIN32)
  set(USE_DISTRIBUTED
      OFF
      CACHE STRING "Use distributed")
  # On macOS, if USE_DISTRIBUTED is enabled (specified by the user), then make
  # Gloo build with the libuv transport.
  if(APPLE AND USE_DISTRIBUTED)
    set(USE_LIBUV
        ON
        CACHE STRING "")
  endif()
endif()
|
|
|
|
|
|
2024-05-28 22:33:53 +00:00
|
|
|
# ---[ Options. Note to developers: if you add an option below, make sure you
# also add it to cmake/Summary.cmake so that the summary prints out the option
# values.
include(CMakeDependentOption)
option(ATEN_NO_TEST "Do not build ATen test binaries" OFF)
option(BUILD_BINARY "Build C++ binaries" OFF)
option(BUILD_CUSTOM_PROTOBUF
       "Build and use Caffe2's own protobuf under third_party" ON)
option(BUILD_PYTHON "Build Python binaries" ON)
|
[PyTorch] update CMake to build libtorch lite (#51419)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/51419
## Summary
1. Add an option `BUILD_LITE_INTERPRETER` in `caffe2/CMakeLists.txt` and set `OFF` as default.
2. Update 'build_android.sh' with an argument to swtich `BUILD_LITE_INTERPRETER`, 'OFF' as default.
3. Add a mini demo app `lite_interpreter_demo` linked with `libtorch` library, which can be used for quick test.
## Test Plan
Built lite interpreter version of libtorch and test with Image Segmentation demo app ([android version](https://github.com/pytorch/android-demo-app/tree/master/ImageSegmentation)/[ios version](https://github.com/pytorch/ios-demo-app/tree/master/ImageSegmentation))
### Android
1. **Prepare model**: Prepare the lite interpreter version of model by run the script below to generate the scripted model `deeplabv3_scripted.pt` and `deeplabv3_scripted.ptl`
```
import torch
model = torch.hub.load('pytorch/vision:v0.7.0', 'deeplabv3_resnet50', pretrained=True)
model.eval()
scripted_module = torch.jit.script(model)
# Export full jit version model (not compatible lite interpreter), leave it here for comparison
scripted_module.save("deeplabv3_scripted.pt")
# Export lite interpreter version model (compatible with lite interpreter)
scripted_module._save_for_lite_interpreter("deeplabv3_scripted.ptl")
```
2. **Build libtorch lite for android**: Build libtorch for android for all 4 android abis (armeabi-v7a, arm64-v8a, x86, x86_64) `BUILD_LITE_INTERPRETER=1 ./scripts/build_pytorch_android.sh`. This pr is tested on Pixel 4 emulator with x86, so use cmd `BUILD_LITE_INTERPRETER=1 ./scripts/build_pytorch_android.sh x86` to specify abi to save built time. After the build finish, it will show the library path:
```
...
BUILD SUCCESSFUL in 55s
134 actionable tasks: 22 executed, 112 up-to-date
+ find /Users/chenlai/pytorch/android -type f -name '*aar'
+ xargs ls -lah
-rw-r--r-- 1 chenlai staff 13M Feb 11 11:48 /Users/chenlai/pytorch/android/pytorch_android/build/outputs/aar/pytorch_android-release.aar
-rw-r--r-- 1 chenlai staff 36K Feb 9 16:45 /Users/chenlai/pytorch/android/pytorch_android_torchvision/build/outputs/aar/pytorch_android_torchvision-release.aar
```
3. **Use the PyTorch Android libraries built from source in the ImageSegmentation app**: Create a folder 'libs' in the path, the path from repository root will be `ImageSegmentation/app/libs`. Copy `pytorch_android-release` to the path `ImageSegmentation/app/libs/pytorch_android-release.aar`. Copy 'pytorch_android_torchvision` (downloaded from [here](https://oss.sonatype.org/#nexus-search;quick~torchvision_android)) to the path `ImageSegmentation/app/libs/pytorch_android_torchvision.aar` Update the `dependencies` part of `ImageSegmentation/app/build.gradle` to
```
dependencies {
implementation 'androidx.appcompat:appcompat:1.2.0'
implementation 'androidx.constraintlayout:constraintlayout:2.0.2'
testImplementation 'junit:junit:4.12'
androidTestImplementation 'androidx.test.ext:junit:1.1.2'
androidTestImplementation 'androidx.test.espresso:espresso-core:3.3.0'
implementation(name:'pytorch_android-release', ext:'aar')
implementation(name:'pytorch_android_torchvision', ext:'aar')
implementation 'com.android.support:appcompat-v7:28.0.0'
implementation 'com.facebook.fbjni:fbjni-java-only:0.0.3'
}
```
Update `allprojects` part in `ImageSegmentation/build.gradle` to
```
allprojects {
repositories {
google()
jcenter()
flatDir {
dirs 'libs'
}
}
}
```
4. **Update model loader api**: Update `ImageSegmentation/app/src/main/java/org/pytorch/imagesegmentation/MainActivity.java` by
4.1 Add new import: `import org.pytorch.LiteModuleLoader;`
4.2 Replace the way to load pytorch lite model
```
// mModule = Module.load(MainActivity.assetFilePath(getApplicationContext(), "deeplabv3_scripted.pt"));
mModule = LiteModuleLoader.load(MainActivity.assetFilePath(getApplicationContext(), "deeplabv3_scripted.ptl"));
```
5. **Test app**: Build and run the ImageSegmentation app in Android Studio,

### iOS
1. **Prepare model**: Same as Android.
2. **Build libtorch lite for ios** `BUILD_PYTORCH_MOBILE=1 IOS_PLATFORM=SIMULATOR BUILD_LITE_INTERPRETER=1 ./scripts/build_ios.sh`
3. **Remove Cocoapods from the project**: run `pod deintegrate`
4. **Link ImageSegmentation demo app with the custom built library**:
Open your project in XCode, go to your project Target’s **Build Phases - Link Binaries With Libraries**, click the **+** sign and add all the library files located in `build_ios/install/lib`. Navigate to the project **Build Settings**, set the value **Header Search Paths** to `build_ios/install/include` and **Library Search Paths** to `build_ios/install/lib`.
In the build settings, search for **other linker flags**. Add a custom linker flag below
```
-all_load
```
Finally, disable bitcode for your target by selecting the Build Settings, searching for Enable Bitcode, and set the value to No.
**
5. Update library and api**
5.1 Update `TorchModule.mm``
To use the custom built libraries the project, replace `#import <LibTorch/LibTorch.h>` (in `TorchModule.mm`) which is needed when using LibTorch via Cocoapods with the code below:
```
//#import <LibTorch/LibTorch.h>
#include "ATen/ATen.h"
#include "caffe2/core/timer.h"
#include "caffe2/utils/string_utils.h"
#include "torch/csrc/autograd/grad_mode.h"
#include "torch/script.h"
#include <torch/csrc/jit/mobile/function.h>
#include <torch/csrc/jit/mobile/import.h>
#include <torch/csrc/jit/mobile/interpreter.h>
#include <torch/csrc/jit/mobile/module.h>
#include <torch/csrc/jit/mobile/observer.h>
```
5.2 Update `ViewController.swift`
```
// if let filePath = Bundle.main.path(forResource:
// "deeplabv3_scripted", ofType: "pt"),
// let module = TorchModule(fileAtPath: filePath) {
// return module
// } else {
// fatalError("Can't find the model file!")
// }
if let filePath = Bundle.main.path(forResource:
"deeplabv3_scripted", ofType: "ptl"),
let module = TorchModule(fileAtPath: filePath) {
return module
} else {
fatalError("Can't find the model file!")
}
```
### Unit test
Add `test/cpp/lite_interpreter`, with one unit test `test_cores.cpp` and a light model `sequence.ptl` to test `_load_for_mobile()`, `bc.find_method()` and `bc.forward()` functions.
### Size:
**With the change:**
Android:
x86: `pytorch_android-release.aar` (**13.8 MB**)
IOS:
`pytorch/build_ios/install/lib` (lib: **66 MB**):
```
(base) chenlai@chenlai-mp lib % ls -lh
total 135016
-rw-r--r-- 1 chenlai staff 3.3M Feb 15 20:45 libXNNPACK.a
-rw-r--r-- 1 chenlai staff 965K Feb 15 20:45 libc10.a
-rw-r--r-- 1 chenlai staff 4.6K Feb 15 20:45 libclog.a
-rw-r--r-- 1 chenlai staff 42K Feb 15 20:45 libcpuinfo.a
-rw-r--r-- 1 chenlai staff 39K Feb 15 20:45 libcpuinfo_internals.a
-rw-r--r-- 1 chenlai staff 1.5M Feb 15 20:45 libeigen_blas.a
-rw-r--r-- 1 chenlai staff 148K Feb 15 20:45 libfmt.a
-rw-r--r-- 1 chenlai staff 44K Feb 15 20:45 libpthreadpool.a
-rw-r--r-- 1 chenlai staff 166K Feb 15 20:45 libpytorch_qnnpack.a
-rw-r--r-- 1 chenlai staff 384B Feb 15 21:19 libtorch.a
-rw-r--r-- 1 chenlai staff **60M** Feb 15 20:47 libtorch_cpu.a
```
`pytorch/build_ios/install`:
```
(base) chenlai@chenlai-mp install % du -sh *
14M include
66M lib
2.8M share
```
**Master (baseline):**
Android:
x86: `pytorch_android-release.aar` (**16.2 MB**)
IOS:
`pytorch/build_ios/install/lib` (lib: **84 MB**):
```
(base) chenlai@chenlai-mp lib % ls -lh
total 172032
-rw-r--r-- 1 chenlai staff 3.3M Feb 17 22:18 libXNNPACK.a
-rw-r--r-- 1 chenlai staff 969K Feb 17 22:18 libc10.a
-rw-r--r-- 1 chenlai staff 4.6K Feb 17 22:18 libclog.a
-rw-r--r-- 1 chenlai staff 42K Feb 17 22:18 libcpuinfo.a
-rw-r--r-- 1 chenlai staff 1.5M Feb 17 22:18 libeigen_blas.a
-rw-r--r-- 1 chenlai staff 44K Feb 17 22:18 libpthreadpool.a
-rw-r--r-- 1 chenlai staff 166K Feb 17 22:18 libpytorch_qnnpack.a
-rw-r--r-- 1 chenlai staff 384B Feb 17 22:19 libtorch.a
-rw-r--r-- 1 chenlai staff 78M Feb 17 22:19 libtorch_cpu.a
```
`pytorch/build_ios/install`:
```
(base) chenlai@chenlai-mp install % du -sh *
14M include
84M lib
2.8M share
```
Test Plan: Imported from OSS
Reviewed By: iseeyuan
Differential Revision: D26518778
Pulled By: cccclai
fbshipit-source-id: 4503ffa1f150ecc309ed39fb0549e8bd046a3f9c
2021-02-21 09:41:55 +00:00
|
|
|
option(BUILD_LITE_INTERPRETER "Master flag to build Lite Interpreter" OFF)
option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON)
cmake_dependent_option(
  CAFFE2_LINK_LOCAL_PROTOBUF "If set, build protobuf inside libcaffe2.so." ON
  "BUILD_SHARED_LIBS AND BUILD_CUSTOM_PROTOBUF" OFF)
cmake_dependent_option(
  CAFFE2_USE_MSVC_STATIC_RUNTIME "Using MSVC static runtime libraries" ON
  "NOT BUILD_SHARED_LIBS" OFF)
option(BUILD_TEST "Build C++ test binaries (need gtest and gbenchmark)" OFF)
option(BUILD_AOT_INDUCTOR_TEST "Build C++ test binaries for aot-inductor" OFF)
option(BUILD_STATIC_RUNTIME_BENCHMARK
       "Build C++ binaries for static runtime benchmarks (need gbenchmark)" OFF)
option(
  BUILD_MOBILE_BENCHMARK
  "Build C++ test binaries for mobile (ARM) targets(need gtest and gbenchmark)"
  OFF)
option(
  BUILD_MOBILE_TEST
  "Build C++ test binaries for mobile (ARM) targets(need gtest and gbenchmark)"
  OFF)
option(BUILD_JNI "Build JNI bindings" OFF)
option(BUILD_MOBILE_AUTOGRAD
       "Build autograd function in mobile build (in development)" OFF)
cmake_dependent_option(INSTALL_TEST "Install test binaries if BUILD_TEST is on"
                       ON "BUILD_TEST" OFF)
option(USE_CPP_CODE_COVERAGE "Compile C/C++ with code coverage flags" OFF)
option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON)
option(USE_ASAN "Use Address+Undefined Sanitizers" OFF)
option(USE_TSAN "Use Thread Sanitizer" OFF)
option(USE_CUDA "Use CUDA" ON)
option(USE_XPU "Use XPU" ON)
cmake_dependent_option(
  BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON
  "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF)
cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF)
option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF)
cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF)
cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF
                       "USE_CUDNN" OFF)
cmake_dependent_option(USE_CUSPARSELT "Use cuSPARSELt" ON "USE_CUDA" OFF)
cmake_dependent_option(USE_CUDSS "Use cuDSS" ON "USE_CUDA" OFF)
# Binary builds will fail for cufile due to
# https://github.com/pytorch/builder/issues/1924
# Using TH_BINARY_BUILD to check whether is binary build.
# USE_ROCM is guarded against in Dependencies.cmake because USE_ROCM is not
# properly defined here
if(DEFINED ENV{TH_BINARY_BUILD})
  cmake_dependent_option(
    USE_CUFILE "Use cuFile" OFF
    "USE_CUDA AND NOT $ENV{TH_BINARY_BUILD} AND NOT WIN32" OFF)
else()
  cmake_dependent_option(USE_CUFILE "Use cuFile" OFF "USE_CUDA AND NOT WIN32"
                         OFF)
endif()
|
2019-07-25 14:08:23 +00:00
|
|
|
option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON)
option(USE_KINETO "Use Kineto profiling library" ON)
option(USE_CUPTI_SO "Use CUPTI as a shared library" ON)
option(USE_FAKELOWP "Use FakeLowp operators" OFF)
option(USE_GFLAGS "Use GFLAGS" OFF)
option(USE_GLOG "Use GLOG" OFF)
option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF)
option(USE_MAGMA "Use MAGMA" ON)
option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
# NCCL and friends only make sense on non-Apple UNIX with a GPU backend.
cmake_dependent_option(USE_NCCL "Use NCCL" ON
                       "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL"
                       OFF)
option(USE_NNAPI "Use NNAPI" OFF)
option(USE_NNPACK "Use NNPACK" ON)
cmake_dependent_option(USE_NUMA "Use NUMA. Only available on Linux." ON "LINUX"
                       OFF)
cmake_dependent_option(USE_NVRTC "Use NVRTC. Only available if USE_CUDA is on."
                       OFF "USE_CUDA" OFF)
option(USE_NUMPY "Use NumPy" ON)
option(USE_OBSERVERS "Use observers module." OFF)
option(USE_OPENCL "Use OpenCL" OFF)
option(USE_OPENMP "Use OpenMP for parallel code" ON)
option(USE_PRECOMPILED_HEADERS "Use pre-compiled headers to accelerate build."
       OFF)

option(USE_PROF "Use profiling" OFF)
option(USE_PYTORCH_QNNPACK "Use ATen/QNNPACK (quantized 8-bit operators)" ON)
option(USE_SNPE "Use Qualcomm's SNPE library" OFF)
option(USE_SYSTEM_EIGEN_INSTALL
       "Use system Eigen instead of the one under third_party" OFF)
cmake_dependent_option(USE_VALGRIND "Use Valgrind. Only available on Linux." ON
                       "LINUX" OFF)
|
2021-01-29 19:56:04 +00:00
|
|
|
|
|
|
|
|
# Default USE_VULKAN on for Android builds only, unless the user already chose.
if(NOT DEFINED USE_VULKAN)
  cmake_dependent_option(USE_VULKAN "Use Vulkan GPU backend" ON "ANDROID" OFF)
endif()

# NOTE(review): these two descriptions previously read just "Enable", which is
# uninformative in cmake-gui/ccmake; the new text simply restates the option
# names. Defaults are unchanged.
option(USE_SOURCE_DEBUG_ON_MOBILE "Enable source debug on mobile builds" ON)
option(USE_LITE_INTERPRETER_PROFILER "Enable the lite interpreter profiler" ON)
cmake_dependent_option(USE_LITE_AOTI "Include AOTI sources" OFF
                       "BUILD_LITE_INTERPRETER" OFF)
option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF)
option(USE_VULKAN_RELAXED_PRECISION
       "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF)
# option USE_XNNPACK: try to enable xnnpack by default.
option(USE_XNNPACK "Use XNNPACK" ON)
|
[ROCm] Disabling Kernel Asserts for ROCm by default - fix and clean up and refactoring (#114660)
Related to #103973 #110532 #108404 #94891
**Context:**
As commented in https://github.com/pytorch/pytorch/blob/6ae0554d11b973930d7b8ec1e937b27ac961d7bf/cmake/Dependencies.cmake#L1198
Kernel asserts are enabled by default for CUDA and disabled for ROCm.
However it is somewhat broken, and Kernel assert was still enabled for ROCm.
Disabling kernel assert is also needed for users who do not have PCIe atomics support. These community users have verified that disabling the kernel assert in PyTorch/ROCm platform fixed their pytorch workflow, like torch.sum script, stable-diffusion. (see the related issues)
**Changes:**
This pull request serves the following purposes:
* Refactor and clean up the logic, make it simpler for ROCm to enable and disable Kernel Asserts
* Fix the bug that Kernel Asserts for ROCm was not disabled by default.
Specifically,
- Renamed `TORCH_DISABLE_GPU_ASSERTS` to `C10_USE_ROCM_KERNEL_ASSERT` for the following reasons:
(1) This variable only applies to ROCm.
(2) The new name is more align with #define CUDA_KERNEL_ASSERT function.
(3) With USE_ in front of the name, we can easily control it with environment variable to turn on and off this feature during build (e.g. `USE_ROCM_KERNEL_ASSERT=1 python setup.py develop` will enable kernel assert for ROCm build).
- Get rid of the `ROCM_FORCE_ENABLE_GPU_ASSERTS' to simplify the logic and make it easier to understand and maintain
- Added `#cmakedefine` to carry over the CMake variable to C++
**Tests:**
(1) build with default mode and verify that USE_ROCM_KERNEL_ASSERT is OFF(0), and kernel assert is disabled:
```
python setup.py develop
```
Verify CMakeCache.txt has correct value.
```
/xxxx/pytorch/build$ grep USE_ROCM_KERNEL_ASSERT CMakeCache.txt
USE_ROCM_KERNEL_ASSERT:BOOL=0
```
Tested the following code in ROCm build and CUDA build, and expected the return code differently.
```
subprocess.call([sys.executable, '-c', "import torch;torch._assert_async(torch.tensor(0,device='cuda'));torch.cuda.synchronize()"])
```
This piece of code is adapted from below unit test to get around the limitation that this unit test now was skipped for ROCm. (We will check to enable this unit test in the future)
```
python test/test_cuda_expandable_segments.py -k test_fixed_cuda_assert_async
```
Ran the following script, expecting r ==0 since the CUDA_KERNEL_ASSERT is defined as nothing:
```
>> import sys
>>> import subprocess
>>> r=subprocess.call([sys.executable, '-c', "import torch;torch._assert_async(torch.tensor(0,device='cuda'));torch.cuda.synchronize()"])
>>> r
0
```
(2) Enable the kernel assert by building with USE_ROCM_KERNEL_ASSERT=1, or USE_ROCM_KERNEL_ASSERT=ON
```
USE_ROCM_KERNEL_ASSERT=1 python setup.py develop
```
Verify `USE_ROCM_KERNEL_ASSERT` is `1`
```
/xxxx/pytorch/build$ grep USE_ROCM_KERNEL_ASSERT CMakeCache.txt
USE_ROCM_KERNEL_ASSERT:BOOL=1
```
Run the assert test, and expected return code not equal to 0.
```
>> import sys
>>> import subprocess
>>> r=subprocess.call([sys.executable, '-c', "import torch;torch._assert_async(torch.tensor(0,device='cuda'));torch.cuda.synchronize()"])
>>>/xxxx/pytorch/aten/src/ATen/native/hip/TensorCompare.hip:108: _assert_async_cuda_kernel: Device-side assertion `input[0] != 0' failed.
:0:rocdevice.cpp :2690: 2435301199202 us: [pid:206019 tid:0x7f6cf0a77700] Callback: Queue 0x7f64e8400000 aborting with error : HSA_STATUS_ERROR_EXCEPTION: An HSAIL operation resulted in a hardware exception. code: 0x1016
>>> r
-6
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/114660
Approved by: https://github.com/jeffdaily, https://github.com/malfet, https://github.com/jithunnair-amd
2023-12-13 15:44:53 +00:00
|
|
|
# Kernel asserts are disabled by default on ROCm; opt in explicitly.
option(USE_ROCM_KERNEL_ASSERT "Use Kernel Assert for ROCm" OFF)
# Ensure that an ITT build is the default for x86 CPUs
cmake_dependent_option(USE_ITT "Use Intel(R) VTune Profiler ITT functionality"
                       ON "CPU_INTEL" OFF)
# Ensure that an MKLDNN build is the default for x86 CPUs but optional for
# AArch64 (dependent on -DUSE_MKLDNN).
cmake_dependent_option(USE_MKLDNN
                       "Use MKLDNN. Only available on x86, x86_64, and AArch64."
                       "${CPU_INTEL}" "CPU_INTEL OR CPU_AARCH64" OFF)
cmake_dependent_option(USE_MKLDNN_ACL
                       "Use Compute Library for the Arm architecture." OFF
                       "USE_MKLDNN AND CPU_AARCH64" OFF)
set(MKLDNN_ENABLE_CONCURRENT_EXEC ${USE_MKLDNN})
cmake_dependent_option(USE_MKLDNN_CBLAS "Use CBLAS in MKLDNN" OFF
                       "USE_MKLDNN" OFF)
option(USE_STATIC_MKL "Prefer to link with MKL statically (Unix only)" OFF)
|
2018-09-06 15:40:57 +00:00
|
|
|
# Distributed-backend toggles. Every backend (MPI/UCC/Gloo/TensorPipe and the
# corresponding c10d bindings) is gated on USE_DISTRIBUTED being enabled.
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(USE_MPI
                       "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on."
                       ON "USE_DISTRIBUTED" OFF)
cmake_dependent_option(USE_UCC
                       "Use UCC. Only available if USE_DISTRIBUTED is on."
                       OFF "USE_DISTRIBUTED" OFF)
cmake_dependent_option(USE_SYSTEM_UCC "Use system-wide UCC" OFF "USE_UCC" OFF)
cmake_dependent_option(USE_C10D_UCC "USE C10D UCC" ON
                       "USE_DISTRIBUTED;USE_UCC" OFF)
cmake_dependent_option(USE_GLOO
                       "Use Gloo. Only available if USE_DISTRIBUTED is on."
                       ON "USE_DISTRIBUTED" OFF)
cmake_dependent_option(USE_GLOO_WITH_OPENSSL
                       "Use Gloo with OpenSSL. Only available if USE_GLOO is on."
                       OFF "USE_GLOO AND LINUX AND NOT INTERN_BUILD_MOBILE" OFF)
cmake_dependent_option(USE_C10D_GLOO "USE C10D GLOO" ON
                       "USE_DISTRIBUTED;USE_GLOO" OFF)
cmake_dependent_option(USE_C10D_NCCL "USE C10D NCCL" ON
                       "USE_DISTRIBUTED;USE_NCCL" OFF)
cmake_dependent_option(USE_C10D_MPI "USE C10D MPI" ON
                       "USE_DISTRIBUTED;USE_MPI" OFF)
cmake_dependent_option(USE_TENSORPIPE
                       "Use TensorPipe. Only available if USE_DISTRIBUTED is on."
                       ON "USE_DISTRIBUTED AND NOT WIN32" OFF)
|
2020-02-21 23:40:04 +00:00
|
|
|
# Miscellaneous build toggles: ONNX, packaging, tooling, and backend builds.
option(ONNX_ML "Enable traditional ONNX ML API." ON)
option(HAVE_SOVERSION "Whether to add SOVERSION to the shared objects" OFF)
option(BUILD_LIBTORCH_CPU_WITH_DEBUG
       "Enable RelWithDebInfo for libtorch_cpu target only" OFF)
cmake_dependent_option(USE_CCACHE
                       "Attempt using CCache to wrap the compilation" ON
                       "UNIX" OFF)
option(WERROR "Build with -Werror supported by the compiler" OFF)
option(DEBUG_CUDA
       "When compiling DEBUG, also attempt to compile CUDA with debug flags (may cause nvcc to OOM)"
       OFF)
option(USE_COREML_DELEGATE "Use the CoreML backend through delegate APIs" OFF)
option(USE_PER_OPERATOR_HEADERS
       "Whether ATen should generate separate headers for each operator" ON)
cmake_dependent_option(BUILD_LAZY_TS_BACKEND
                       "Build the lazy Torchscript backend, not compatible with mobile builds"
                       ON "NOT INTERN_BUILD_MOBILE" OFF)
cmake_dependent_option(BUILD_FUNCTORCH "Build Functorch" ON "BUILD_PYTHON" OFF)
|
|
|
|
|
# Fix typo in the user-visible help string ("fodler" -> "folder").
cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin folder"
                       OFF "USE_CUDA" OFF)
cmake_dependent_option(USE_KLEIDIAI
                       "Use KleidiAI for the ARM CPU & AARCH64 architecture."
                       ON "CPU_AARCH64" OFF)
|
2021-09-17 17:14:40 +00:00
|
|
|
|
2023-06-27 08:53:23 +00:00
|
|
|
option(USE_MIMALLOC "Use mimalloc" OFF)
# Enable third party mimalloc library to improve memory allocation performance
# on Windows.
option(USE_MIMALLOC_ON_MKL "Use mimalloc on MKL" OFF)
|
2023-06-27 08:53:23 +00:00
|
|
|
if(WIN32)
  set(USE_MIMALLOC ON)

  # USE_MIMALLOC_ON_MKL is intentionally left OFF because it caused:
  # https://github.com/pytorch/pytorch/issues/138994
  # It can be turned back on once the USE_STATIC_MKL regression is fixed:
  # https://github.com/pytorch/pytorch/pull/138996
  # set(USE_MIMALLOC_ON_MKL ON)
endif()
|
2021-03-18 21:17:23 +00:00
|
|
|
|
|
|
|
|
# When requested, wrap the C/C++/CUDA compilers with ccache if it is on PATH.
if(USE_CCACHE)
  find_program(CCACHE_PROGRAM ccache)
  if(CCACHE_PROGRAM)
    set(CMAKE_C_COMPILER_LAUNCHER
        "${CCACHE_PROGRAM}"
        CACHE STRING "C compiler launcher")
    set(CMAKE_CXX_COMPILER_LAUNCHER
        "${CCACHE_PROGRAM}"
        CACHE STRING "CXX compiler launcher")
    set(CMAKE_CUDA_COMPILER_LAUNCHER
        "${CCACHE_PROGRAM}"
        CACHE STRING "CUDA compiler launcher")
  else()
    message(
      STATUS
        "Could not find ccache. Consider installing ccache to speed up compilation."
    )
  endif()
endif()
|
|
|
|
|
|
2020-09-25 19:35:42 +00:00
|
|
|
# Since TensorPipe does not support Windows, set it to OFF when WIN32 detected
# On Windows platform, if user does not install libuv in build conda env and
# does not set libuv_ROOT environment variable. Set USE_DISTRIBUTED to OFF.
if(WIN32)
  set(USE_TENSORPIPE OFF)
  message(WARNING "TensorPipe cannot be used on Windows. Set it to OFF")
  set(USE_KLEIDIAI OFF)
  message(WARNING "KleidiAI cannot be used on Windows. Set it to OFF")

  if(USE_DISTRIBUTED AND NOT DEFINED ENV{libuv_ROOT})
    # Probe the conda environment's library directory only; a hit is used to
    # derive libuv_ROOT, a miss disables distributed entirely.
    find_library(
      libuv_tmp_LIBRARY
      NAMES uv libuv
      HINTS $ENV{CONDA_PREFIX}\\Library $ENV{PREFIX}\\Library
      PATH_SUFFIXES lib
      NO_DEFAULT_PATH)
    if(NOT libuv_tmp_LIBRARY)
      set(USE_DISTRIBUTED OFF)
      set(USE_GLOO OFF)
      message(
        WARNING
          "Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
          "Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
      )
    else()
      # Point libuv_ROOT at the prefix two levels above the found library.
      set(ENV{libuv_ROOT} ${libuv_tmp_LIBRARY}/../../)
    endif()
  endif()
endif()
|
|
|
|
|
|
2021-05-07 20:34:43 +00:00
|
|
|
# Gloo's OpenSSL transport requires runtime loading of OpenSSL symbols.
if(USE_GLOO_WITH_OPENSSL)
  set(USE_TCP_OPENSSL_LOAD
      ON
      CACHE STRING "")
endif()
|
|
|
|
|
|
2020-04-24 03:40:16 +00:00
|
|
|
# Linux distributions do not want too many embedded sources, in that sense we
# need to be able to build pytorch with an (almost) empty third_party
# directory. USE_SYSTEM_LIBS is a shortcut variable to toggle all the
# USE_SYSTEM_* variables on. Individual USE_SYSTEM_* variables can be toggled
# with USE_SYSTEM_LIBS being "OFF".
option(USE_SYSTEM_LIBS "Use all available system-provided libraries." OFF)
option(USE_SYSTEM_CPUINFO "Use system-provided cpuinfo." OFF)
option(USE_SYSTEM_SLEEF "Use system-provided sleef." OFF)
option(USE_SYSTEM_GLOO "Use system-provided gloo." OFF)
option(USE_SYSTEM_FP16 "Use system-provided fp16." OFF)
option(USE_SYSTEM_PYBIND11 "Use system-provided PyBind11." OFF)
option(USE_SYSTEM_PTHREADPOOL "Use system-provided pthreadpool." OFF)
option(USE_SYSTEM_PSIMD "Use system-provided psimd." OFF)
option(USE_SYSTEM_FXDIV "Use system-provided fxdiv." OFF)
option(USE_SYSTEM_BENCHMARK "Use system-provided google benchmark." OFF)
option(USE_SYSTEM_ONNX "Use system-provided onnx." OFF)
option(USE_SYSTEM_XNNPACK "Use system-provided xnnpack." OFF)
# Use lowercase option() for consistency with every other declaration in this
# file (was uppercase OPTION()).
option(USE_SYSTEM_NVTX "Use system-provided nvtx." OFF)
option(USE_GOLD_LINKER "Use ld.gold to link" OFF)
|
2020-04-24 03:40:16 +00:00
|
|
|
# Master switch: turn every USE_SYSTEM_* knob on at once and disable the
# bundled protobuf. USE_SYSTEM_NCCL follows only when NCCL itself is enabled.
if(USE_SYSTEM_LIBS)
  set(USE_SYSTEM_CPUINFO ON)
  set(USE_SYSTEM_SLEEF ON)
  set(USE_SYSTEM_GLOO ON)
  set(BUILD_CUSTOM_PROTOBUF OFF)
  set(USE_SYSTEM_EIGEN_INSTALL ON)
  set(USE_SYSTEM_FP16 ON)
  set(USE_SYSTEM_PTHREADPOOL ON)
  set(USE_SYSTEM_PSIMD ON)
  set(USE_SYSTEM_FXDIV ON)
  set(USE_SYSTEM_BENCHMARK ON)
  set(USE_SYSTEM_ONNX ON)
  set(USE_SYSTEM_XNNPACK ON)
  set(USE_SYSTEM_PYBIND11 ON)
  if(USE_NCCL)
    set(USE_SYSTEM_NCCL ON)
  endif()
  set(USE_SYSTEM_NVTX ON)
endif()
|
|
|
|
|
|
2024-05-28 22:33:53 +00:00
|
|
|
# /Z7 override option. When generating debug symbols, CMake defaults to the
# /Zi flag, which is incompatible with sccache, so we rewrite it to /Z7.
# Users who do not use sccache can turn this override OFF.
cmake_dependent_option(
  MSVC_Z7_OVERRIDE
  "Work around sccache bug by replacing /Zi and /ZI with /Z7 when using MSVC (if you are not using sccache, you can turn this OFF)"
  ON
  "MSVC"
  OFF)
|
2019-04-03 15:19:45 +00:00
|
|
|
|
2020-04-29 16:20:15 +00:00
|
|
|
# Pick the ONNX namespace default once, then declare the cache entry a single
# time: the bundled copy is renamed to avoid clashing with other frameworks,
# while a system ONNX keeps its native namespace.
if(USE_SYSTEM_ONNX)
  set(onnx_default_namespace "onnx")
else()
  set(onnx_default_namespace "onnx_torch")
endif()
set(ONNX_NAMESPACE
    "${onnx_default_namespace}"
    CACHE
      STRING
      "A namespace for ONNX; needed to build with other frameworks that share ONNX."
)
|
2024-05-28 22:33:53 +00:00
|
|
|
set(SELECTED_OP_LIST
    ""
    CACHE
      STRING
      "Path to the yaml file that contains the list of operators to include for custom build. Include all operators by default."
)
# STATIC_DISPATCH_BACKEND holds a backend *name* (a string), so declare it as
# a STRING cache variable; option() is for booleans only. The cache entry
# name and empty default are unchanged, so -DSTATIC_DISPATCH_BACKEND=CPU and
# truthiness checks keep working.
set(STATIC_DISPATCH_BACKEND
    ""
    CACHE
      STRING
      "Name of the backend for which static dispatch code is generated, e.g.: CPU."
)
option(
  USE_LIGHTWEIGHT_DISPATCH
  "Enable codegen unboxing for ATen ops, need to work with static dispatch in order to work properly."
  OFF)
|
2024-05-28 22:33:53 +00:00
|
|
|
# Lightweight dispatch is only meaningful on top of static dispatch; abort the
# configure early rather than failing later in codegen.
if(USE_LIGHTWEIGHT_DISPATCH AND NOT STATIC_DISPATCH_BACKEND)
  message(
    FATAL_ERROR
      "Need to enable static dispatch after enabling USE_LIGHTWEIGHT_DISPATCH.")
endif()

option(TRACING_BASED
       "Master flag to build Lite Interpreter with tracing build option" OFF)
option(BUILD_EXECUTORCH "Master flag to build Executorch" ON)
|
2024-05-28 22:33:53 +00:00
|
|
|
# This is a fix for a rare build issue on Ubuntu: symbol lookup error:
# miniconda3/envs/pytorch-py3.7/lib/libmkl_intel_lp64.so: undefined symbol:
# mkl_blas_dsyrk
# https://software.intel.com/en-us/articles/symbol-lookup-error-when-linking-intel-mkl-with-gcc-on-ubuntu
if(LINUX)
  set(CMAKE_SHARED_LINKER_FLAGS
      "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--no-as-needed")

  # LDFLAGS is read from the environment at configure time only.
  set(ENV_LDFLAGS "$ENV{LDFLAGS}")
  string(STRIP "${ENV_LDFLAGS}" ENV_LDFLAGS)
  # Do not append linker flags passed via env var if they are already there.
  # BUGFIX: the left operand must be quoted — unquoted, a flags string
  # containing spaces is split into multiple if() arguments and breaks the
  # configure step.
  # NOTE(review): ENV_LDFLAGS is interpreted as a regex here, so flags
  # containing regex metacharacters may match unexpectedly — pre-existing
  # behavior, left unchanged.
  if(NOT "${CMAKE_SHARED_LINKER_FLAGS}" MATCHES "${ENV_LDFLAGS}")
    set(CMAKE_SHARED_LINKER_FLAGS
        "${CMAKE_SHARED_LINKER_FLAGS} ${ENV_LDFLAGS}")
  endif()
endif()
|
|
|
|
|
|
2020-03-25 20:43:00 +00:00
|
|
|
if(MSVC)
  # MSVC by default does not apply the correct __cplusplus version as specified
  # by the C++ standard because MSVC is not a completely compliant
  # implementation. This option forces MSVC to use the appropriate value given
  # the requested --std option. This fixes a compilation issue mismatch between
  # GCC/Clang and MSVC.
  #
  # See: *
  # https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus?view=msvc-170
  # * https://en.cppreference.com/w/cpp/preprocessor/replace#Predefined_macros
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:__cplusplus")
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /Zc:__cplusplus")

  set(CMAKE_NINJA_CMCLDEPS_RC OFF)
  foreach(
    flag_var
    CMAKE_C_FLAGS
    CMAKE_C_FLAGS_DEBUG
    CMAKE_C_FLAGS_RELEASE
    CMAKE_C_FLAGS_MINSIZEREL
    CMAKE_C_FLAGS_RELWITHDEBINFO
    CMAKE_CXX_FLAGS
    CMAKE_CXX_FLAGS_DEBUG
    CMAKE_CXX_FLAGS_RELEASE
    CMAKE_CXX_FLAGS_MINSIZEREL
    CMAKE_CXX_FLAGS_RELWITHDEBINFO)
    # Replace /Zi and /ZI with /Z7 (see MSVC_Z7_OVERRIDE above).
    if(MSVC_Z7_OVERRIDE)
      if(${flag_var} MATCHES "/Z[iI]")
        string(REGEX REPLACE "/Z[iI]" "/Z7" ${flag_var} "${${flag_var}}")
      endif()
    endif()

    # Select the static (/MT) or dynamic (/MD) MSVC runtime.
    # BUGFIX: was if(${CAFFE2_USE_MSVC_STATIC_RUNTIME}), which expands to the
    # invalid `if()` when the variable is unset/empty; plain dereference by
    # name is both safe and idiomatic.
    if(CAFFE2_USE_MSVC_STATIC_RUNTIME)
      if(${flag_var} MATCHES "/MD")
        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
      endif()
    else()
      if(${flag_var} MATCHES "/MT")
        string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
      endif()
    endif()

    # /bigobj increases number of sections in .obj file, which is needed to
    # link against libraries in Python 2.7 under Windows. For Visual Studio
    # generators, if /MP is not added, then we may need to add /MP to the
    # flags. For other generators like ninja, we don't need to add /MP because
    # it is already handled by the generator itself.
    if(CMAKE_GENERATOR MATCHES "Visual Studio" AND NOT ${flag_var} MATCHES
                                                   "/MP")
      set(${flag_var} "${${flag_var}} /MP /bigobj")
    else()
      set(${flag_var} "${${flag_var}} /bigobj")
    endif()
  endforeach()

  # Strip any debug-format flag from release/minsizerel configurations.
  foreach(flag_var
          CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL
          CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL)
    if(${flag_var} MATCHES "/Z[iI7]")
      string(REGEX REPLACE "/Z[iI7]" "" ${flag_var} "${${flag_var}}")
    endif()
  endforeach()

  foreach(
    flag_var
    CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO
    CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO
    CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO
    CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO
    CMAKE_SHARED_LINKER_FLAGS_DEBUG
    CMAKE_STATIC_LINKER_FLAGS_DEBUG
    CMAKE_EXE_LINKER_FLAGS_DEBUG
    CMAKE_MODULE_LINKER_FLAGS_DEBUG)
    # Switch off incremental linking in debug/relwithdebinfo builds
    if(${flag_var} MATCHES "/INCREMENTAL" AND NOT ${flag_var} MATCHES
                                              "/INCREMENTAL:NO")
      string(REGEX REPLACE "/INCREMENTAL" "/INCREMENTAL:NO" ${flag_var}
                           "${${flag_var}}")
    endif()
  endforeach()

  # Silence benign linker warnings (locally-defined imports, empty libraries,
  # missing PDBs) on every linker invocation.
  foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS
                   CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS)
    string(APPEND ${flag_var} " /ignore:4049 /ignore:4217 /ignore:4099")
  endforeach()

  foreach(flag_var CMAKE_SHARED_LINKER_FLAGS)
    # https://github.com/pytorch/pytorch/issues/91933: Don't set the manifest
    # filename explicitly helps fix the linker error when linking
    # torch_python.dll. The manifest file would still be there in the correct
    # format torch_python.dll.manifest
    if(${flag_var} MATCHES "/MANIFESTFILE:.*\\.manifest")
      string(REGEX REPLACE "/MANIFESTFILE:.*\\.manifest" "" ${flag_var}
                           "${${flag_var}}")
    endif()
  endforeach()

  # Try harder
  string(APPEND CMAKE_CUDA_FLAGS " -Xcompiler /w -w")

  # /FS serializes PDB writes so parallel compiles don't race on them.
  string(APPEND CMAKE_CXX_FLAGS " /FS")
  string(APPEND CMAKE_CUDA_FLAGS " -Xcompiler /FS")
endif()
|
|
|
|
|
|
2021-10-11 16:04:07 +00:00
|
|
|
# Pass -compress-all to fatbin so embedded device code sections are compressed.
string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")
|
2020-08-18 16:46:42 +00:00
|
|
|
|
2019-05-01 07:16:13 +00:00
|
|
|
# Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
# applicable to mobile are disabled by this variable. Setting the
# `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable forces a
# mobile build with the host toolchain, which is useful for testing purposes.
if(ANDROID
   OR IOS
   OR DEFINED ENV{BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN})
  set(INTERN_BUILD_MOBILE ON)

  message(WARNING "INTERN_BUILD_MOBILE is on, disabling BUILD_LAZY_TS_BACKEND")
  set(BUILD_LAZY_TS_BACKEND OFF)

  set(USE_KLEIDIAI OFF)
  message(WARNING "KleidiAI cannot be used on Mobile builds. Set it to OFF")

  # Put each function and datum in its own section so the linker can discard
  # unused ones when -Wl,-gc-sections is provided at link time.
  string(APPEND CMAKE_CXX_FLAGS " -ffunction-sections")
  string(APPEND CMAKE_C_FLAGS " -ffunction-sections")
  string(APPEND CMAKE_CXX_FLAGS " -fdata-sections")
  string(APPEND CMAKE_C_FLAGS " -fdata-sections")

  # Please note that the following flags are required when linking against
  # libtorch_cpu.a for mobile builds:
  #   -Wl,--whole-archive -ltorch_cpu -Wl,--no-whole-archive
  # This keeps global constructors, which are used for operator/kernel
  # registration with the PyTorch Dispatcher.

  if(DEFINED ENV{BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN})
    # C10_MOBILE is normally derived from Android/iOS toolchain macros in
    # c10/macros/Macros.h, so for host-toolchain mobile builds it has to be
    # defined explicitly here.
    string(APPEND CMAKE_CXX_FLAGS " -DC10_MOBILE")
  endif()

  if(DEFINED ENV{PYTORCH_MOBILE_TRIM_DISPATCH_KEY_SET})
    # Limits the number of dispatch keys in OperatorEntry::dispatchTable_ to
    # reduce peak memory during library initialization.
    string(APPEND CMAKE_CXX_FLAGS " -DC10_MOBILE_TRIM_DISPATCH_KEYS")
  endif()
endif()
|
|
|
|
|
|
2019-05-03 16:23:11 +00:00
|
|
|
# INTERN_BUILD_ATEN_OPS is used to control whether to build ATen/TH operators.
set(INTERN_BUILD_ATEN_OPS ON)

# Respect a user-provided USE_BLAS; otherwise default it to ON.
if(NOT DEFINED USE_BLAS)
  set(USE_BLAS ON)
endif()
|
|
|
|
|
|
2024-05-28 22:33:53 +00:00
|
|
|
# Build the libtorch mobile library, which contains ATen/TH ops and native
# support for TorchScript models, but doesn't contain not-yet-unified caffe2
# ops.
if(INTERN_BUILD_MOBILE)
  if(NOT BUILD_SHARED_LIBS AND NOT "${SELECTED_OP_LIST}" STREQUAL "")
    string(APPEND CMAKE_CXX_FLAGS " -DNO_EXPORT")
  endif()

  # Autograd is opt-in for mobile builds via BUILD_MOBILE_AUTOGRAD.
  if(BUILD_MOBILE_AUTOGRAD)
    set(INTERN_DISABLE_AUTOGRAD OFF)
  else()
    set(INTERN_DISABLE_AUTOGRAD ON)
  endif()

  # Components that do not apply to mobile.
  set(BUILD_PYTHON OFF)
  set(BUILD_FUNCTORCH OFF)
  set(USE_DISTRIBUTED OFF)
  set(NO_API ON)
  set(USE_FBGEMM OFF)
  set(INTERN_DISABLE_ONNX ON)

  # When BLAS is requested on mobile, use the Eigen-backed BLAS.
  if(USE_BLAS)
    set(INTERN_USE_EIGEN_BLAS ON)
  else()
    set(INTERN_USE_EIGEN_BLAS OFF)
  endif()

  # Disable the in-development mobile interpreter for actual mobile builds;
  # it is enabled elsewhere to capture build errors.
  set(INTERN_DISABLE_MOBILE_INTERP ON)
endif()
|
|
|
|
|
|
2018-09-26 15:43:38 +00:00
|
|
|
# ---[ Version numbers for generated libraries
# Read the default version from version.txt (relative paths in file(READ) are
# resolved against the current source directory).
file(READ version.txt TORCH_DEFAULT_VERSION)
# Strip trailing newline
string(REGEX REPLACE "\n$" "" TORCH_DEFAULT_VERSION "${TORCH_DEFAULT_VERSION}")
if("${TORCH_DEFAULT_VERSION}" STREQUAL "")
  message(WARNING "Could not get version from base 'version.txt'")
  # If we can't get the version from the version file, fall back to a
  # non-sensical 0.0.0. BUGFIX: the previous code wrote
  # `set(TORCH_DEFAULT_VERSION, "0.0.0")` — the stray comma made CMake define
  # a variable literally named "TORCH_DEFAULT_VERSION," so the fallback never
  # took effect and TORCH_DEFAULT_VERSION stayed empty.
  set(TORCH_DEFAULT_VERSION "0.0.0")
endif()
# User-overridable build version, defaulting to the contents of version.txt.
set(TORCH_BUILD_VERSION
    "${TORCH_DEFAULT_VERSION}"
    CACHE STRING "Torch build version")
if(DEFINED ENV{PYTORCH_BUILD_VERSION})
  # The PYTORCH_BUILD_VERSION environment variable takes precedence over any
  # previously cached value.
  set(TORCH_BUILD_VERSION
      "$ENV{PYTORCH_BUILD_VERSION}"
      CACHE STRING "Torch build version" FORCE)
endif()
if(NOT TORCH_BUILD_VERSION)
  # An empty string was specified so force version to the default
  set(TORCH_BUILD_VERSION
      "${TORCH_DEFAULT_VERSION}"
      CACHE STRING "Torch build version" FORCE)
endif()
# Split TORCH_BUILD_VERSION into TORCH_VERSION_MAJOR/MINOR/... components.
caffe2_parse_version_str(TORCH ${TORCH_BUILD_VERSION})
# Shared-library SOVERSION is major.minor of the parsed build version.
set(TORCH_SOVERSION "${TORCH_VERSION_MAJOR}.${TORCH_VERSION_MINOR}")
|
2018-09-26 15:43:38 +00:00
|
|
|
|
2017-09-26 15:45:37 +00:00
|
|
|
# ---[ CMake scripts + modules
# Make the Find*.cmake helpers under cmake/Modules visible to find_package().
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)

# ---[ CMake build directories
# Collect all build outputs under <build>/lib (archives + shared libs) and
# <build>/bin (executables).
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

# Enable CTest test targets for this directory and below.
enable_testing()
|
|
|
|
|
|
2018-05-24 14:47:27 +00:00
|
|
|
# ---[ Build variables set within the cmake tree
include(cmake/BuildVariables.cmake)
# Optional allowlist of files to build; empty by default. (Presumably an empty
# value means "no filtering" — confirm against its consumers.)
# FIX: corrected the grammar of the cache help string ("A allowlist" ->
# "An allowlist").
set(CAFFE2_ALLOWLIST
    ""
    CACHE STRING "An allowlist file of files that one should build.")
|
2018-05-24 14:47:27 +00:00
|
|
|
|
|
|
|
|
# Default to a Release build when the user did not choose a build type.
if(NOT CMAKE_BUILD_TYPE)
  message(STATUS "Build type not set - defaulting to Release")
  set(CMAKE_BUILD_TYPE "Release"
      CACHE STRING
            "Choose the type of build from: Debug Release RelWithDebInfo MinSizeRel Coverage."
            FORCE)
endif()
|
|
|
|
|
|
2021-01-09 01:23:38 +00:00
|
|
|
# The below means we are cross compiling for arm64 or x86_64 on MacOSX
if(NOT IOS
   AND CMAKE_SYSTEM_NAME STREQUAL "Darwin"
   AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64)$")
  set(CROSS_COMPILING_MACOSX TRUE)
  # We need to compile a universal protoc so the protobuf build doesn't fail.
  # CMAKE_TRY_COMPILE_TARGET_TYPE is set to STATIC_LIBRARY (vs executable) so
  # the cmake compiler check succeeds when cross-compiling.
  set(protoc_build_command
      "./scripts/build_host_protoc.sh --other-flags -DCMAKE_OSX_ARCHITECTURES=\"x86_64;arm64\" -DCMAKE_TRY_COMPILE_TARGET_TYPE=STATIC_LIBRARY -DCMAKE_C_COMPILER_WORKS=1 -DCMAKE_CXX_COMPILER_WORKS=1"
  )
  # We write to a temp scriptfile because CMake COMMAND dislikes double quotes
  # in commands.
  # NOTE(review): this writes temporary files into the source tree at
  # configure time (removed below); generated files normally belong in the
  # binary dir.
  file(WRITE ${PROJECT_SOURCE_DIR}/tmp_protoc_script.sh
       "#!/bin/bash\n${protoc_build_command}")
  # Copy into scripts/ with the execute bit set so it can be run directly.
  file(
    COPY ${PROJECT_SOURCE_DIR}/tmp_protoc_script.sh
    DESTINATION ${PROJECT_SOURCE_DIR}/scripts/
    FILE_PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ)
  # Run the host-protoc build synchronously at configure time.
  execute_process(
    COMMAND ./scripts/tmp_protoc_script.sh
    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
    RESULT_VARIABLE BUILD_HOST_PROTOC_RESULT)
  # Clean up both copies of the temporary script.
  file(REMOVE ${PROJECT_SOURCE_DIR}/tmp_protoc_script.sh
       ${PROJECT_SOURCE_DIR}/scripts/tmp_protoc_script.sh)
  if(NOT BUILD_HOST_PROTOC_RESULT EQUAL "0")
    message(FATAL_ERROR "Could not compile universal protoc.")
  endif()
  # Point protobuf code generation at the freshly built universal protoc.
  set(PROTOBUF_PROTOC_EXECUTABLE
      "${PROJECT_SOURCE_DIR}/build_host_protoc/bin/protoc")
  set(CAFFE2_CUSTOM_PROTOC_EXECUTABLE
      "${PROJECT_SOURCE_DIR}/build_host_protoc/bin/protoc")
endif()
|
|
|
|
|
|
2017-12-21 17:13:31 +00:00
|
|
|
# ---[ Misc checks to cope with various compiler modes
include(cmake/MiscCheck.cmake)

# External projects (provides the ExternalProject_Add command).
include(ExternalProject)
|
|
|
|
|
|
2024-05-28 22:33:53 +00:00
|
|
|
# ---[ Dependencies
# FBGEMM doesn't work on x86 32bit, and CMAKE_SYSTEM_PROCESSOR can still
# report "x86_64" there, so also check the pointer size.
if(USE_FBGEMM AND ((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64"
                    AND CMAKE_SIZEOF_VOID_P EQUAL 4)
                   OR CMAKE_SYSTEM_PROCESSOR STREQUAL "x86"))
  set(USE_FBGEMM OFF)
endif()
|
|
|
|
|
|
2022-05-05 16:57:03 +00:00
|
|
|
# oneDNN graph support is disabled here by default.
set(BUILD_ONEDNN_GRAPH OFF)
|
|
|
|
|
|
2023-09-03 23:44:39 +00:00
|
|
|
if(MSVC)
  # The source code is in utf-8 encoding; tell MSVC explicitly so it does not
  # guess the codepage from the current locale.
  append_cxx_flag_if_supported("/utf-8" CMAKE_CXX_FLAGS)
endif()
|
|
|
|
|
|
2024-05-28 22:33:53 +00:00
|
|
|
# Note for the ROCm platform:
#   1. USE_ROCM is always ON until include(cmake/Dependencies.cmake)
#   2. USE_CUDA will become OFF during re-configuration
# Truth table (FLASH default value per pass):
#   CUDA 1st pass: USE_CUDA=True;  USE_ROCM=True  -> FLASH defaults ON
#   CUDA 2nd pass: USE_CUDA=True;  USE_ROCM=False -> FLASH defaults ON
#   ROCM 1st pass: USE_CUDA=True;  USE_ROCM=True  -> FLASH defaults ON
#   ROCM 2nd pass: USE_CUDA=False; USE_ROCM=True  -> FLASH defaults ON
#   CPU 1st pass:  USE_CUDA=False (cmd option); USE_ROCM=True  -> FLASH defaults OFF
#   CPU 2nd pass:  USE_CUDA=False (cmd option); USE_ROCM=False -> FLASH defaults OFF
# Thus we cannot tell the ROCM 2nd pass and the CPU 1st pass apart.
#
# The only solution is to include(cmake/Dependencies.cmake) first and defer
# the aotriton build decision until later.

include(cmake/Dependencies.cmake)
|
|
|
|
|
|
2022-09-26 20:49:19 +00:00
|
|
|
# Flash-attention kernel: defaults to ON, but only when building with CUDA or
# ROCm and not with MSVC; otherwise forced OFF.
cmake_dependent_option(
  USE_FLASH_ATTENTION
  "Whether to build the flash_attention kernel for scaled dot product attention.\
Will be disabled if not supported by the platform"
  ON
  "USE_CUDA OR USE_ROCM;NOT MSVC"
  OFF)
|
2022-09-09 20:11:26 +00:00
|
|
|
|
2024-05-28 22:33:53 +00:00
|
|
|
# We are currently not using ALiBi attention for Flash, so we disable this
# feature by default. We don't currently document this feature because we
# don't expect users building from source to need it.
add_definitions(-DFLASHATTENTION_DISABLE_ALIBI)
|
|
|
|
|
|
2024-06-08 22:41:05 +00:00
|
|
|
# CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem
# Eff Attention won't. Defaults to ON when building with CUDA or ROCm;
# otherwise forced OFF.
cmake_dependent_option(
  USE_MEM_EFF_ATTENTION
  "Enable memory-efficient attention for scaled dot product attention.\
Will be disabled if not supported by the platform" ON
  "USE_CUDA OR USE_ROCM" OFF)
|
2024-01-04 22:21:31 +00:00
|
|
|
|
2024-08-27 18:24:27 +00:00
|
|
|
#
# Cannot be put into Dependencies.cmake due circular dependency:
# USE_FLASH_ATTENTION -> USE_ROCM -> Dependencies.cmake -> aotriton.cmake
#
if(USE_ROCM)
  # aotriton is only pulled in on Unix, and only when at least one attention
  # backend that needs it is enabled.
  if(UNIX AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION))
    include(cmake/External/aotriton.cmake)
  endif()
endif()
|
|
|
|
|
|
2023-07-05 17:08:16 +00:00
|
|
|
if(DEBUG_CUDA)
  # Embed source line information in device code for Debug and RelWithDebInfo.
  string(APPEND CMAKE_CUDA_FLAGS_DEBUG " -lineinfo")
  string(APPEND CMAKE_CUDA_FLAGS_RELWITHDEBINFO " -lineinfo")
  # CUDA 12.1 crashes when compiling with --source-in-ptx, so only pass it to
  # older toolkits. See
  # https://github.com/pytorch/pytorch/issues/102372#issuecomment-1572526893
  if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.1)
    string(APPEND CMAKE_CUDA_FLAGS_DEBUG " --source-in-ptx")
    string(APPEND CMAKE_CUDA_FLAGS_RELWITHDEBINFO " --source-in-ptx")
  endif()
endif()
|
2023-06-01 23:11:07 +00:00
|
|
|
|
2018-12-21 18:32:57 +00:00
|
|
|
# Expose the corresponding preprocessor macro when each feature is enabled.
if(USE_FBGEMM)
  string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM")
endif()

if(USE_PYTORCH_QNNPACK)
  string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_QNNPACK")
endif()
|
|
|
|
|
|
2024-09-20 16:02:32 +00:00
|
|
|
# Enable sleef on macOS with Apple silicon by default
if((${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm64"))
  message(STATUS "Running on macOS with Apple silicon")
  # The macro is added both to CMAKE_CXX_FLAGS and via add_definitions (the
  # latter applies directory-wide to all targets/languages).
  string(APPEND CMAKE_CXX_FLAGS " -DAT_BUILD_ARM_VEC256_WITH_SLEEF")
  add_definitions(-DAT_BUILD_ARM_VEC256_WITH_SLEEF)
endif()
|
|
|
|
|
|
2024-09-20 16:02:32 +00:00
|
|
|
# Enable sleef on Arm(R) architecture by default (except Android)
if((NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android")
   AND("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64"))
  # Same dual propagation as the Apple-silicon case above: CXX flags plus a
  # directory-wide definition.
  string(APPEND CMAKE_CXX_FLAGS " -DAT_BUILD_ARM_VEC256_WITH_SLEEF")
  add_definitions(-DAT_BUILD_ARM_VEC256_WITH_SLEEF)
endif()
|
|
|
|
|
|
2024-09-20 16:02:32 +00:00
|
|
|
|
Mobile Backend: NHWC memory layout + XNNPACK integration. (#33722)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/33722
In order to improve CPU performance on floating-point models on mobile, this PR introduces a new CPU backend for mobile that implements the most common mobile operators with NHWC memory layout support through integration with XNNPACK.
XNNPACK itself, and this codepath, are currently only included in the build, but the actual integration is gated with USE_XNNPACK preprocessor guards. This preprocessor symbol is intentionally not passed on to the compiler, so as to enable this rollout in multiple stages in follow up PRs. This changeset will build XNNPACK as part of the build if the identically named USE_XNNPACK CMAKE variable, defaulted to ON, is enabled, but will not actually expose or enable this code path in any other way.
Furthermore, it is worth pointing out that in order to efficiently map models to these operators, some front-end method of exposing this backend to the user is needed. The less efficient implementation would be to hook these operators into their corresponding native implementations, granted that a series of XNNPACK-specific conditions are met, much like how NNPACK is integrated with PyTorch today for instance.
Having said that, while the above implementation is still expected to outperform NNPACK based on the benchmarks I ran, the above integration would be leave a considerable gap between the performance achieved and the maximum performance potential XNNPACK enables, as it does not provide a way to compute and factor out one-time operations out of the inner most forward() loop.
The more optimal solution, and one we will decide on soon, would involve either providing a JIT pass that maps nn operators onto these newly introduced operators, while allowing one-time calculations to be factored out, much like quantized mobile models. Alternatively, new eager-mode modules can also be introduced that would directly call into these implementations either through c10 or some other mechanism, also allowing for decoupling of op creation from op execution.
This PR does not include any of the front end changes mentioned above. Neither does it include the mobile threadpool unification present in the original https://github.com/pytorch/pytorch/issues/30644. Furthermore, this codepath seems to be faster than NNPACK in a good number of use cases, which can potentially allow us to remove NNPACK from aten to make the codebase a little simpler, granted that there is widespread support for such a move.
Regardless, these changes will be introduced gradually and in a more controlled way in subsequent PRs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/32509
Test Plan:
Build: CI
Functionality: Not exposed
Reviewed By: dreiss
Differential Revision: D20069796
Pulled By: AshkanAliabadi
fbshipit-source-id: d46c1c91d4bea91979ea5bd46971ced5417d309c
2020-02-25 05:53:34 +00:00
|
|
|
# Gate the XNNPACK integration behind the USE_XNNPACK preprocessor symbol.
if(USE_XNNPACK)
  string(APPEND CMAKE_CXX_FLAGS " -DUSE_XNNPACK")
endif()
|
|
|
|
|
|
[Mobile GPU][Integration] Vulkan backend integration (#36491)
Summary:
This PR contains the initial version of Vulkan (GPU) Backend integration.
The primary target environment is Android, but the desktop build is also supported.
## CMake
Introducing three cmake options:
USE_VULKAN:
The main switch, if it is off, all other options do not affect.
USE_VULKAN_WRAPPER:
ON - Vulkan will be used loading it at runtime as "libvulkan.so" using libdl, every function call is wrapped in vulkan_wrapper.h.
OFF - linking with libvulkan.so directly
USE_VULKAN_SHADERC_RUNTIME:
ON - Shader compilation library will be linked, and shaders will be compiled runtime.
OFF - Shaders will be precompiled and shader compilation library is not included.
## Codegen
if `USE_VULKAN_SHADERC_RUNTIME` is ON:
Shaders precompilation () starts in cmake/VulkanCodegen.cmake, which calls `aten/src/ATen/native/vulkan/gen_glsl.py` or `aten/src/ATen/native/vulkan/gen_spv.py` to include shaders source or SPIR-V bytecode inside binary as uint32_t array in spv.h,spv.cpp.
if `USE_VULKAN_SHADERC_RUNTIME` is OFF:
The source of shaders is included as `glsl.h`,`glsl.cpp`.
All codegen results happen in the build directory.
## Build dependencies
cmake/Dependencies.cmake
If the target platform is Android - vulkan library, headers, Vulkan wrapper will be used from ANDROID_NDK.
Desktop build requires the VULKAN_SDK environment variable, and all vulkan dependencies will be used from it.
(Desktop build was tested only on Linux).
## Pytorch integration:
Adding 'Vulkan" as new Backend, DispatchKey, DeviceType.
We are using Strided layout without supporting strides at the moment, but we plan to support them in the future.
Using OpaqueTensorImpl where OpaqueHandle is copyable VulkanTensor,
more details in comments in `aten/src/ATen/native/vulkan/Vulkan.h`
Main code location: `aten/src/ATen/native/vulkan`
`aten/src/ATen/native/vulkan/VulkanAten.cpp` - connection link between ATen and Vulkan api (Vulkan.h) that converts at::Tensor to VulkanTensor.
`aten/src/ATen/native/Vulkan/Vulkan.h` - Vulkan API that contains VulkanTensor representation and functions to work with it. Plan to expose it for clients to be able to write their own Vulkan Ops.
`aten/src/ATen/native/vulkan/VulkanOps.cpp` - Vulkan Operations Implementations that uses Vulkan.h API
## GLSL shaders
Located in `aten/src/ATen/native/vulkan/glsl` as *.glsl files.
All shaders use Vulkan specialized constants for workgroup sizes with ids 1, 2, 3
## Supported operations
Code point:
conv2d no-groups
conv2d depthwise
addmm
upsample nearest 2d
clamp
hardtanh
## Testing
`aten/src/ATen/test/vulkan_test.cpp` - contains tests for
copy from CPU to Vulkan and back
all supported operations
Desktop builds supported, and testing can be done on a desktop that has Vulkan supported GPU or with installed software implementation of Vulkan, like https://github.com/google/swiftshader
## Vulkan execution
The initial implementation is trivial and waits every operator's execution.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/36491
Differential Revision: D21696709
Pulled By: IvanKobzarev
fbshipit-source-id: da3e5a770b1a1995e9465d7e81963e7de56217fa
2020-05-26 02:10:31 +00:00
|
|
|
if(USE_VULKAN)
  # Base Vulkan backend macros.
  string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN")
  string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_API")

  # Optional Vulkan sub-features: forward each enabled USE_VULKAN_<opt> cache
  # option as the matching preprocessor macro.
  foreach(vulkan_sub_option FP16_INFERENCE RELAXED_PRECISION)
    if(USE_VULKAN_${vulkan_sub_option})
      string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_${vulkan_sub_option}")
    endif()
  endforeach()
endif()
|
|
|
|
|
|
2021-04-07 18:37:56 +00:00
|
|
|
# Forward each enabled feature toggle as a preprocessor macro.
if(BUILD_LITE_INTERPRETER)
  string(APPEND CMAKE_CXX_FLAGS " -DBUILD_LITE_INTERPRETER")
endif()

if(TRACING_BASED)
  string(APPEND CMAKE_CXX_FLAGS " -DTRACING_BASED")
endif()

if(USE_PYTORCH_METAL)
  string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_METAL")
endif()

if(USE_PYTORCH_METAL_EXPORT)
  string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_METAL_EXPORT")
endif()
|
|
|
|
|
|
[PyTorch, Mobile] Serialization format change for source range (#54284)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/54284
In order to bring mobile deployment, via lite interpreter, on feature
parity with JIT, with respect model level debug information we must make
model level debug information available to mobile runtime.
At the moment, model level debug information is stored in SourceRange
which associates node's of graph to where the come from in original
python source code.
This information is serialized as part of debug_pkl and deserialized
when JIT loads the model and reads the model code.
On lite interpreter, we do not have access to all the functionality of
JIT and hence we cannot load model in the same way as JIT, by reading
code, constructing module hierarchy and graph corresponding module
methods etc. Instead, in lite interpreter, only the bytecode corresponding to
the compiled graph, Code, is saved.
Thus in order to annotate OPs in the bytecode with equivalent
SourceRange information we do the following:
1. During model serialization, we create a unique tag for each source
range of the model.
2. Create a map of <SourceRange, tag>
3. During debug_pkl serialization we save tag along with SourceRange, on
top of byte offset.
4. During bytecode generation, the methods of the top module are
lowered. During this process methods are inlined. In the inlined graph,
when the node of a graph is lowered to bytecode, we query node's source
range and look it up against the map.
5. Resulting source range tag is serialized in module_debug_info.
6. During model deserialization, we read all the debug_pkl records in
the archive and create a map of <tag, SourceRange>
7. This map can be used to find source code information.
During mobile runtime:
1. We read all the debug_pkl records and create <tag=debug_handle,
SourceRange> map.
1.1 This map, MobileDebugInfo, is a member of mobile Module.
2. Interpreter catches appropriate exceptions and sets the thread local
debug handle and rethrows the exception.
3. In Function's run method we catch exception and query current debug
handle where the exception happened.
4. Query MobileDebugInfo with debug handle to retrieve source range and
augment error with source range info.
This information is still incomplete as it does not contain entire
callstack.
In the following diffs we will serialize InlinedCallStack directly.
Note that compilation is gated by SYMBOLICATE_MOBILE_DEBUG_HANDLE macro,
so that mobile builds can avoid building MobileDebugInfo, source range
and source range pickler/unpickler. Later we will add path where, if
building without debug support stack trace will contain only debug
handles. They can be symbolicated later.
Test Plan:
Ported bunch of source range tests from test_jit.py. Added on more test
in test_lite_interpreter.py
Imported from OSS
Reviewed By: raziel
Differential Revision: D27174722
fbshipit-source-id: a7b7c6088ce16dec37e823c7fefa4f0b61047e12
2021-05-04 16:17:43 +00:00
|
|
|
# Source-level debug information for mobile builds; the C++ side is gated on
# the SYMBOLICATE_MOBILE_DEBUG_HANDLE macro.
if(USE_SOURCE_DEBUG_ON_MOBILE)
  string(APPEND CMAKE_CXX_FLAGS " -DSYMBOLICATE_MOBILE_DEBUG_HANDLE")
endif()

# Kineto-based edge profiler; only meaningful when the lite interpreter itself
# is being built.
if(BUILD_LITE_INTERPRETER AND USE_LITE_INTERPRETER_PROFILER)
  string(APPEND CMAKE_CXX_FLAGS " -DEDGE_PROFILER_USE_KINETO")
endif()

# Core ML delegate support.
if(USE_COREML_DELEGATE)
  string(APPEND CMAKE_CXX_FLAGS " -DUSE_COREML_DELEGATE")
endif()
|
|
|
|
|
|
2020-07-28 14:53:15 +00:00
|
|
|
# ---[ Allowlist file if allowlist is specified
include(cmake/Allowlist.cmake)

# ---[ Set link flag, handle additional deps for gcc 4.8 and above
if(CMAKE_COMPILER_IS_GNUCXX AND NOT ANDROID)
  message(
    STATUS
      "GCC ${CMAKE_CXX_COMPILER_VERSION}: Adding gcc and gcc_s libs to link line"
  )
  list(APPEND Caffe2_DEPENDENCY_LIBS gcc_s gcc)
endif()

# ---[ Build flags. Re-include to override append_cxx_flag_if_supported from
# third_party/FBGEMM.
include(cmake/public/utils.cmake)
|
Re-apply windows diff D4657831
Summary:
(Note: previous revert was due to a race condition between D4657831 and
D4659953 that I failed to catch.)
After this, we should have contbuild guarding the Windows build both with
and without CUDA.
This includes a series of changes that are needed to make Windows build,
specifically:
(1) Various flags that are needed in the cmake system, specially dealing
with /MD, /MT, cuda, cudnn, whole static linking, etc.
(2) Contbuild scripts based on AppVeyor.
(3) For Windows build, note that one will need to use "cmake --build" to
build stuff so that the build type is consistent between configuration and
actual build. see scripts\build_windows.bat for details.
(4) In logging.h, ERROR is already defined by Windows. I don't have a good
solution now, and as a result, LOG(ERROR) on windows is going to be
LOG(INFO).
(5) variable length array is not supported by MSVC (and it is not part of
C++ standard). As a result I replaced them with vectors.
(6) sched.h is not available on Windows, so akyrola 's awesome simple
async net might encounter some slowdown due to no affinity setting on
Windows.
(7) MSVC has a bug that mishandles template calls inside
a templated function call, which is a known issue that should be fixed in
MSVC 2017. However for now this means changes to conv_op_impl.h and
recurrent_net_op.h. No actual functionalities are changed.
(8) std host function calls are not supported in CUDA8+MSVC, so I changed
lp_pool (and maybe a few others) to use cuda device functions.
(9) The current Scale and Axpy has heavy templating that does not work
well with MSVC. As a result I reverted azzolini 's changes to the Scale
and Axpy interface, moved the fixed-length version to ScaleFixedSize and
AxpyFixedSize.
(10) CUDA + MSVC does not deal with Eigen well, so I guarded all Eigen
parts to only the non-CUDA part.
(11) In conclusion, it is fun but painful to deal with visual c++.
Differential Revision: D4666745
fbshipit-source-id: 3c9035083067bdb19a16d9c345c1ce66b6a86600
2017-03-07 18:56:26 +00:00
|
|
|
if(NOT MSVC)
  string(APPEND CMAKE_CXX_FLAGS " -O2 -fPIC")
  # Eigen fails to build with some versions, so convert this to a warning
  # Details at http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1459
  string(APPEND CMAKE_CXX_FLAGS " -Wall")
  string(APPEND CMAKE_CXX_FLAGS " -Wextra")
  # Promote the warnings we treat as hard errors; each is probed for support
  # first so unsupported compilers are unaffected.
  append_cxx_flag_if_supported("-Werror=return-type" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Werror=non-virtual-dtor" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Werror=braced-scalar-init" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Werror=range-loop-construct" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Werror=bool-operation" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Wnarrowing" CMAKE_CXX_FLAGS)
  # Warnings we deliberately silence.
  append_cxx_flag_if_supported("-Wno-missing-field-initializers"
                               CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Wno-unknown-pragmas" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Wno-unused-parameter" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Wno-strict-overflow" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Wno-strict-aliasing" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Wno-stringop-overflow" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Wvla-extension" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Wsuggest-override" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Wnewline-eof" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Winconsistent-missing-override"
                               CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Winconsistent-missing-destructor-override"
                               CMAKE_CXX_FLAGS)
  if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
    string(APPEND CMAKE_CXX_FLAGS " -Wno-pass-failed")
  endif()
  if(CMAKE_COMPILER_IS_GNUCXX)
    # Suppress "The ABI for passing parameters with 64-byte alignment has
    # changed in GCC 4.6"
    string(APPEND CMAKE_CXX_FLAGS " -Wno-psabi")
  endif()

  # Use ld.gold if available, fall back to ld.bfd (the default ld) if not
  if(USE_GOLD_LINKER)
    if(USE_DISTRIBUTED AND USE_MPI)
      # Same issue as here with default MPI on Ubuntu
      # https://bugs.launchpad.net/ubuntu/+source/deal.ii/+bug/1841577
      message(WARNING "Refusing to use gold when USE_MPI=1")
    else()
      # Probe the toolchain: ask the C compiler to report the linker it would
      # use with -fuse-ld=gold and look for the gold banner.
      execute_process(
        COMMAND "${CMAKE_C_COMPILER}" -fuse-ld=gold -Wl,--version
        ERROR_QUIET
        OUTPUT_VARIABLE LD_VERSION)
      if(NOT "${LD_VERSION}" MATCHES "GNU gold")
        message(
          WARNING
            "USE_GOLD_LINKER was set but ld.gold isn't available, turning it off"
        )
        set(USE_GOLD_LINKER OFF)
      else()
        message(STATUS "ld.gold is available, using it to link")
        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=gold")
        set(CMAKE_SHARED_LINKER_FLAGS
            "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=gold")
        set(CMAKE_MODULE_LINKER_FLAGS
            "${CMAKE_MODULE_LINKER_FLAGS} -fuse-ld=gold")
      endif()
    endif()
  endif()

  append_cxx_flag_if_supported("-Wno-error=old-style-cast" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Wconstant-conversion" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Wno-aligned-allocation-unavailable"
                               CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Qunused-arguments" CMAKE_CXX_FLAGS)

  if(${USE_COLORIZE_OUTPUT})
    # Why compiler checks are necessary even when `try_compile` is used:
    # because of a bug in ccache that can incorrectly identify
    # `-fcolor-diagnostics` as supported by GCC, see
    # https://github.com/ccache/ccache/issues/740 (for older ccache) and
    # https://github.com/ccache/ccache/issues/1275 (for newer ones)
    if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
      append_cxx_flag_if_supported("-fdiagnostics-color=always" CMAKE_CXX_FLAGS)
    else()
      append_cxx_flag_if_supported("-fcolor-diagnostics" CMAKE_CXX_FLAGS)
    endif()
  endif()

  append_cxx_flag_if_supported("-faligned-new" CMAKE_CXX_FLAGS)

  if(WERROR)
    append_cxx_flag_if_supported("-Werror" CMAKE_CXX_FLAGS)
    # append_cxx_flag_if_supported records support in COMPILER_SUPPORT_WERROR;
    # drop the user's request when the compiler cannot honor it.
    if(NOT COMPILER_SUPPORT_WERROR)
      set(WERROR FALSE)
    endif()
  endif()
  append_cxx_flag_if_supported("-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-fstandalone-debug" CMAKE_CXX_FLAGS_DEBUG)
  if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64"
     AND CMAKE_CXX_COMPILER_ID MATCHES "GNU")
    if(CMAKE_BUILD_TYPE MATCHES Debug)
      # BUGFIX: this was message(Warning ...). Message mode keywords are
      # case-sensitive, so "Warning" was printed as part of the text instead
      # of raising a CMake warning.
      message(
        WARNING
          "Applying -Og optimization for aarch64 GCC debug build to workaround ICE"
      )
    endif()
    string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fno-omit-frame-pointer -Og")
    string(APPEND CMAKE_LINKER_FLAGS_DEBUG " -fno-omit-frame-pointer -Og")
  else()
    string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fno-omit-frame-pointer -O0")
    string(APPEND CMAKE_LINKER_FLAGS_DEBUG " -fno-omit-frame-pointer -O0")
  endif()
  append_cxx_flag_if_supported("-fno-math-errno" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-fno-trapping-math" CMAKE_CXX_FLAGS)
  append_cxx_flag_if_supported("-Werror=format" CMAKE_CXX_FLAGS)
  if(CMAKE_COMPILER_IS_GNUCXX
     AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13)
    append_cxx_flag_if_supported("-Wno-dangling-reference" CMAKE_CXX_FLAGS)
    append_cxx_flag_if_supported("-Wno-error=dangling-reference"
                                 CMAKE_CXX_FLAGS)
    append_cxx_flag_if_supported("-Wno-error=redundant-move" CMAKE_CXX_FLAGS)
  endif()
else()
  # Define export functions for AOTI.
  add_compile_definitions(EXPORT_AOTI_FUNCTIONS)

  # skip unwanted includes from windows.h
  add_compile_definitions(WIN32_LEAN_AND_MEAN)
  # Windows SDK broke compatibility since version 25131, but introduced this
  # define for backward compatibility.
  add_compile_definitions(_UCRT_LEGACY_INFINITY)
  # disable min/max macros
  add_compile_definitions(NOMINMAX)
  # Turn off these warnings on Windows.
  # destructor was implicitly defined as deleted
  append_cxx_flag_if_supported("/wd4624" CMAKE_CXX_FLAGS)
  # unknown pragma
  append_cxx_flag_if_supported("/wd4068" CMAKE_CXX_FLAGS)
  # unexpected tokens following preprocessor directive - expected a newline
  append_cxx_flag_if_supported("/wd4067" CMAKE_CXX_FLAGS)
  # conversion from 'size_t' to 'unsigned int', possible loss of data
  append_cxx_flag_if_supported("/wd4267" CMAKE_CXX_FLAGS)
  # no suitable definition provided for explicit template instantiation request
  append_cxx_flag_if_supported("/wd4661" CMAKE_CXX_FLAGS)
  # recursive on all control paths, function will cause runtime stack overflow
  append_cxx_flag_if_supported("/wd4717" CMAKE_CXX_FLAGS)
  # conversion from '_Ty' to '_Ty', possible loss of data
  append_cxx_flag_if_supported("/wd4244" CMAKE_CXX_FLAGS)
  # unsafe use of type 'bool' in operation
  append_cxx_flag_if_supported("/wd4804" CMAKE_CXX_FLAGS)
  # inconsistent dll linkage
  append_cxx_flag_if_supported("/wd4273" CMAKE_CXX_FLAGS)
endif()
|
2017-01-05 04:36:11 +00:00
|
|
|
|
2020-09-11 23:00:24 +00:00
|
|
|
# ---[ Probe the aarch64 toolchain for the vst1q_f32_x2 / vld1q_f32_x2 NEON
# intrinsics. When a check fails, a MISSING_ARM_* macro is defined so the C++
# side can react (presumably by compiling a fallback — consumed elsewhere).
# Both probes share the same guard, so they are grouped under a single if().
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
  include(CheckCSourceCompiles)

  check_c_source_compiles(
    "#include <arm_neon.h>
     int main() {
       float a[] = {1.0, 1.0};
       float32x4x2_t v;
       v.val[0] = vcombine_f32 (vcreate_f32 (0UL), vcreate_f32 (0UL));
       v.val[1] = vcombine_f32 (vcreate_f32 (0UL), vcreate_f32 (0UL));
       vst1q_f32_x2(a, v);
       return 0;
     }"
    HAS_VST1)
  if(NOT HAS_VST1)
    string(APPEND CMAKE_CXX_FLAGS " -DMISSING_ARM_VST1")
  endif()

  check_c_source_compiles(
    "#include <arm_neon.h>
     int main() {
       float a[] = {1.0, 1.0};
       vld1q_f32_x2(a);
       return 0;
     }"
    HAS_VLD1)
  if(NOT HAS_VLD1)
    string(APPEND CMAKE_CXX_FLAGS " -DMISSING_ARM_VLD1")
  endif()
endif()
|
|
|
|
|
|
2020-08-15 00:14:27 +00:00
|
|
|
# Add code coverage flags to supported compilers. GCC uses gcov-style
# instrumentation; Clang uses its own source-based coverage.
if(USE_CPP_CODE_COVERAGE)
  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
    string(APPEND CMAKE_C_FLAGS " --coverage -fprofile-abs-path")
    string(APPEND CMAKE_CXX_FLAGS " --coverage -fprofile-abs-path")
  elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
    string(APPEND CMAKE_C_FLAGS " -fprofile-instr-generate -fcoverage-mapping")
    string(APPEND CMAKE_CXX_FLAGS
           " -fprofile-instr-generate -fcoverage-mapping")
  else()
    # BUGFIX: this was message(ERROR ...). "ERROR" is not a valid message()
    # mode keyword, so the word was merely glued onto the text and the
    # configure step continued without coverage. The user explicitly requested
    # coverage, so fail hard instead.
    message(
      FATAL_ERROR
        "Code coverage for compiler ${CMAKE_CXX_COMPILER_ID} is unsupported")
  endif()
endif()
|
|
|
|
|
|
2020-03-25 20:43:00 +00:00
|
|
|
if(APPLE)
  if(USE_MPS)
    # Objective-C++ translation units get the MPS define plus manual reference
    # counting; plain C++ only gets the define.
    string(APPEND CMAKE_OBJCXX_FLAGS " -DUSE_MPS -fno-objc-arc")
    string(APPEND CMAKE_CXX_FLAGS " -DUSE_MPS")
    # Weak-link the Metal stack so binaries still load on OS versions that
    # lack these frameworks.
    string(
      APPEND
      CMAKE_SHARED_LINKER_FLAGS
      " -weak_framework Foundation -weak_framework MetalPerformanceShaders -weak_framework MetalPerformanceShadersGraph -weak_framework Metal"
    )
    # To suppress MPSGraph availability warnings
    append_cxx_flag_if_supported("-Wno-unguarded-availability-new"
                                 CMAKE_OBJCXX_FLAGS)
  endif()
  append_cxx_flag_if_supported("-Wno-missing-braces" CMAKE_CXX_FLAGS)
endif()

if(USE_XPU)
  string(APPEND CMAKE_CXX_FLAGS " -DUSE_XPU")
endif()

if(EMSCRIPTEN)
  string(
    APPEND
    CMAKE_CXX_FLAGS
    " -Wno-implicit-function-declaration -DEMSCRIPTEN -s DISABLE_EXCEPTION_CATCHING=0"
  )
endif()

append_cxx_flag_if_supported("-Wno-stringop-overflow" CMAKE_CXX_FLAGS)
|
2018-08-31 20:08:20 +00:00
|
|
|
|
Test application for profiling, CMake params for debug symbols (#28406)
Summary:
Reason:
To have one-step build for test android application based on the current code state that is ready for profiling with simpleperf, systrace etc. to profile performance inside the application.
## Parameters to control debug symbols stripping
Introducing /CMakeLists parameter `ANDROID_DEBUG_SYMBOLS` to be able not to strip symbols for pytorch (not add linker flag `-s`)
which is checked in `scripts/build_android.sh`
On gradle side stripping happens by default, and to prevent it we have to specify
```
android {
packagingOptions {
doNotStrip "**/*.so"
}
}
```
which is now controlled by new gradle property `nativeLibsDoNotStrip `
## Test_App
`android/test_app` - android app with one MainActivity that does inference in cycle
`android/build_test_app.sh` - script to build libtorch with debug symbols for specified android abis and adds `NDK_DEBUG=1` and `-PnativeLibsDoNotStrip=true` to keep all debug symbols for profiling.
Script assembles all debug flavors:
```
└─ $ find . -type f -name *apk
./test_app/app/build/outputs/apk/mobilenetQuant/debug/test_app-mobilenetQuant-debug.apk
./test_app/app/build/outputs/apk/resnet/debug/test_app-resnet-debug.apk
```
## Different build configurations
Module for inference can be set in `android/test_app/app/build.gradle` as a BuildConfig parameters:
```
productFlavors {
mobilenetQuant {
dimension "model"
applicationIdSuffix ".mobilenetQuant"
buildConfigField ("String", "MODULE_ASSET_NAME", buildConfigProps('MODULE_ASSET_NAME_MOBILENET_QUANT'))
addManifestPlaceholders([APP_NAME: "PyMobileNetQuant"])
buildConfigField ("String", "LOGCAT_TAG", "\"pytorch-mobilenet\"")
}
resnet {
dimension "model"
applicationIdSuffix ".resnet"
buildConfigField ("String", "MODULE_ASSET_NAME", buildConfigProps('MODULE_ASSET_NAME_RESNET18'))
addManifestPlaceholders([APP_NAME: "PyResnet"])
buildConfigField ("String", "LOGCAT_TAG", "\"pytorch-resnet\"")
}
```
In that case we can setup several apps on the same device for comparison, to separate packages `applicationIdSuffix`: 'org.pytorch.testapp.mobilenetQuant' and different application names and logcat tags as `manifestPlaceholder` and another BuildConfig parameter:
```
─ $ adb shell pm list packages | grep pytorch
package:org.pytorch.testapp.mobilenetQuant
package:org.pytorch.testapp.resnet
```
In future we can add another BuildConfig params e.g. single/multi threads and other configuration for profiling.
At the moment 2 flavors - for resnet18 and for mobilenetQuantized
which can be installed on connected device:
```
cd android
```
```
gradle test_app:installMobilenetQuantDebug
```
```
gradle test_app:installResnetDebug
```
## Testing:
```
cd android
sh build_test_app.sh
adb install -r test_app/app/build/outputs/apk/mobilenetQuant/debug/test_app-mobilenetQuant-debug.apk
```
```
cd $ANDROID_NDK
python simpleperf/run_simpleperf_on_device.py record --app org.pytorch.testapp.mobilenetQuant -g --duration 10 -o /data/local/tmp/perf.data
adb pull /data/local/tmp/perf.data
python simpleperf/report_html.py
```
Simpleperf report has all symbols:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/28406
Differential Revision: D18386622
Pulled By: IvanKobzarev
fbshipit-source-id: 3a751192bbc4bc3c6d7f126b0b55086b4d586e7a
2019-11-08 22:17:15 +00:00
|
|
|
# Strip debug symbols from Android builds unless ANDROID_DEBUG_SYMBOLS was
# requested (see scripts/build_android.sh, which sets that option).
if(ANDROID AND (NOT ANDROID_DEBUG_SYMBOLS))
  if(CMAKE_COMPILER_IS_GNUCXX)
    # GCC accepts -s as a compile flag.
    string(APPEND CMAKE_CXX_FLAGS " -s")
  elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
    # Clang: disable debug info generation instead.
    string(APPEND CMAKE_CXX_FLAGS " -g0")
  else()
    # Otherwise strip at link time.
    string(APPEND CMAKE_EXE_LINKER_FLAGS " -s")
  endif()
endif()
|
|
|
|
|
|
2017-05-22 17:21:13 +00:00
|
|
|
# On non-Apple Unix, add the dynamic-loader library to the dependency list.
# Use CMAKE_DL_LIBS instead of the hardcoded name "dl": CMake expands it to
# the correct library per platform (and to nothing where dlopen lives in libc).
if(NOT APPLE AND UNIX)
  list(APPEND Caffe2_DEPENDENCY_LIBS ${CMAKE_DL_LIBS})
endif()
|
|
|
|
|
|
2024-05-28 22:33:53 +00:00
|
|
|
# Prefix path to Caffe2 headers. If a directory containing installed Caffe2
# headers was inadvertently added to the list of include directories, prefixing
# PROJECT_SOURCE_DIR means this source tree always takes precedence.
include_directories(BEFORE ${PROJECT_SOURCE_DIR})

# Prefix path to generated Caffe2 headers. These need to take precedence over
# their empty counterparts located in PROJECT_SOURCE_DIR.
include_directories(BEFORE ${PROJECT_BINARY_DIR})

# ATen headers: both the in-tree sources and the generated ones in the build
# tree. (Directory-scoped include_directories is intentional here — these
# paths apply to every target declared below.)
include_directories(BEFORE ${PROJECT_SOURCE_DIR}/aten/src/)
include_directories(BEFORE ${CMAKE_BINARY_DIR}/aten/src/)
|
2018-07-31 04:02:13 +00:00
|
|
|
|
2023-06-27 08:53:23 +00:00
|
|
|
if(USE_MIMALLOC)
  # Configure the vendored mimalloc as a minimal static library: no global
  # malloc override, no shared/object variants, no tests.
  set(MI_OVERRIDE OFF)
  set(MI_BUILD_SHARED OFF)
  set(MI_BUILD_OBJECT OFF)
  set(MI_BUILD_TESTS OFF)
  add_definitions(-DUSE_MIMALLOC)
  add_subdirectory(third_party/mimalloc)
  include_directories(third_party/mimalloc/include)
endif()
|
|
|
|
|
|
[Windows][cpu] mkl use mimalloc as allocator on Windows (#138419)
We did a lot of optimization for PyTorch Windows, and we got good progress of it. But still some models have performance gap between PyTorch Windows and PyTorch Linux. Ref: https://pytorch.org/blog/performance-boost-windows/#conclusion
From the blog conclusion, we found the `ResNet50` is typical case of it.
Let's focus on the `ResNet50`, and collect the profiling log:
```cmd
(nightly) D:\xu_git\dnnl_cb>python test_script_resnet50.py
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
model_inference 3.91% 682.427ms 100.00% 17.448s 17.448s 1
aten::conv2d 0.18% 30.906ms 64.79% 11.305s 2.133ms 5300
aten::convolution 0.45% 78.031ms 64.62% 11.275s 2.127ms 5300
aten::_convolution 0.30% 51.670ms 64.17% 11.196s 2.113ms 5300
aten::mkldnn_convolution 63.58% 11.093s 63.87% 11.145s 2.103ms 5300
aten::batch_norm 0.13% 23.536ms 20.10% 3.506s 661.580us 5300
aten::_batch_norm_impl_index 0.28% 49.486ms 19.96% 3.483s 657.139us 5300
aten::native_batch_norm 19.26% 3.360s 19.64% 3.427s 646.615us 5300
aten::max_pool2d 0.01% 1.038ms 5.84% 1.018s 10.181ms 100
aten::max_pool2d_with_indices 5.83% 1.017s 5.83% 1.017s 10.171ms 100
aten::add_ 3.38% 588.907ms 3.38% 588.907ms 85.349us 6900
aten::relu_ 0.35% 60.358ms 1.67% 292.155ms 59.624us 4900
aten::clamp_min_ 1.33% 231.797ms 1.33% 231.797ms 47.306us 4900
aten::empty 0.46% 80.195ms 0.46% 80.195ms 1.513us 53000
aten::linear 0.01% 927.300us 0.23% 39.353ms 393.532us 100
aten::addmm 0.20% 35.379ms 0.21% 37.016ms 370.155us 100
aten::empty_like 0.12% 20.455ms 0.17% 29.976ms 5.656us 5300
aten::as_strided_ 0.11% 18.830ms 0.11% 18.830ms 3.553us 5300
aten::adaptive_avg_pool2d 0.00% 419.900us 0.08% 14.265ms 142.647us 100
aten::mean 0.01% 1.737ms 0.08% 13.845ms 138.448us 100
aten::sum 0.05% 8.113ms 0.05% 8.648ms 86.479us 100
aten::resize_ 0.03% 5.182ms 0.03% 5.182ms 0.978us 5300
aten::div_ 0.01% 1.445ms 0.02% 3.460ms 34.600us 100
aten::to 0.00% 337.000us 0.01% 2.015ms 20.154us 100
aten::_to_copy 0.01% 977.500us 0.01% 1.678ms 16.784us 100
aten::copy_ 0.01% 1.474ms 0.01% 1.474ms 7.371us 200
aten::t 0.00% 775.900us 0.01% 1.410ms 14.104us 100
aten::flatten 0.00% 420.900us 0.01% 1.311ms 13.106us 100
aten::view 0.01% 889.700us 0.01% 889.700us 8.897us 100
aten::transpose 0.00% 410.700us 0.00% 634.500us 6.345us 100
aten::expand 0.00% 496.800us 0.00% 566.800us 5.668us 100
aten::fill_ 0.00% 534.800us 0.00% 534.800us 5.348us 100
aten::as_strided 0.00% 293.800us 0.00% 293.800us 1.469us 200
aten::empty_strided 0.00% 241.700us 0.00% 241.700us 2.417us 100
aten::resolve_conj 0.00% 54.800us 0.00% 54.800us 0.274us 200
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 17.448s
Execution time: 20.02380895614624
```
We found the major kernel consume CPU resource is `aten::mkldnn_convolution`. It was dispatched to `MKLDNN`.
Acturally, we had optimized memory allocation via integrated mimalloc to pytorch C10 module. It helps PyTorch Windows boost a lot, but it does not cover `MKL` and `MKLDNN`'s intermediary temporary memory.
We still have potential to improve PyTorch Windows performance via optimize `MKL` and `MKLDNN`'s intermediary temporary memory.
So, I discussed with Intel MKL team, and get a method to register high performance memory allocation API to MKL, and it would help MKL to boost memory performance. Please check the online document: https://www.intel.com/content/www/us/en/docs/onemkl/developer-guide-windows/2023-0/redefining-memory-functions.html
This PR is optimize MKL memory alloction performance on Windows, via register mi_malloc to MKL. PR Changes:
1. Add cmake option: `USE_MIMALLOC_ON_MKL`, It is sub-option of `USE_MIMALLOC`.
2. Wrap and export mi_malloc APIs in C10, when `USE_MIMALLOC_ON_MKL` is `ON`.
3. Add MklAllocationHelp.cpp to register allocation APIs to MKL, when `USE_MIMALLOC_ON_MKL` is `ON`.
For `oneDNN`, it is still tracking in this proposal: https://github.com/oneapi-src/oneDNN/issues/1898
Pull Request resolved: https://github.com/pytorch/pytorch/pull/138419
Approved by: https://github.com/jgong5, https://github.com/ezyang
2024-10-24 05:29:47 +00:00
|
|
|
# Sub-option of USE_MIMALLOC: also route MKL's internal allocations through
# mimalloc (the registration itself happens in C++ behind this define).
if(USE_MIMALLOC AND USE_MIMALLOC_ON_MKL)
  add_definitions(-DUSE_MIMALLOC_ON_MKL)
endif()
|
|
|
|
|
|
2016-12-12 17:29:00 +00:00
|
|
|
# ---[ Main build: c10 (core library) first, then caffe2.
add_subdirectory(c10)
add_subdirectory(caffe2)
|
2016-12-08 18:23:04 +00:00
|
|
|
|
2024-05-28 22:33:53 +00:00
|
|
|
# ---[ CMake related files: Uninstall option.
if(NOT TARGET caffe2_uninstall)
  configure_file(
    ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in
    ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake IMMEDIATE @ONLY)

  add_custom_target(
    caffe2_uninstall COMMAND ${CMAKE_COMMAND} -P
                             ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake)
endif()
|
2017-10-19 16:55:56 +00:00
|
|
|
|
2024-05-28 22:33:53 +00:00
|
|
|
# ---[ Make configuration files for cmake to allow dependent libraries easier
# access to Caffe2.

if((NOT USE_GLOG)
   OR (NOT USE_GFLAGS)
   OR BUILD_CUSTOM_PROTOBUF)
  message(WARNING "Generated cmake files are only fully tested if one builds "
                  "with system glog, gflags, and protobuf. Other settings may "
                  "generate files that are not well tested.")
endif()
|
2018-02-21 05:39:00 +00:00
|
|
|
|
2020-03-25 20:43:00 +00:00
|
|
|
# Placeholder: currently a no-op for CUDA/ROCm builds.
if(USE_CUDA OR USE_ROCM)
  # TODO: check if we should include other cuda dependency libraries to the
  # interface as well.
endif()
|
|
|
|
|
|
2024-05-28 22:33:53 +00:00
|
|
|
# Note(jiayq): when building static libraries, all PRIVATE dependencies will
# also become interface libraries, and as a result if there are any dependency
# libraries that are not exported, the following install export script will
# fail. As a result, we will only provide the targets cmake files for shared
# lib installation. For more info, read:
# https://cmake.org/pipermail/cmake/2016-May/063400.html
if(BUILD_SHARED_LIBS)
  # Package config file.
  configure_file(${PROJECT_SOURCE_DIR}/cmake/Caffe2Config.cmake.in
                 ${PROJECT_BINARY_DIR}/Caffe2Config.cmake @ONLY)
  install(
    FILES ${PROJECT_BINARY_DIR}/Caffe2Config.cmake
    DESTINATION share/cmake/Caffe2
    COMPONENT dev)
  # Public helper modules referenced by Caffe2Config.cmake.
  install(
    FILES ${PROJECT_SOURCE_DIR}/cmake/public/cuda.cmake
          ${PROJECT_SOURCE_DIR}/cmake/public/xpu.cmake
          ${PROJECT_SOURCE_DIR}/cmake/public/glog.cmake
          ${PROJECT_SOURCE_DIR}/cmake/public/gflags.cmake
          ${PROJECT_SOURCE_DIR}/cmake/public/mkl.cmake
          ${PROJECT_SOURCE_DIR}/cmake/public/mkldnn.cmake
          ${PROJECT_SOURCE_DIR}/cmake/public/protobuf.cmake
          ${PROJECT_SOURCE_DIR}/cmake/public/utils.cmake
          ${PROJECT_SOURCE_DIR}/cmake/public/LoadHIP.cmake
    DESTINATION share/cmake/Caffe2/public
    COMPONENT dev)
  install(
    DIRECTORY ${PROJECT_SOURCE_DIR}/cmake/Modules_CUDA_fix
    DESTINATION share/cmake/Caffe2/
    COMPONENT dev)
  # Find modules shipped alongside the config files. These were four separate
  # install(FILES ...) calls with identical destination/component; merged into
  # one for brevity — the installed layout is unchanged.
  install(
    FILES ${PROJECT_SOURCE_DIR}/cmake/Modules/FindCUDAToolkit.cmake
          ${PROJECT_SOURCE_DIR}/cmake/Modules/FindCUSPARSELT.cmake
          ${PROJECT_SOURCE_DIR}/cmake/Modules/FindCUDSS.cmake
          ${PROJECT_SOURCE_DIR}/cmake/Modules/FindSYCLToolkit.cmake
    DESTINATION share/cmake/Caffe2/
    COMPONENT dev)
  if(NOT BUILD_LIBTORCHLESS)
    install(
      EXPORT Caffe2Targets
      DESTINATION share/cmake/Caffe2
      FILE Caffe2Targets.cmake
      COMPONENT dev)
  endif()
else()
  message(WARNING "Generated cmake files are only available when building "
                  "shared libs.")
endif()
|
2017-10-26 19:20:50 +00:00
|
|
|
|
2024-05-28 22:33:53 +00:00
|
|
|
# ---[ Binaries
# Binaries will be built after the Caffe2 main libraries and the modules are
# built. For the binaries, they will be linked to the Caffe2 main libraries, as
# well as all the modules that are built with Caffe2 (the ones built in the
# previous Modules section above).
if(BUILD_BINARY)
  add_subdirectory(binaries)
endif()
|
2018-03-06 22:45:21 +00:00
|
|
|
|
2019-11-15 21:54:00 +00:00
|
|
|
# ---[ JNI
if(BUILD_JNI)
  if(NOT MSVC)
    # NOTE(review): relaxes -Wunused-variable globally for everything compiled
    # after this point — presumably needed by the pytorch_android sources;
    # confirm it cannot be scoped to that subdirectory's targets.
    string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-variable")
  endif()
  # Configure the fbjni-based Android bindings before descending.
  set(BUILD_LIBTORCH_WITH_JNI 1)
  set(FBJNI_SKIP_TESTS 1)
  add_subdirectory(android/pytorch_android)
endif()
|
|
|
|
|
|
2018-03-01 20:01:44 +00:00
|
|
|
# ---[ Configuration summary: print the resolved build options at the end of
# configuration so they appear together in the CMake output.
include(cmake/Summary.cmake)
caffe2_print_configuration_summary()
|
2021-01-29 03:27:29 +00:00
|
|
|
|
2022-09-13 16:36:57 +00:00
|
|
|
# ---[ functorch
if(BUILD_FUNCTORCH)
  add_subdirectory(functorch)
endif()
|
2023-10-23 14:00:54 +00:00
|
|
|
|
|
|
|
|
# Parse custom debug info.
# USE_CUSTOM_DEBINFO names individual source files that should be compiled
# with debug info (-g) without switching the whole build to a debug config.
if(DEFINED USE_CUSTOM_DEBINFO)
  # Accept either a CMake list (";"-separated) or a space-separated string and
  # normalize it into a proper CMake list.
  string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}")
  message(STATUS "Source files with custom debug infos: ${SOURCE_FILES}")
  string(REGEX REPLACE " +" ";" SOURCE_FILES_LIST "${SOURCE_FILES}")

  # We have to specify the scope here. We do this by specifying the targets we
  # care about and caffe2/ for all test targets defined there.
  # This computation is loop-invariant, so do it once instead of once per
  # source file (the original ran it inside the foreach below).
  if(BUILD_LIBTORCHLESS)
    caffe2_update_option(USE_CUDA OFF)
    set(ALL_PT_TARGETS "torch_python;${C10_LIB};${TORCH_CPU_LIB};${TORCH_LIB}")
  else()
    # @todo test if we can remove this
    set(ALL_PT_TARGETS "torch_python;c10;torch_cpu;torch")
  endif()

  # Set the COMPILE_FLAGS property for each source file.
  foreach(SOURCE_FILE ${SOURCE_FILES_LIST})
    set_source_files_properties(
      ${SOURCE_FILE} DIRECTORY "caffe2/" TARGET_DIRECTORY ${ALL_PT_TARGETS}
      PROPERTIES COMPILE_FLAGS "-g")
  endforeach()

  # Link everything with debug info when any file is in debug mode.
  string(APPEND CMAKE_EXE_LINKER_FLAGS " -g")
  string(APPEND CMAKE_SHARED_LINKER_FLAGS " -g")
endif()
|
2024-02-15 02:08:57 +00:00
|
|
|
|
|
|
|
|
# Bundle PTXAS if needed.
# Copies the CUDA toolkit's ptxas into the build tree and installs it next to
# the other binaries so a self-contained package ships its own assembler.
if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
  # Fix: the guard used to test ${PROJECT_SOURCE_DIR}/build/bin/ptxas, which
  # is never the copy destination below, so the existence check could not
  # correspond to the file actually copied/installed. Check the real
  # destination (${PROJECT_BINARY_DIR}/ptxas) instead.
  if(NOT EXISTS "${PROJECT_BINARY_DIR}/ptxas")
    message(STATUS "Copying PTXAS into the bin folder")
    file(COPY "${CUDAToolkit_BIN_DIR}/ptxas"
         DESTINATION "${PROJECT_BINARY_DIR}")
  endif()
  install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas"
          DESTINATION "${CMAKE_INSTALL_BINDIR}")
endif()
|