pytorch/CMakeLists.txt

cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
#cmake_policy(SET CMP0022 NEW)
#cmake_policy(SET CMP0023 NEW)
# Use compiler ID "AppleClang" instead of "Clang" for XCode.
# Not setting this sometimes causes the XCode C compiler to be detected as "Clang",
# even when the C++ one is detected as "AppleClang".
cmake_policy(SET CMP0010 NEW)
cmake_policy(SET CMP0025 NEW)
# Suppress warning flags in default MSVC configuration. It's not
# mandatory that we do this (and we don't if cmake is old), but it's
# nice when it's possible, and it's possible on our Windows configs.
if(NOT CMAKE_VERSION VERSION_LESS 3.15.0)
  cmake_policy(SET CMP0092 NEW)
endif()
if(NOT CMAKE_VERSION VERSION_LESS 3.10)
  set(FIND_CUDA_MODULE_DEPRECATED ON)
endif()
# ---[ Project and semantic versioning.
project(Torch CXX C)
if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
  set(LINUX TRUE)
else()
  set(LINUX FALSE)
endif()
set(CMAKE_INSTALL_MESSAGE NEVER)
# check and set CMAKE_CXX_STANDARD
string(FIND "${CMAKE_CXX_FLAGS}" "-std=c++" env_cxx_standard)
if(env_cxx_standard GREATER -1)
  message(
      WARNING "C++ standard version definition detected in environment variable. "
      "PyTorch requires -std=c++14. Please remove -std=c++ settings in your environment.")
endif()
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_C_STANDARD 11)
if(DEFINED GLIBCXX_USE_CXX11_ABI)
  if(${GLIBCXX_USE_CXX11_ABI} EQUAL 1)
    set(CXX_STANDARD_REQUIRED ON)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
  endif()
endif()
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
# This variable determines whether the current cmake process is being run
# together with the main Caffe2 library. This is useful for building modules:
# if modules are built with the main Caffe2 library, then there is no need to
# find Caffe2 in the cmake script. One can usually guard it like
# if(NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO)
#   find_package(Caffe2 REQUIRED)
# endif()
set(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO ON)
# Googletest's cmake files are going to set it on once they are processed. Let's
# set it at the very beginning so that the entire build is deterministic.
set(THREADS_PREFER_PTHREAD_FLAG ON)
if(NOT DEFINED BLAS_SET_BY_USER)
  if(DEFINED BLAS)
    set(BLAS_SET_BY_USER TRUE)
  else()
    message(STATUS "Not forcing any particular BLAS to be found")
    set(BLAS_SET_BY_USER FALSE)
  endif()
  set(BLAS_SET_BY_USER ${BLAS_SET_BY_USER} CACHE STRING "Marks whether BLAS was manually set by user or auto-detected")
endif()
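# Illustrative usage only (hedged; the set of accepted BLAS names is validated
# elsewhere in the build, e.g. MKL or OpenBLAS): pre-selecting a BLAS on the
# command line is what marks BLAS_SET_BY_USER above, e.g.
#   cmake -DBLAS=OpenBLAS ..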
# Apple specific
if(APPLE)
  # These lines are an attempt to make find_package(cuda) pick up
  # libcuda.dylib, and not cuda.framework. It doesn't work all
  # the time, but it seems to help for some users.
  # TODO: replace this with a more robust fix
  set(CMAKE_FIND_FRAMEWORK LAST)
  set(CMAKE_FIND_APPBUNDLE LAST)
  # Get clang version on macOS
  execute_process(COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE clang_full_version_string)
  string(REGEX REPLACE "Apple LLVM version ([0-9]+\\.[0-9]+).*" "\\1" CLANG_VERSION_STRING ${clang_full_version_string})
  message(STATUS "CLANG_VERSION_STRING: " ${CLANG_VERSION_STRING})
  # RPATH stuff
  set(CMAKE_MACOSX_RPATH ON)
  if(NOT IOS)
    # Determine if we can link against ML Compute
    set(MLCOMPUTE_FOUND OFF)
    execute_process(
      COMMAND bash -c "xcrun --sdk macosx --show-sdk-path"
      OUTPUT_VARIABLE _macosx_sdk_path
      OUTPUT_STRIP_TRAILING_WHITESPACE)
    set(_SDK_SEARCH_PATH "${_macosx_sdk_path}/System/Library/Frameworks/")
    set(_FRAMEWORK_SEARCH_PATH "/System/Library/Frameworks/")
    find_library(_MLCompute_fwrk_path_ NAMES MLCompute PATHS ${_FRAMEWORK_SEARCH_PATH} NO_DEFAULT_PATH)
    find_library(_MLCompute_sdk_path_ NAMES MLCompute PATHS ${_SDK_SEARCH_PATH} NO_DEFAULT_PATH)
    if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mlc)
      set(_MLC_FOLDER_EXISTS YES)
    else()
      set(_MLC_FOLDER_EXISTS NO)
    endif()
    if(_MLCompute_fwrk_path_ AND _MLCompute_sdk_path_ AND _MLC_FOLDER_EXISTS)
      set(MLCOMPUTE_FOUND ON)
      message(STATUS "ML Compute framework found")
    else()
      message(STATUS "ML Compute framework not found")
    endif()
  endif()
endif()
set(CPU_AARCH64 OFF)
set(CPU_INTEL OFF)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "(AMD64|x86_64)")
  set(CPU_INTEL ON)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64)")
  set(CPU_AARCH64 ON)
endif()
# For non-supported platforms, turn USE_DISTRIBUTED off by default.
# It is not tested and likely won't work without additional changes.
if(NOT LINUX AND NOT WIN32)
  set(USE_DISTRIBUTED OFF CACHE STRING "Use distributed")
  # On macOS, if USE_DISTRIBUTED is enabled (specified by the user),
  # then make Gloo build with the libuv transport.
  if(APPLE AND USE_DISTRIBUTED)
    set(USE_LIBUV ON CACHE STRING "")
  endif()
endif()
# ---[ Options.
# Note to developers: if you add an option below, make sure you also add it to
# cmake/Summary.cmake so that the summary prints out the option values.
include(CMakeDependentOption)
option(ATEN_NO_TEST "Do not build ATen test binaries" OFF)
option(BUILD_BINARY "Build C++ binaries" OFF)
option(BUILD_DOCS "Build Caffe2 documentation" OFF)
option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" ON)
option(BUILD_PYTHON "Build Python binaries" ON)
option(BUILD_CAFFE2 "Master flag to build Caffe2" ON)
option(BUILD_LITE_INTERPRETER "Master flag to build Lite Interpreter" OFF)
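# Illustrative usage (taken from the PR notes for this option): the lite
# interpreter build is typically driven through the mobile build scripts, e.g.
#   BUILD_LITE_INTERPRETER=1 ./scripts/build_pytorch_android.sh x86
#   BUILD_PYTORCH_MOBILE=1 IOS_PLATFORM=SIMULATOR BUILD_LITE_INTERPRETER=1 ./scripts/build_ios.sh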
cmake_dependent_option(
BUILD_CAFFE2_OPS "Build Caffe2 operators" ON
"BUILD_CAFFE2" OFF)
cmake_dependent_option(
BUILD_CAFFE2_MOBILE "Build libcaffe2 for mobile (deprecating)" OFF
"BUILD_CAFFE2" OFF)
option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON)
cmake_dependent_option(
CAFFE2_LINK_LOCAL_PROTOBUF "If set, build protobuf inside libcaffe2.so." ON
"BUILD_SHARED_LIBS AND BUILD_CUSTOM_PROTOBUF" OFF)
cmake_dependent_option(
CAFFE2_USE_MSVC_STATIC_RUNTIME "Using MSVC static runtime libraries" ON
"NOT BUILD_SHARED_LIBS" OFF)
option(BUILD_TEST "Build C++ test binaries (need gtest and gbenchmark)" OFF)
option(BUILD_STATIC_RUNTIME_BENCHMARK "Build C++ binaries for static runtime benchmarks (need gbenchmark)" OFF)
option(BUILD_TENSOREXPR_BENCHMARK "Build C++ binaries for tensorexpr benchmarks (need gbenchmark)" OFF)
option(BUILD_MOBILE_BENCHMARK "Build C++ benchmark binaries for mobile (ARM) targets (needs gtest and gbenchmark)" OFF)
option(BUILD_MOBILE_TEST "Build C++ test binaries for mobile (ARM) targets (needs gtest and gbenchmark)" OFF)
option(BUILD_JNI "Build JNI bindings" OFF)
option(BUILD_MOBILE_AUTOGRAD "Build autograd function in mobile build (in development)" OFF)
cmake_dependent_option(
INSTALL_TEST "Install test binaries if BUILD_TEST is on" ON
"BUILD_TEST" OFF)
option(USE_CPP_CODE_COVERAGE "Compile C/C++ with code coverage flags" OFF)
option(COLORIZE_OUTPUT "Colorize output during compilation" ON)
option(USE_ASAN "Use Address Sanitizer" OFF)
option(USE_TSAN "Use Thread Sanitizer" OFF)
option(USE_CUDA "Use CUDA" ON)
# BUILD_SPLIT_CUDA must also be exported as an environment variable before building, with
# `export BUILD_SPLIT_CUDA=1` because cpp_extension.py can only work properly if this variable
# also exists in the environment.
# This option is incompatible with CUDA_SEPARABLE_COMPILATION.
cmake_dependent_option(
BUILD_SPLIT_CUDA "Split torch_cuda library into torch_cuda_cu and torch_cuda_cpp" OFF
"USE_CUDA AND NOT CUDA_SEPARABLE_COMPILATION" OFF)
option(USE_FAST_NVCC "Use parallel NVCC build" OFF)
option(USE_ROCM "Use ROCm" ON)
option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF)
cmake_dependent_option(
USE_CUDNN "Use cuDNN" ON
"USE_CUDA" OFF)
cmake_dependent_option(
USE_STATIC_CUDNN "Use cuDNN static libraries" OFF
"USE_CUDNN" OFF)
option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON)
option(USE_KINETO "Use Kineto profiling library" ON)
option(USE_CUPTI_SO "Use CUPTI as a shared library" OFF)
option(USE_FAKELOWP "Use FakeLowp operators" OFF)
option(USE_FFMPEG "Use ffmpeg" OFF)
option(USE_GFLAGS "Use GFLAGS" OFF)
option(USE_GLOG "Use GLOG" OFF)
option(USE_LEVELDB "Use LEVELDB" OFF)
option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF)
option(USE_LMDB "Use LMDB" OFF)
option(USE_METAL "Use Metal for Caffe2 iOS build" ON)
option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(
USE_MLCOMPUTE "Use ML Compute for macOS build" ON
"MLCOMPUTE_FOUND" OFF)
cmake_dependent_option(
USE_NCCL "Use NCCL" ON
"USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON
USE_NCCL OFF)
cmake_dependent_option(
USE_STATIC_NCCL "Use static NCCL" OFF
"USE_NCCL" OFF)
cmake_dependent_option(
USE_SYSTEM_NCCL "Use system-wide NCCL" OFF
"USE_NCCL" OFF)
option(USE_NNAPI "Use NNAPI" OFF)
option(USE_NNPACK "Use NNPACK" ON)
cmake_dependent_option(
USE_NUMA "Use NUMA. Only available on Linux." ON
"LINUX" OFF)
cmake_dependent_option(
USE_NVRTC "Use NVRTC. Only available if USE_CUDA is on." OFF
"USE_CUDA" OFF)
option(USE_NUMPY "Use NumPy" ON)
option(USE_OBSERVERS "Use observers module." OFF)
option(USE_OPENCL "Use OpenCL" OFF)
option(USE_OPENCV "Use OpenCV" OFF)
option(USE_OPENMP "Use OpenMP for parallel code" ON)
option(USE_PROF "Use profiling" OFF)
option(USE_QNNPACK "Use QNNPACK (quantized 8-bit operators)" ON)
option(USE_PYTORCH_QNNPACK "Use ATen/QNNPACK (quantized 8-bit operators)" ON)
option(USE_REDIS "Use Redis" OFF)
option(USE_ROCKSDB "Use RocksDB" OFF)
option(USE_SNPE "Use Qualcomm's SNPE library" OFF)
option(USE_SYSTEM_EIGEN_INSTALL
"Use system Eigen instead of the one under third_party" OFF)
option(USE_TENSORRT "Using Nvidia TensorRT library" OFF)
cmake_dependent_option(
USE_VALGRIND "Use Valgrind. Only available on Linux." ON
"LINUX" OFF)
if(NOT DEFINED USE_VULKAN)
  cmake_dependent_option(
    USE_VULKAN "Use Vulkan GPU backend" ON
    "ANDROID" OFF)
endif()
option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF)
option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF)
option(USE_VULKAN_SHADERC_RUNTIME "Vulkan - Use runtime shader compilation as opposed to build-time (needs libshaderc)" OFF)
option(USE_XNNPACK "Use XNNPACK" ON)
option(USE_ZMQ "Use ZMQ" OFF)
option(USE_ZSTD "Use ZSTD" OFF)
# Ensure that an MKLDNN build is the default for x86 CPUs
# but optional for AArch64 (dependent on -DUSE_MKLDNN).
cmake_dependent_option(
USE_MKLDNN "Use MKLDNN. Only available on x86, x86_64, and AArch64." "${CPU_INTEL}"
"CPU_INTEL OR CPU_AARCH64" OFF)
set(MKLDNN_ENABLE_CONCURRENT_EXEC ${USE_MKLDNN})
cmake_dependent_option(
USE_MKLDNN_CBLAS "Use CBLAS in MKLDNN" OFF
"USE_MKLDNN" OFF)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(
USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." ON
"USE_DISTRIBUTED" OFF)
cmake_dependent_option(
USE_GLOO "Use Gloo. Only available if USE_DISTRIBUTED is on." ON
"USE_DISTRIBUTED" OFF)
cmake_dependent_option(
USE_TENSORPIPE "Use TensorPipe. Only available if USE_DISTRIBUTED is on." ON
"USE_DISTRIBUTED" OFF)
option(USE_TBB "Use TBB" OFF)
option(ONNX_ML "Enable traditional ONNX ML API." ON)
option(HAVE_SOVERSION "Whether to add SOVERSION to the shared objects" OFF)
cmake_dependent_option(
USE_DEPLOY "Build embedded torch::deploy interpreter" OFF
"BUILD_PYTHON" OFF)
cmake_dependent_option(USE_CCACHE "Attempt using CCache to wrap the compilation" ON "UNIX" OFF)
if(USE_CCACHE)
  find_program(CCACHE_PROGRAM ccache)
  if(CCACHE_PROGRAM)
    set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
    set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
    set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
  else()
    message(STATUS "Could not find ccache. Consider installing ccache to speed up compilation.")
  endif()
endif()
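# Illustration only: with -DUSE_CCACHE=OFF, the compiler launchers can still be
# pointed at ccache manually through the standard CMake cache variables, e.g.
#   cmake -DUSE_CCACHE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache ..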
# Since TensorPipe does not support Windows, set it to OFF when WIN32 is detected.
# On Windows, if the user has not installed libuv in the build conda env and has
# not set the libuv_ROOT environment variable, set USE_DISTRIBUTED to OFF.
if(WIN32)
  set(USE_TENSORPIPE OFF)
  message(WARNING "TensorPipe cannot be used on Windows. Set it to OFF")
  if(USE_DISTRIBUTED AND NOT DEFINED ENV{libuv_ROOT})
    find_library(
      libuv_tmp_LIBRARY
      NAMES uv libuv
      HINTS $ENV{CONDA_PREFIX}\\Library $ENV{PREFIX}\\Library
      PATH_SUFFIXES lib
      NO_DEFAULT_PATH)
    if(NOT libuv_tmp_LIBRARY)
      set(USE_DISTRIBUTED OFF)
      set(USE_GLOO OFF)
      message(
        WARNING "Libuv is not installed in the current conda env. Setting USE_DISTRIBUTED to OFF. "
        "Please run 'conda install -c conda-forge libuv=1.39' to install libuv.")
    else()
      set(ENV{libuv_ROOT} ${libuv_tmp_LIBRARY}/../../)
    endif()
  endif()
endif()
# Linux distributions do not want too many embedded sources, so we need to be
# able to build pytorch with an (almost) empty third_party directory.
# USE_SYSTEM_LIBS is a shortcut variable to toggle all the USE_SYSTEM_*
# variables on. Individual USE_SYSTEM_* variables can be toggled with
# USE_SYSTEM_LIBS being "OFF".
option(USE_SYSTEM_LIBS "Use all available system-provided libraries." OFF)
option(USE_SYSTEM_CPUINFO "Use system-provided cpuinfo." OFF)
option(USE_SYSTEM_SLEEF "Use system-provided sleef." OFF)
option(USE_SYSTEM_GLOO "Use system-provided gloo." OFF)
option(USE_SYSTEM_FP16 "Use system-provided fp16." OFF)
option(USE_SYSTEM_PTHREADPOOL "Use system-provided pthreadpool." OFF)
option(USE_SYSTEM_PSIMD "Use system-provided psimd." OFF)
option(USE_SYSTEM_FXDIV "Use system-provided fxdiv." OFF)
option(USE_SYSTEM_BENCHMARK "Use system-provided google benchmark." OFF)
option(USE_SYSTEM_ONNX "Use system-provided onnx." OFF)
option(USE_SYSTEM_XNNPACK "Use system-provided xnnpack." OFF)
if(USE_SYSTEM_LIBS)
  set(USE_SYSTEM_CPUINFO ON)
  set(USE_SYSTEM_SLEEF ON)
  set(USE_SYSTEM_GLOO ON)
  set(BUILD_CUSTOM_PROTOBUF OFF)
  set(USE_SYSTEM_EIGEN_INSTALL ON)
  set(USE_SYSTEM_FP16 ON)
  set(USE_SYSTEM_PTHREADPOOL ON)
  set(USE_SYSTEM_PSIMD ON)
  set(USE_SYSTEM_FXDIV ON)
  set(USE_SYSTEM_BENCHMARK ON)
  set(USE_SYSTEM_ONNX ON)
  set(USE_SYSTEM_XNNPACK ON)
endif()
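# Illustration only: as noted above, individual USE_SYSTEM_* flags can also be
# enabled without turning on the USE_SYSTEM_LIBS shortcut, e.g.
#   cmake -DUSE_SYSTEM_CPUINFO=ON -DUSE_SYSTEM_ONNX=ON ..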
# Used when building Caffe2 through setup.py
option(BUILDING_WITH_TORCH_LIBS "Tell cmake if Caffe2 is being built alongside torch libs" ON)
# /Z7 override option
# When generating debug symbols, CMake defaults to using the flag /Zi.
# However, it is not compatible with sccache, so we rewrite it to /Z7.
# But some users don't use sccache; this override is for them.
cmake_dependent_option(
MSVC_Z7_OVERRIDE "Work around sccache bug by replacing /Zi and /ZI with /Z7 when using MSVC (if you are not using sccache, you can turn this OFF)" ON
"MSVC" OFF)
if(NOT USE_SYSTEM_ONNX)
  set(ONNX_NAMESPACE "onnx_torch" CACHE STRING "A namespace for ONNX; needed to build with other frameworks that share ONNX.")
else()
  set(ONNX_NAMESPACE "onnx" CACHE STRING "A namespace for ONNX; needed to build with other frameworks that share ONNX.")
endif()
set(SELECTED_OP_LIST "" CACHE STRING
"Path to the yaml file that contains the list of operators to include for custom build. Include all operators by default.")
set(OP_DEPENDENCY "" CACHE STRING
"Path to the yaml file that contains the op dependency graph for custom build.")
set(STATIC_DISPATCH_BACKEND "" CACHE STRING
"Name of the backend for which static dispatch code is generated, e.g.: CPU.")
# This is a fix for a rare build issue on Ubuntu:
# symbol lookup error: miniconda3/envs/pytorch-py3.7/lib/libmkl_intel_lp64.so: undefined symbol: mkl_blas_dsyrk
# https://software.intel.com/en-us/articles/symbol-lookup-error-when-linking-intel-mkl-with-gcc-on-ubuntu
if(LINUX)
  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--no-as-needed")
endif()
if(MSVC)
  foreach(flag_var
      CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
      CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
      CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
      CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
    # Replace /Zi and /ZI with /Z7
    if(MSVC_Z7_OVERRIDE)
      if(${flag_var} MATCHES "/Z[iI]")
        string(REGEX REPLACE "/Z[iI]" "/Z7" ${flag_var} "${${flag_var}}")
      endif(${flag_var} MATCHES "/Z[iI]")
    endif(MSVC_Z7_OVERRIDE)
    # Turn off warnings on Windows. In an ideal world we'd be warning
    # clean on Windows too, but this is too much work for our
    # non-Windows developers.
    #
    # NB: Technically, this is not necessary if CMP0092 was applied
    # properly, but only cmake >= 3.15 has this policy, so we nail
    # it one more time just to be safe.
    #
    # NB2: This is NOT enough to prevent warnings from nvcc on MSVC. At the
    # moment only CMP0092 is enough to prevent those warnings too.
    string(REPLACE "/W3" "" ${flag_var} "${${flag_var}}")
    # Turn off warnings (the Windows build is currently extremely warning
    # unclean and the warnings aren't telling us anything useful.)
    string(APPEND ${flag_var} " /w")
if(${CAFFE2_USE_MSVC_STATIC_RUNTIME})
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif()
else()
if(${flag_var} MATCHES "/MT")
string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
endif()
endif()
# /bigobj increases the number of sections in the .obj file, which is
# needed to link against libraries in Python 2.7 under Windows.
# For Visual Studio generators, we may need to add /MP to the flags
# ourselves if it is not already present.
# For other generators like Ninja, we don't need to add /MP because
# parallelism is already handled by the generator itself.
if(CMAKE_GENERATOR MATCHES "Visual Studio" AND NOT ${flag_var} MATCHES "/MP")
set(${flag_var} "${${flag_var}} /MP /bigobj")
else()
set(${flag_var} "${${flag_var}} /bigobj")
endif()
endforeach(flag_var)
foreach(flag_var
CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL)
if(${flag_var} MATCHES "/Z[iI7]")
string(REGEX REPLACE "/Z[iI7]" "" ${flag_var} "${${flag_var}}")
endif()
endforeach(flag_var)
foreach(flag_var
CMAKE_SHARED_LINKER_FLAGS_DEBUG CMAKE_STATIC_LINKER_FLAGS_DEBUG
CMAKE_EXE_LINKER_FLAGS_DEBUG CMAKE_MODULE_LINKER_FLAGS_DEBUG)
# Switch off incremental linking in debug builds
if(${flag_var} MATCHES "/INCREMENTAL" AND NOT ${flag_var} MATCHES "/INCREMENTAL:NO")
string(REGEX REPLACE "/INCREMENTAL" "/INCREMENTAL:NO" ${flag_var} "${${flag_var}}")
endif()
endforeach(flag_var)
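# NB: LNK4049 and LNK4217 are MSVC's "locally defined symbol imported" warnings,
# typically emitted when a symbol declared __declspec(dllimport) is in fact
# linked in statically; they are assumed benign here, so silence them below.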
foreach(flag_var
CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS
CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS)
string(APPEND ${flag_var} " /ignore:4049 /ignore:4217")
endforeach(flag_var)
# Try harder: nvcc needs BOTH -w and -Xcompiler /w to suppress its warnings
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "/w" "-w")
endif(MSVC)
list(APPEND CUDA_NVCC_FLAGS "-Xfatbin" "-compress-all")
list(APPEND CUDA_NVCC_FLAGS_DEBUG "-Xfatbin" "-compress-all")
list(APPEND CUDA_NVCC_FLAGS_RELWITHDEBINFO "-Xfatbin" "-compress-all")
if(NOT MSVC)
list(APPEND CUDA_NVCC_FLAGS_DEBUG "-g" "-lineinfo" "--source-in-ptx")
list(APPEND CUDA_NVCC_FLAGS_RELWITHDEBINFO "-g" "-lineinfo" "--source-in-ptx")
endif(NOT MSVC)
# Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
# applicable to mobile are disabled by this variable.
# Setting the `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable
# forces a mobile build with the host toolchain, which is useful for testing
# purposes.
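# For illustration only (any build driver that eventually invokes CMake works
# the same way), such a build might be kicked off with:
#   BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN=1 python setup.py build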
if(ANDROID OR IOS OR DEFINED ENV{BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN})
set(INTERN_BUILD_MOBILE ON)
if(DEFINED ENV{BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN})
# C10_MOBILE is derived from Android/iOS toolchain macros in
# c10/macros/Macros.h, so it needs to be explicitly set here.
string(APPEND CMAKE_CXX_FLAGS " -DC10_MOBILE")
endif()
endif()
# INTERN_BUILD_ATEN_OPS is used to control whether to build ATen/TH operators.
# It's disabled for the caffe2 mobile library.
if(INTERN_BUILD_MOBILE AND BUILD_CAFFE2_MOBILE)
set(INTERN_BUILD_ATEN_OPS OFF)
else()
set(INTERN_BUILD_ATEN_OPS ON)
endif()
# BUILD_CAFFE2_MOBILE is the master switch to choose between the libcaffe2 and libtorch mobile builds.
# When it's enabled, it builds the original libcaffe2 mobile library without ATen/TH ops or TorchScript support.
# When it's disabled, it builds the libtorch mobile library, which contains ATen/TH ops and native support for
# TorchScript models, but doesn't contain the not-yet-unified caffe2 ops.
if(INTERN_BUILD_MOBILE AND NOT BUILD_CAFFE2_MOBILE)
if(NOT BUILD_SHARED_LIBS AND NOT "${SELECTED_OP_LIST}" STREQUAL "")
string(APPEND CMAKE_CXX_FLAGS " -DNO_EXPORT")
endif()
if(BUILD_MOBILE_AUTOGRAD)
set(INTERN_DISABLE_AUTOGRAD OFF)
else()
set(INTERN_DISABLE_AUTOGRAD ON)
endif()
set(BUILD_PYTHON OFF)
set(BUILD_CAFFE2_OPS OFF)
set(USE_DISTRIBUTED OFF)
set(FEATURE_TORCH_MOBILE ON)
set(NO_API ON)
set(USE_FBGEMM OFF)
set(USE_QNNPACK OFF)
set(INTERN_DISABLE_ONNX ON)
set(INTERN_USE_EIGEN_BLAS ON)
# Disable the in-development mobile interpreter for actual mobile builds.
# Keep it enabled elsewhere so that build errors are still caught.
set(INTERN_DISABLE_MOBILE_INTERP ON)
endif()
# ---[ Utils
include(cmake/public/utils.cmake)
# ---[ Version numbers for generated libraries
file(READ version.txt TORCH_DEFAULT_VERSION)
# Strip trailing newline
string(REGEX REPLACE "\n$" "" TORCH_DEFAULT_VERSION "${TORCH_DEFAULT_VERSION}")
if("${TORCH_DEFAULT_VERSION} " STREQUAL " ")
message(WARNING "Could not get version from base 'version.txt'")
# If we can't get the version from the version file we should probably
# set it to something nonsensical like 0.0.0
set(TORCH_DEFAULT_VERSION "0.0.0")
endif()
set(TORCH_BUILD_VERSION "${TORCH_DEFAULT_VERSION}" CACHE STRING "Torch build version")
if(DEFINED ENV{PYTORCH_BUILD_VERSION})
set(TORCH_BUILD_VERSION "$ENV{PYTORCH_BUILD_VERSION}"
CACHE STRING "Torch build version" FORCE)
endif()
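# For example (illustrative, not a pinned value), packaging scripts may export
# something like PYTORCH_BUILD_VERSION=1.8.0 before configuring to pin the version.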
if(NOT TORCH_BUILD_VERSION)
# An empty string was specified so force version to the default
set(TORCH_BUILD_VERSION "${TORCH_DEFAULT_VERSION}"
CACHE STRING "Torch build version" FORCE)
endif()
caffe2_parse_version_str(TORCH ${TORCH_BUILD_VERSION})
caffe2_parse_version_str(CAFFE2 ${TORCH_BUILD_VERSION})
set(TORCH_SOVERSION "${TORCH_VERSION_MAJOR}.${TORCH_VERSION_MINOR}")
# ---[ CMake scripts + modules
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
# ---[ CMake build directories
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
enable_testing()
# ---[ Build variables set within the cmake tree
include(cmake/BuildVariables.cmake)
set(CAFFE2_ALLOWLIST "" CACHE STRING "An allowlist file of files that one should build.")
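# Illustrative usage: an allowlist can be passed at configure time, e.g.
#   cmake -DCAFFE2_ALLOWLIST=/path/to/allowlist.txt <source-dir>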
# Set default build type
if(NOT CMAKE_BUILD_TYPE)
message(STATUS "Build type not set - defaulting to Release")
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build from: Debug Release RelWithDebInfo MinSizeRel Coverage." FORCE)
endif()
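# The default can be overridden at configure time, e.g. (illustrative):
#   cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo <source-dir>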
# The below means we are cross-compiling for arm64 or x86_64 on macOS
if(NOT IOS AND CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64)$")
set(CROSS_COMPILING_MACOSX TRUE)
# We need to compile a universal protoc so that the protobuf build does not fail
# We set CMAKE_TRY_COMPILE_TARGET_TYPE to STATIC_LIBRARY (vs executable) so that the cmake compiler check succeeds when cross-compiling
set(protoc_build_command "./scripts/build_host_protoc.sh --other-flags -DCMAKE_OSX_ARCHITECTURES=\"x86_64;arm64\" -DCMAKE_TRY_COMPILE_TARGET_TYPE=STATIC_LIBRARY -DCMAKE_C_COMPILER_WORKS=1 -DCMAKE_CXX_COMPILER_WORKS=1")
# We write to a temporary script file because CMake's COMMAND dislikes double quotes in commands
file(WRITE ${PROJECT_SOURCE_DIR}/tmp_protoc_script.sh "#!/bin/bash\n${protoc_build_command}")
file(COPY ${PROJECT_SOURCE_DIR}/tmp_protoc_script.sh DESTINATION ${PROJECT_SOURCE_DIR}/scripts/ FILE_PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ)
execute_process(COMMAND ./scripts/tmp_protoc_script.sh
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
RESULT_VARIABLE BUILD_HOST_PROTOC_RESULT)
file(REMOVE ${PROJECT_SOURCE_DIR}/tmp_protoc_script.sh ${PROJECT_SOURCE_DIR}/scripts/tmp_protoc_script.sh)
if(NOT BUILD_HOST_PROTOC_RESULT EQUAL "0")
message(FATAL_ERROR "Could not compile universal protoc.")
endif()
set(PROTOBUF_PROTOC_EXECUTABLE "${PROJECT_SOURCE_DIR}/build_host_protoc/bin/protoc")
set(CAFFE2_CUSTOM_PROTOC_EXECUTABLE "${PROJECT_SOURCE_DIR}/build_host_protoc/bin/protoc")
endif()
# ---[ Misc checks to cope with various compiler modes
include(cmake/MiscCheck.cmake)
# External projects
include(ExternalProject)
# ---[ Dependencies
# ---[ FBGEMM doesn't work on 32-bit x86; note that CMAKE_SYSTEM_PROCESSOR may still report it as 64-bit
if(USE_FBGEMM AND ((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND CMAKE_SIZEOF_VOID_P EQUAL 4) OR CMAKE_SYSTEM_PROCESSOR STREQUAL "x86"))
set(USE_FBGEMM OFF)
endif()
include(cmake/Dependencies.cmake)
if(USE_FBGEMM)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM")
endif()
if(USE_QNNPACK)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_QNNPACK")
endif()
if(USE_PYTORCH_QNNPACK)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_QNNPACK")
endif()
if(USE_XNNPACK)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_XNNPACK")
endif()
if(USE_VULKAN)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN")
string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_API")
if(USE_VULKAN_FP16_INFERENCE)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_FP16_INFERENCE")
endif()
if(USE_VULKAN_RELAXED_PRECISION)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_RELAXED_PRECISION")
endif()
if(USE_VULKAN_SHADERC_RUNTIME)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_SHADERC_RUNTIME")
endif()
endif()
if(USE_PYTORCH_METAL)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_METAL")
endif()
# ---[ Allowlist file if allowlist is specified
include(cmake/Allowlist.cmake)
# ---[ Set link flag, handle additional deps for gcc 4.8 and above
if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.8.0 AND NOT ANDROID)
message(STATUS "GCC ${CMAKE_CXX_COMPILER_VERSION}: Adding gcc and gcc_s libs to link line")
list(APPEND Caffe2_DEPENDENCY_LIBS gcc_s gcc)
endif()
# ---[ Build flags
if(NOT MSVC)
string(APPEND CMAKE_CXX_FLAGS " -O2 -fPIC")
string(APPEND CMAKE_CXX_FLAGS " -Wno-narrowing")
# Eigen fails to build with some versions, so convert this to a warning
# Details at http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1459
string(APPEND CMAKE_CXX_FLAGS " -Wall")
string(APPEND CMAKE_CXX_FLAGS " -Wextra")
string(APPEND CMAKE_CXX_FLAGS " -Werror=return-type")
string(APPEND CMAKE_CXX_FLAGS " -Wno-missing-field-initializers")
string(APPEND CMAKE_CXX_FLAGS " -Wno-type-limits")
string(APPEND CMAKE_CXX_FLAGS " -Wno-array-bounds")
string(APPEND CMAKE_CXX_FLAGS " -Wno-unknown-pragmas")
string(APPEND CMAKE_CXX_FLAGS " -Wno-sign-compare")
string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-parameter")
string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-variable")
string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-function")
string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-result")
string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-local-typedefs")
string(APPEND CMAKE_CXX_FLAGS " -Wno-strict-overflow")
string(APPEND CMAKE_CXX_FLAGS " -Wno-strict-aliasing")
string(APPEND CMAKE_CXX_FLAGS " -Wno-error=deprecated-declarations")
if(CMAKE_COMPILER_IS_GNUCXX AND NOT (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0.0))
string(APPEND CMAKE_CXX_FLAGS " -Wno-stringop-overflow")
endif()
if(CMAKE_COMPILER_IS_GNUCXX)
# Suppress "The ABI for passing parameters with 64-byte alignment has changed in GCC 4.6"
string(APPEND CMAKE_CXX_FLAGS " -Wno-psabi")
endif()
string(APPEND CMAKE_CXX_FLAGS " -Wno-error=pedantic")
string(APPEND CMAKE_CXX_FLAGS " -Wno-error=redundant-decls")
string(APPEND CMAKE_CXX_FLAGS " -Wno-error=old-style-cast")
# These flags are not available in GCC-4.8.5. Set only when using clang.
# Compared against https://gcc.gnu.org/onlinedocs/gcc-4.8.5/gcc/Option-Summary.html
if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
string(APPEND CMAKE_CXX_FLAGS " -Wno-invalid-partial-specialization")
string(APPEND CMAKE_CXX_FLAGS " -Wno-typedef-redefinition")
string(APPEND CMAKE_CXX_FLAGS " -Wno-unknown-warning-option")
string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-private-field")
string(APPEND CMAKE_CXX_FLAGS " -Wno-inconsistent-missing-override")
string(APPEND CMAKE_CXX_FLAGS " -Wno-aligned-allocation-unavailable")
string(APPEND CMAKE_CXX_FLAGS " -Wno-c++14-extensions")
string(APPEND CMAKE_CXX_FLAGS " -Wno-constexpr-not-const")
string(APPEND CMAKE_CXX_FLAGS " -Wno-missing-braces")
string(APPEND CMAKE_CXX_FLAGS " -Qunused-arguments")
if(${COLORIZE_OUTPUT})
string(APPEND CMAKE_CXX_FLAGS " -fcolor-diagnostics")
endif()
endif()
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9)
if(${COLORIZE_OUTPUT})
string(APPEND CMAKE_CXX_FLAGS " -fdiagnostics-color=always")
endif()
endif()
if((APPLE AND (NOT ("${CLANG_VERSION_STRING}" VERSION_LESS "9.0")))
OR(CMAKE_COMPILER_IS_GNUCXX
AND(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0 AND NOT APPLE)))
string(APPEND CMAKE_CXX_FLAGS " -faligned-new")
endif()
if(WERROR)
check_cxx_compiler_flag("-Werror" COMPILER_SUPPORT_WERROR)
if(NOT COMPILER_SUPPORT_WERROR)
set(WERROR FALSE)
else()
string(APPEND CMAKE_CXX_FLAGS " -Werror")
endif()
endif(WERROR)
if(NOT APPLE)
string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-but-set-variable")
string(APPEND CMAKE_CXX_FLAGS " -Wno-maybe-uninitialized")
endif()
string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fno-omit-frame-pointer -O0")
string(APPEND CMAKE_LINKER_FLAGS_DEBUG " -fno-omit-frame-pointer -O0")
string(APPEND CMAKE_CXX_FLAGS " -fno-math-errno")
string(APPEND CMAKE_CXX_FLAGS " -fno-trapping-math")
check_cxx_compiler_flag("-Werror=format" HAS_WERROR_FORMAT)
if(HAS_WERROR_FORMAT)
string(APPEND CMAKE_CXX_FLAGS " -Werror=format")
endif()
check_cxx_compiler_flag("-Werror=cast-function-type" HAS_WERROR_CAST_FUNCTION_TYPE)
if(HAS_WERROR_CAST_FUNCTION_TYPE)
string(APPEND CMAKE_CXX_FLAGS " -Werror=cast-function-type")
endif()
endif()
if(USE_ASAN)
string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fsanitize=address")
string(APPEND CMAKE_LINKER_FLAGS_DEBUG " -fsanitize=address")
endif()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
include(CheckCSourceCompiles)
check_c_source_compiles("#include <arm_neon.h>
int main() {
float a[] = {1.0, 1.0};
float32x4x2_t v;
v.val[0] = vcombine_f32 (vcreate_f32 (0UL), vcreate_f32 (0UL));
v.val[1] = vcombine_f32 (vcreate_f32 (0UL), vcreate_f32 (0UL));
vst1q_f32_x2(a, v);
return 0;
}" HAS_VST1)
if(NOT HAS_VST1)
string(APPEND CMAKE_CXX_FLAGS " -DMISSING_ARM_VST1")
endif()
endif()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
include(CheckCSourceCompiles)
check_c_source_compiles("#include <arm_neon.h>
int main() {
float a[] = {1.0, 1.0};
vld1q_f32_x2(a);
return 0;
}" HAS_VLD1)
if(NOT HAS_VLD1)
string(APPEND CMAKE_CXX_FLAGS " -DMISSING_ARM_VLD1")
endif()
endif()
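# When either intrinsic is unavailable, the -DMISSING_ARM_VST1 / -DMISSING_ARM_VLD1
# defines above are presumably consumed by the C++ sources to select a
# non-intrinsic fallback path.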
# Add code coverage flags to supported compilers
if(USE_CPP_CODE_COVERAGE)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
string(APPEND CMAKE_C_FLAGS " --coverage -fprofile-abs-path")
string(APPEND CMAKE_CXX_FLAGS " --coverage -fprofile-abs-path")
elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
string(APPEND CMAKE_C_FLAGS " -fprofile-instr-generate -fcoverage-mapping")
string(APPEND CMAKE_CXX_FLAGS " -fprofile-instr-generate -fcoverage-mapping")
else()
    message(FATAL_ERROR "Code coverage for compiler ${CMAKE_CXX_COMPILER_ID} is unsupported")
endif()
endif()
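# Note: the instrumented binaries emit coverage data at run time; post-processing
# happens outside this build (typically gcov/lcov for the GCC flags above, and
# llvm-profdata/llvm-cov for the Clang flags).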
if(APPLE)
if(USE_MLCOMPUTE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_MLCOMPUTE -fobjc-arc -framework MLCompute -framework Metal")
endif()
string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-private-field")
string(APPEND CMAKE_CXX_FLAGS " -Wno-missing-braces")
string(APPEND CMAKE_CXX_FLAGS " -Wno-c++14-extensions")
string(APPEND CMAKE_CXX_FLAGS " -Wno-constexpr-not-const")
endif()
if(EMSCRIPTEN)
string(APPEND CMAKE_CXX_FLAGS " -Wno-implicit-function-declaration -DEMSCRIPTEN -s DISABLE_EXCEPTION_CATCHING=0")
endif()
if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0.0)
string(APPEND CMAKE_CXX_FLAGS " -Wno-stringop-overflow")
endif()
if(ANDROID AND (NOT ANDROID_DEBUG_SYMBOLS))
if(CMAKE_COMPILER_IS_GNUCXX)
string(APPEND CMAKE_CXX_FLAGS " -s")
elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
string(APPEND CMAKE_CXX_FLAGS " -g0")
else()
string(APPEND CMAKE_EXE_LINKER_FLAGS " -s")
endif()
endif()
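# Illustrative only (not part of the build logic): to keep debug symbols in an
# Android build for profiling tools such as simpleperf, ANDROID_DEBUG_SYMBOLS
# can be set at configure time so the stripping flags above are skipped, e.g.
#   cmake -DANDROID_DEBUG_SYMBOLS=1 ..
# The remaining Android toolchain arguments are omitted here.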
if(NOT APPLE AND UNIX)
list(APPEND Caffe2_DEPENDENCY_LIBS dl)
endif()
# Prefix path to Caffe2 headers.
# If a directory containing installed Caffe2 headers was inadvertently
# added to the list of include directories, prefixing
# PROJECT_SOURCE_DIR means this source tree always takes precedence.
include_directories(BEFORE ${PROJECT_SOURCE_DIR})
# Prefix path to generated Caffe2 headers.
# These need to take precedence over their empty counterparts located
# in PROJECT_SOURCE_DIR.
include_directories(BEFORE ${PROJECT_BINARY_DIR})
include_directories(BEFORE ${PROJECT_SOURCE_DIR}/aten/src/)
include_directories(BEFORE ${PROJECT_BINARY_DIR}/aten/src/)
# ---[ Main build
add_subdirectory(c10)
add_subdirectory(caffe2)
# ---[ Documentation
if(BUILD_DOCS)
# check if Doxygen is installed
find_package(Doxygen)
if(DOXYGEN_FOUND)
message("Generating documentation")
set(DOXYGEN_C_IN ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/.Doxyfile-c)
set(DOXYGEN_C_OUT ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/Doxyfile-c)
set(DOXYGEN_P_IN ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/.Doxyfile-python)
set(DOXYGEN_P_OUT ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/Doxyfile-python)
if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/docs)
file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/docs)
endif()
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/docs)
configure_file(${DOXYGEN_C_IN} ${DOXYGEN_C_OUT} @ONLY)
configure_file(${DOXYGEN_P_IN} ${DOXYGEN_P_OUT} @ONLY)
add_custom_target(doc_doxygen_c ALL
COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_C_OUT}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMENT "Generating C++ API documentation with Doxygen"
VERBATIM)
add_custom_target(doc_doxygen_python ALL
COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_P_OUT}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMENT "Generating Python API documentation with Doxygen"
VERBATIM)
else()
message(FATAL_ERROR "Doxygen needs to be installed to generate the documentation")
endif()
endif()
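# Illustrative only: documentation is opted into with the BUILD_DOCS option and
# requires Doxygen to be installed. Both doc targets above are part of ALL, so a
# regular build generates the docs, e.g.
#   cmake -DBUILD_DOCS=ON .. && cmake --build .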
# ---[ CMake related files
# Uninstall option.
if(NOT TARGET caffe2_uninstall)
configure_file(
${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake
IMMEDIATE @ONLY)
add_custom_target(caffe2_uninstall
COMMAND ${CMAKE_COMMAND} -P
${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake)
endif()
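# Illustrative only: once the project has been installed, the uninstall script
# can be invoked through the custom target above, e.g.
#   cmake --build . --target caffe2_uninstall
# which runs the configured cmake_uninstall.cmake in script mode.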
# ---[ Make configuration files for cmake to allow dependent libraries
# easier access to Caffe2.
if((NOT USE_GLOG) OR (NOT USE_GFLAGS) OR BUILD_CUSTOM_PROTOBUF)
message(WARNING
"Generated cmake files are only fully tested if one builds "
"with system glog, gflags, and protobuf. Other settings may "
"generate files that are not well tested.")
endif()
if(USE_CUDA OR USE_ROCM)
# TODO: check if we should include other cuda dependency libraries
# to the interface as well.
endif()
# Note(jiayq): when building static libraries, all PRIVATE dependencies also
# become part of the link interface, so if any dependency library is not
# exported, the install export script below will fail. We therefore only
# provide the targets cmake files for shared library installation. For more
# info, read:
# https://cmake.org/pipermail/cmake/2016-May/063400.html
if(BUILD_SHARED_LIBS)
configure_file(
${PROJECT_SOURCE_DIR}/cmake/Caffe2ConfigVersion.cmake.in
${PROJECT_BINARY_DIR}/Caffe2ConfigVersion.cmake
@ONLY)
configure_file(
${PROJECT_SOURCE_DIR}/cmake/Caffe2Config.cmake.in
${PROJECT_BINARY_DIR}/Caffe2Config.cmake
@ONLY)
install(FILES
${PROJECT_BINARY_DIR}/Caffe2ConfigVersion.cmake
${PROJECT_BINARY_DIR}/Caffe2Config.cmake
DESTINATION share/cmake/Caffe2
COMPONENT dev)
install(FILES
${PROJECT_SOURCE_DIR}/cmake/public/cuda.cmake
${PROJECT_SOURCE_DIR}/cmake/public/glog.cmake
${PROJECT_SOURCE_DIR}/cmake/public/gflags.cmake
${PROJECT_SOURCE_DIR}/cmake/public/mkl.cmake
${PROJECT_SOURCE_DIR}/cmake/public/mkldnn.cmake
${PROJECT_SOURCE_DIR}/cmake/public/protobuf.cmake
${PROJECT_SOURCE_DIR}/cmake/public/threads.cmake
${PROJECT_SOURCE_DIR}/cmake/public/utils.cmake
DESTINATION share/cmake/Caffe2/public
COMPONENT dev)
install(DIRECTORY
${PROJECT_SOURCE_DIR}/cmake/Modules_CUDA_fix
DESTINATION share/cmake/Caffe2/
COMPONENT dev)
install(EXPORT Caffe2Targets DESTINATION share/cmake/Caffe2
FILE Caffe2Targets.cmake
COMPONENT dev)
else()
message(WARNING
"Generated cmake files are only available when building "
"shared libs.")
endif()
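# Illustrative downstream usage (a sketch only, not part of this build): a
# project consuming the files installed above would typically configure against
# the install prefix and do something like
#   find_package(Caffe2 REQUIRED CONFIG PATHS <install-prefix>/share/cmake/Caffe2)
#   target_link_libraries(my_app PRIVATE <exported Caffe2 target>)
# The exported target names are defined by Caffe2Targets.cmake and are not
# spelled out here.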
# ---[ Modules
# If the master flag for building Caffe2 is disabled, we also disable the
# build of the Caffe2-related operator modules.
if(BUILD_CAFFE2)
add_subdirectory(modules)
endif()
# ---[ Binaries
# Binaries are built after the Caffe2 main libraries and the modules. They are
# linked against the Caffe2 main libraries as well as all modules built with
# Caffe2 (the ones built in the Modules section above).
if(BUILD_BINARY)
add_subdirectory(binaries)
endif()
# ---[ JNI
if(BUILD_JNI)
set(BUILD_LIBTORCH_WITH_JNI 1)
set(FBJNI_SKIP_TESTS 1)
add_subdirectory(android/pytorch_android)
endif()
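# Illustrative only: the JNI bindings are opted into at configure time, e.g.
#   cmake -DBUILD_JNI=ON ..
# assuming the android/pytorch_android sources are present in the checkout.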
include(cmake/Summary.cmake)
caffe2_print_configuration_summary()
# ---[ Torch Deploy
if(USE_DEPLOY)
add_subdirectory(torch/csrc/deploy)
endif()