diff --git a/.gitmodules b/.gitmodules index b3eadb8b3e5..098255cf0c9 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,3 @@ -[submodule "third_party/tbb"] - path = third_party/tbb - url = https://github.com/01org/tbb - branch = tbb_2018 [submodule "third_party/catch"] path = third_party/catch url = https://github.com/catchorg/Catch2.git diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index a60067bec24..cf49dcb5b1c 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -89,24 +89,6 @@ IF(NOT AT_LINK_STYLE) SET(AT_LINK_STYLE SHARED) ENDIF() -# Unset our restrictive C++ flags here and reset them later. -# Remove this once we use proper target_compile_options. -set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) -set(CMAKE_CXX_FLAGS) - -set(TBB_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/tbb") -set(TBB_BUILD_STATIC ON CACHE BOOL " " FORCE) -set(TBB_BUILD_SHARED OFF CACHE BOOL " " FORCE) -set(TBB_BUILD_TBBMALLOC OFF CACHE BOOL " " FORCE) -set(TBB_BUILD_TBBMALLOC_PROXY OFF CACHE BOOL " " FORCE) -set(TBB_BUILD_TESTS OFF CACHE BOOL " " FORCE) -add_subdirectory(cpu/tbb) -set_property(TARGET tbb_static tbb_def_files PROPERTY FOLDER "dependencies") -list(APPEND ATen_THIRD_PARTY_INCLUDE ${TBB_ROOT_DIR}/include) -list(APPEND ATen_CPU_DEPENDENCY_LIBS tbb_static) - -set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) - IF(BLAS_FOUND) IF ($ENV{TH_BINARY_BUILD}) MESSAGE(STATUS "TH_BINARY_BUILD detected. Enabling special linkage.") diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h index d46d1dc388b..a6f57662581 100644 --- a/aten/src/ATen/CPUApplyUtils.h +++ b/aten/src/ATen/CPUApplyUtils.h @@ -149,7 +149,6 @@ inline bool _apply_preamble(ArrayRef tensors) { for (auto& t : tensors) if (t.sizes().equals({0})) return false; - internal::init_tbb_num_threads(); return true; } @@ -351,7 +350,7 @@ template inline void CPU_tensor_parallel_apply1( Tensor tensor1, const Op op, - int64_t grain_size = internal::TBB_GRAIN_SIZE) { + int64_t grain_size = internal::GRAIN_SIZE) { if (!_apply_preamble({tensor1})) return; if (tensor1.ndimension() < 8) { @@ -383,7 +382,7 @@ inline void CPU_tensor_parallel_apply2( Tensor tensor1, Tensor tensor2, const Op op, - int64_t grain_size = internal::TBB_GRAIN_SIZE) { + int64_t grain_size = internal::GRAIN_SIZE) { if (!_apply_preamble({tensor1, tensor2})) return; if (tensor1.ndimension() < 8 && tensor2.ndimension() < 8) { diff --git a/aten/src/ATen/Parallel.cpp b/aten/src/ATen/Parallel.cpp deleted file mode 100644 index 961cc6df31b..00000000000 --- a/aten/src/ATen/Parallel.cpp +++ /dev/null @@ -1,56 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace at { namespace internal { - -// thread_local variable with internal linkage -// requires no guarding as it's storage duration is defined to be per thread -static thread_local tbb::task_scheduler_init tbbinit( - tbb::task_scheduler_init::deferred); -// Tracks number of threads uses which TBB doesn't track. -static thread_local int num_threads_ = -1; - -// Negative number of threads means default value -void init_tbb_num_threads() { - static thread_local bool first_call = true; - int num_threads = at::get_num_threads(); - // In order to have control over the number of threads this function - // must be called first before any other tbb parallel construct is - // excercised within a particular thread. Otherwise the default - // scheduler will be created over which we do not have control. - // The following code will and must throw an error if tbb has - // already been initialized before this function was called. - if (!tbbinit.is_active() && !first_call) - throw std::runtime_error( - "tbb initialization failed: scheduler not active after first call"); - if (first_call) { - if (tbbinit.is_active()) - throw std::runtime_error( - "tbb initialization failed: scheduler active on first call"); - if (num_threads < 0) { - int max_threads = tbbinit.default_num_threads(); - tbbinit.initialize(max_threads); - } else { - tbbinit.initialize(num_threads); - } - first_call = false; - } - if (num_threads == 0) { - // TODO: For PyTorch 0 means 1 - num_threads = 1; - } - if (num_threads > 0 && (num_threads_ != num_threads)) { - tbbinit.terminate(); - tbbinit.initialize(num_threads); - num_threads_ = num_threads; - } -} -} // namespace internal -} // namespace at diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h index b7c5c2ce9fd..0f7e617e8d7 100644 --- a/aten/src/ATen/Parallel.h +++ b/aten/src/ATen/Parallel.h @@ -1,54 +1,59 @@ #pragma once #include #include -#include +#ifdef _OPENMP +#include +#endif namespace at { namespace internal { -// This needs to be called before the first use of any algorithm such as -// parallel or it will have no effect and the default task scheduler is -// created which uses all available cores. -// See -// https://www.threadingbuildingblocks.org/docs/help/reference/task_scheduler/task_scheduler_init_cls.html -// This does not initializes the number of workers in the market (the overall -// of workers available to a process). It is merely a request to the market -// for a certain number of workers. If there are multiple threads making -// a request at the size of the maximum number of threads, they will -// be allocated a number proportional to the other requests. -AT_API void init_tbb_num_threads(); // This parameter is heuristically chosen to determine the minimum number of // work that warrants paralellism. For example, when summing an array, it is // deemed inefficient to parallelise over arrays shorter than 32768. Further, // no parallel algorithm (such as parallel_reduce) should split work into // smaller than GRAIN_SIZE chunks. -constexpr int64_t TBB_GRAIN_SIZE = 32768; +constexpr int64_t GRAIN_SIZE = 32768; } // namespace internal +inline int64_t divup(int64_t x, int64_t y) { + return (x + y - 1) / y; +} + template inline void parallel_for( - int64_t begin, - int64_t end, - int64_t grain_size, - const F& f) { - internal::init_tbb_num_threads(); - -#ifdef __PPC64__ - using default_partitioner_type = tbb::simple_partitioner; -#else - using default_partitioner_type = tbb::affinity_partitioner; -#endif - - thread_local static default_partitioner_type ap; - - if ((end - begin) < grain_size || get_num_threads() == 1) { - f(begin, end); - } else { - tbb::parallel_for( - tbb::blocked_range(begin, end, grain_size), - [f](const tbb::blocked_range& r) { f(r.begin(), r.end()); }, - ap); + const int64_t begin, + const int64_t end, + const int64_t grain_size_, + const F f) { + const int64_t min_grain_size = divup((end - begin), get_num_threads()); + const int64_t grain_size = std::max(min_grain_size, grain_size_); +#pragma omp parallel for if ((end - begin) >= grain_size && get_num_threads() > 1) + for (int64_t i = begin; i < end; i += grain_size) { + f(i, i + std::min(end - i, grain_size)); } } +template +inline scalar_t parallel_reduce( + const int64_t begin, + const int64_t end, + const int64_t grain_size_, + const scalar_t ident, + const F f, + const SF sf) { + const int64_t min_grain_size = divup((end - begin), get_num_threads()); + const int64_t grain_size = std::max(min_grain_size, grain_size_); + const int64_t num_results = divup((end - begin), grain_size); + std::vector results(num_results); + scalar_t* results_data = results.data(); +#pragma omp parallel for if ((end - begin) >= grain_size && get_num_threads() > 1) + for (int64_t id = 0; id < num_results; id++) { + int64_t i = begin + id * grain_size; + results_data[id] = f(i, i + std::min(end - i, grain_size), ident); + } + return std::accumulate( + results_data, results_data + results.size(), ident, sf); +} + } // namespace at diff --git a/aten/src/ATen/cpu/tbb/CMakeLists.txt b/aten/src/ATen/cpu/tbb/CMakeLists.txt deleted file mode 100644 index 84355926d2d..00000000000 --- a/aten/src/ATen/cpu/tbb/CMakeLists.txt +++ /dev/null @@ -1,376 +0,0 @@ -# Based on https://github.com/wjakob/tbb/blob/master/CMakeLists.txt -# All credit goes to Wenzel Jakob! - -cmake_minimum_required (VERSION 2.8.12 FATAL_ERROR) -project (tbb CXX) - -include(CheckCXXCompilerFlag) -include(CheckCXXSourceRuns) - -if(POLICY CMP0058) - cmake_policy(SET CMP0058 NEW) -endif() - -if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - message(STATUS "Setting build type to 'Release' as none was specified.") - set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" - "MinSizeRel" "RelWithDebInfo") -endif() - -if(NOT TBB_ROOT_DIR) - set(TBB_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -endif() -if(NOT TBB_INSTALL_RUNTIME_DIR) - set(TBB_INSTALL_RUNTIME_DIR bin) -endif() -if(NOT TBB_INSTALL_LIBRARY_DIR) - set(TBB_INSTALL_LIBRARY_DIR lib) -endif() -if(NOT TBB_INSTALL_ARCHIVE_DIR) - set(TBB_INSTALL_ARCHIVE_DIR lib) -endif() -if(NOT TBB_INSTALL_INCLUDE_DIR) - set(TBB_INSTALL_INCLUDE_DIR "${TBB_ROOT_DIR}/include") -endif() - -set(TBB_INCLUDES - "${TBB_ROOT_DIR}/include" - "${TBB_ROOT_DIR}/src" - "${TBB_ROOT_DIR}/src/rml/include" - ${CMAKE_CURRENT_BINARY_DIR}) - -option(TBB_BUILD_SHARED "Build TBB shared library" ON) -option(TBB_BUILD_STATIC "Build TBB static library" ON) -option(TBB_BUILD_TBBMALLOC "Build TBB malloc library" ON) -option(TBB_BUILD_TBBMALLOC_PROXY "Build TBB malloc proxy library" ON) -option(TBB_BUILD_TESTS "Build TBB tests and enable testing infrastructure" ON) -option(TBB_CI_BUILD "Is this a continuous integration build?" OFF) - -if(APPLE) - set(CMAKE_MACOSX_RPATH ON) -endif() - -file(GLOB tbb_src "${TBB_ROOT_DIR}/src/tbb/*.cpp" "${TBB_ROOT_DIR}/src/old/*.cpp") -list(APPEND tbb_src ${TBB_ROOT_DIR}/src/rml/client/rml_tbb.cpp) -file(GLOB to_remove "${TBB_ROOT_DIR}/src/old/test*.cpp") -if (NOT "${to_remove}" STREQUAL "") - list(REMOVE_ITEM tbb_src ${to_remove}) -endif() - -set(tbbmalloc_static_src - src/tbbmalloc/backend.cpp - src/tbbmalloc/large_objects.cpp - src/tbbmalloc/backref.cpp - src/tbbmalloc/tbbmalloc.cpp - src/tbbmalloc/frontend.cpp - src/tbb/itt_notify.cpp) - -set(tbbmalloc_src ${tbbmalloc_static_src}) - -set(tbbmalloc_proxy_src - src/tbbmalloc/proxy.cpp - src/tbbmalloc/tbb_function_replacement.cpp) - -if (CMAKE_SYSTEM_PROCESSOR MATCHES "(i386|x86_64)") - if (NOT APPLE AND NOT MINGW) - add_definitions(-DDO_ITT_NOTIFY) - endif() -endif() - -if (APPLE) - # Disable annoying "has no symbols" warnings - set(CMAKE_C_ARCHIVE_CREATE " Scr ") - set(CMAKE_CXX_ARCHIVE_CREATE " Scr ") - set(CMAKE_C_ARCHIVE_FINISH " -no_warning_for_no_symbols -c ") - set(CMAKE_CXX_ARCHIVE_FINISH " -no_warning_for_no_symbols -c ") -endif() - -macro(CHECK_CXX_COMPILER_AND_LINKER_FLAGS _RESULT _CXX_FLAGS _LINKER_FLAGS) - set(CMAKE_REQUIRED_FLAGS ${_CXX_FLAGS}) - set(CMAKE_REQUIRED_LIBRARIES ${_LINKER_FLAGS}) - set(CMAKE_REQUIRED_QUIET TRUE) - check_cxx_source_runs("#include \nint main(int argc, char **argv) { std::cout << \"test\"; return 0; }" ${_RESULT}) - set(CMAKE_REQUIRED_FLAGS "") - set(CMAKE_REQUIRED_LIBRARIES "") -endmacro() - -# Prefer libc++ in conjunction with Clang -if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") - if (CMAKE_CXX_FLAGS MATCHES "-stdlib=libc\\+\\+") - message(STATUS "TBB: using libc++.") - else() - CHECK_CXX_COMPILER_AND_LINKER_FLAGS(HAS_LIBCPP "-stdlib=libc++" "-stdlib=libc++") - if (HAS_LIBCPP) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++ -D_LIBCPP_VERSION") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++") - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -stdlib=libc++") - message(STATUS "TBB: using libc++.") - else() - message(STATUS "TBB: NOT using libc++.") - endif() - endif() -endif() - -if (UNIX) - add_definitions (-DUSE_PTHREAD) - - check_cxx_compiler_flag ("-std=c++11" SUPPORTS_STDCXX11) - if (SUPPORTS_STDCXX11) - set (CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}") - endif () - - check_cxx_compiler_flag ("-mrtm -Werror" SUPPORTS_MRTM) - if (SUPPORTS_MRTM) - set (CMAKE_CXX_FLAGS "-mrtm ${CMAKE_CXX_FLAGS}") - endif () - -elseif(WIN32) - if (MSVC) - cmake_minimum_required (VERSION 3.1) - enable_language(ASM_MASM) - set(CMAKE_CXX_FLAGS "/GS- /Zc:wchar_t /Zc:forScope /DUSE_WINTHREAD ${CMAKE_CXX_FLAGS}") - set(CMAKE_CXX_FLAGS "/D_CRT_SECURE_NO_DEPRECATE /D_WIN32_WINNT=0x0600 ${CMAKE_CXX_FLAGS}") - check_cxx_compiler_flag ("/volatile:iso" SUPPORTS_VOLATILE_FLAG) - if (SUPPORTS_VOLATILE_FLAG) - set(CMAKE_CXX_FLAGS "/volatile:iso ${CMAKE_CXX_FLAGS}") - endif () - set(CMAKE_CXX_FLAGS "/wd4267 /wd4800 /wd4146 /wd4244 /wd4577 /wd4018 ${CMAKE_CXX_FLAGS}") - if (NOT CMAKE_SIZEOF_VOID_P) - message(FATAL_ERROR "'CMAKE_SIZEOF_VOID_P' is undefined. Please delete your build directory and rerun CMake again!") - endif() - - if (CMAKE_SIZEOF_VOID_P EQUAL 8) - list(APPEND tbb_src "${TBB_ROOT_DIR}/src/tbb/intel64-masm/atomic_support.asm") - list(APPEND tbb_src "${TBB_ROOT_DIR}/src/tbb/intel64-masm/itsx.asm") - list(APPEND tbb_src "${TBB_ROOT_DIR}/src/tbb/intel64-masm/intel64_misc.asm") - list(APPEND tbbmalloc_src "${TBB_ROOT_DIR}/src/tbb/intel64-masm/atomic_support.asm") - set(CMAKE_ASM_MASM_FLAGS "/DEM64T=1 ${CMAKE_ASM_MASM_FLAGS}") - else() - list(APPEND tbb_src "${TBB_ROOT_DIR}/src/tbb/ia32-masm/atomic_support.asm" - "${TBB_ROOT_DIR}/src/tbb/ia32-masm/itsx.asm src/tbb/ia32-masm/lock_byte.asm") - # Enable SAFESEH feature for assembly (x86 builds only). - set(CMAKE_ASM_MASM_FLAGS "/safeseh ${CMAKE_ASM_MASM_FLAGS}") - endif() - elseif (MINGW) - add_definitions(-DUSE_WINTHREAD) - add_definitions(-D_WIN32_WINNT=0x0502) - set(CMAKE_CXX_FLAGS "-mthreads ${CMAKE_CXX_FLAGS}") - endif () -endif() - -if (MSVC) - set(ENABLE_RTTI "/EHsc /GR ") - set(DISABLE_RTTI "/EHs- /GR- ") -elseif (UNIX) - set(ENABLE_RTTI "-frtti -fexceptions ") - set(DISABLE_RTTI "-fno-rtti -fno-exceptions ") -endif () - -##-------- -# - Added TBB_USE_GLIBCXX_VERSION macro to specify the version of GNU -# libstdc++ when it cannot be properly recognized, e.g. when used -# with Clang on Linux* OS. Inspired by a contribution from David A. -if (NOT TBB_USE_GLIBCXX_VERSION AND UNIX AND NOT APPLE) - if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - # using Clang - string(REPLACE "." "0" TBB_USE_GLIBCXX_VERSION ${CMAKE_CXX_COMPILER_VERSION}) - endif() -endif() - -if (TBB_USE_GLIBCXX_VERSION) - add_definitions(-DTBB_USE_GLIBCXX_VERSION=${TBB_USE_GLIBCXX_VERSION}) -endif() - -##------- - -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - check_cxx_compiler_flag ("-flifetime-dse=1" SUPPORTS_FLIFETIME) - if (SUPPORTS_FLIFETIME) - add_definitions(-flifetime-dse=1) - endif() -endif() - -# Linker export definitions -if (APPLE) - set (ARCH_PREFIX "mac") -elseif(WIN32) - set (ARCH_PREFIX "win") -else() - set (ARCH_PREFIX "lin") -endif() - -if (CMAKE_SIZEOF_VOID_P EQUAL 8) - set(ARCH_PREFIX "${ARCH_PREFIX}64") -else() - set(ARCH_PREFIX "${ARCH_PREFIX}32") -endif() - -if (MINGW) - set (ARCH_PREFIX "${ARCH_PREFIX}-gcc") - # there's no win32-gcc-tbb-export.def, use lin32-tbb-export.def - execute_process (COMMAND ${CMAKE_COMMAND} -E copy ${TBB_ROOT_DIR}/src/tbb/lin32-tbb-export.def ${TBB_ROOT_DIR}/src/tbb/win32-gcc-tbb-export.def) -endif() - -if (MSVC) - add_custom_command(OUTPUT tbb.def - COMMAND ${CMAKE_CXX_COMPILER} /TC /EP ${TBB_ROOT_DIR}/src/tbb/${ARCH_PREFIX}-tbb-export.def -I ${TBB_ROOT_DIR}/include > tbb.def - MAIN_DEPENDENCY ${TBB_ROOT_DIR}/src/tbb/${ARCH_PREFIX}-tbb-export.def - COMMENT "Preprocessing tbb.def" - ) - - add_custom_command(OUTPUT tbbmalloc.def - COMMAND ${CMAKE_CXX_COMPILER} /TC /EP ${TBB_ROOT_DIR}/src/tbbmalloc/${ARCH_PREFIX}-tbbmalloc-export.def -I ${TBB_ROOT_DIR}/include > tbbmalloc.def - MAIN_DEPENDENCY ${TBB_ROOT_DIR}/src/tbbmalloc/${ARCH_PREFIX}-tbbmalloc-export.def - COMMENT "Preprocessing tbbmalloc.def" - ) -else() - add_custom_command(OUTPUT tbb.def - COMMAND ${CMAKE_CXX_COMPILER} -xc++ -E ${TBB_ROOT_DIR}/src/tbb/${ARCH_PREFIX}-tbb-export.def -I ${TBB_ROOT_DIR}/include -o tbb.def - MAIN_DEPENDENCY ${TBB_ROOT_DIR}/src/tbb/${ARCH_PREFIX}-tbb-export.def - COMMENT "Preprocessing tbb.def" - ) - - add_custom_command(OUTPUT tbbmalloc.def - COMMAND ${CMAKE_CXX_COMPILER} -xc++ -E ${TBB_ROOT_DIR}/src/tbbmalloc/${ARCH_PREFIX}-tbbmalloc-export.def -I ${TBB_ROOT_DIR}/include -o tbbmalloc.def - MAIN_DEPENDENCY ${TBB_ROOT_DIR}/src/tbbmalloc/${ARCH_PREFIX}-tbbmalloc-export.def - COMMENT "Preprocessing tbbmalloc.def" - ) -endif() - -add_custom_target(tbb_def_files DEPENDS tbb.def tbbmalloc.def) - -# TBB library -if (TBB_BUILD_STATIC) - add_library(tbb_static STATIC ${tbb_src}) - target_include_directories(tbb_static PRIVATE ${TBB_INCLUDES}) - set_property(TARGET tbb_static APPEND PROPERTY COMPILE_DEFINITIONS "__TBB_BUILD=1") - set_property(TARGET tbb_static APPEND_STRING PROPERTY COMPILE_FLAGS ${ENABLE_RTTI}) - install(TARGETS tbb_static ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR}) - if (MSVC) - target_compile_definitions(tbb_static PUBLIC __TBB_NO_IMPLICIT_LINKAGE=1) - endif() - - if (UNIX AND NOT APPLE) - target_link_libraries(tbb_static PUBLIC pthread dl) - endif() -endif() - -if (TBB_BUILD_SHARED) - add_library(tbb SHARED ${tbb_src}) - target_include_directories(tbb PRIVATE ${TBB_INCLUDES}) - set_property(TARGET tbb APPEND PROPERTY COMPILE_DEFINITIONS "__TBB_BUILD=1") - set_property(TARGET tbb APPEND_STRING PROPERTY COMPILE_FLAGS ${ENABLE_RTTI}) - add_dependencies(tbb tbb_def_files) - - if (APPLE) - set_property(TARGET tbb APPEND PROPERTY LINK_FLAGS "-Wl,-exported_symbols_list,\"${CMAKE_CURRENT_BINARY_DIR}/tbb.def\"") - elseif (MSVC) - set_property(TARGET tbb APPEND PROPERTY LINK_FLAGS "/DEF:\"${CMAKE_CURRENT_BINARY_DIR}/tbb.def\"") - else () - set_property(TARGET tbb APPEND PROPERTY LINK_FLAGS "-Wl,-version-script,\"${CMAKE_CURRENT_BINARY_DIR}/tbb.def\"") - endif() - - install(TARGETS tbb - LIBRARY DESTINATION ${TBB_INSTALL_LIBRARY_DIR} - ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR} - RUNTIME DESTINATION ${TBB_INSTALL_RUNTIME_DIR}) - if (UNIX AND NOT APPLE) - target_link_libraries(tbb PUBLIC pthread dl) - endif() - if (MSVC) - target_compile_definitions(tbb PUBLIC __TBB_NO_IMPLICIT_LINKAGE=1) - endif() -endif() - - -if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - # Quench a warning on GCC - set_source_files_properties(${TBB_ROOT_DIR}/src/tbb/governor.cpp COMPILE_FLAGS "-Wno-missing-field-initializers ") -elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - # Quench a warning on Clang - set_source_files_properties(${TBB_ROOT_DIR}/src/tbb/itt_notify.cpp COMPILE_FLAGS "-Wno-varargs ") -elseif(MSVC) - # Quench a warning on MSVC - set_source_files_properties(${TBB_ROOT_DIR}/src/tbb/scheduler.cpp COMPILE_FLAGS "/wd4458 ") -endif() - -if(TBB_BUILD_TBBMALLOC) - # TBB malloc library - if (TBB_BUILD_STATIC) - add_library(tbbmalloc_static STATIC ${tbbmalloc_static_src}) - target_include_directories(tbbmalloc_static PRIVATE ${TBB_INCLUDES}) - set_property(TARGET tbbmalloc_static APPEND PROPERTY COMPILE_DEFINITIONS "__TBBMALLOC_BUILD=1") - set_property(TARGET tbbmalloc_static APPEND_STRING PROPERTY COMPILE_FLAGS ${DISABLE_RTTI}) - if (MSVC) - target_compile_definitions(tbbmalloc_static PUBLIC __TBB_NO_IMPLICIT_LINKAGE=1 __TBBMALLOC_NO_IMPLICIT_LINKAGE=1) - endif() - install(TARGETS tbbmalloc_static ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR}) - endif() - - if (TBB_BUILD_SHARED) - add_library(tbbmalloc SHARED ${tbbmalloc_src}) - target_include_directories(tbbmalloc PRIVATE ${TBB_INCLUDES}) - set_property(TARGET tbbmalloc APPEND PROPERTY COMPILE_DEFINITIONS "__TBBMALLOC_BUILD=1") - set_property(TARGET tbbmalloc APPEND_STRING PROPERTY COMPILE_FLAGS ${DISABLE_RTTI}) - add_dependencies(tbbmalloc tbb_def_files) - if (APPLE) - set_property(TARGET tbbmalloc APPEND PROPERTY LINK_FLAGS "-Wl,-exported_symbols_list,\"${CMAKE_CURRENT_BINARY_DIR}/tbbmalloc.def\"") - elseif (MSVC) - set_property(TARGET tbbmalloc APPEND PROPERTY LINK_FLAGS "/DEF:\"${CMAKE_CURRENT_BINARY_DIR}/tbbmalloc.def\"") - else () - set_property(TARGET tbbmalloc APPEND PROPERTY LINK_FLAGS "-Wl,-version-script,\"${CMAKE_CURRENT_BINARY_DIR}/tbbmalloc.def\"") - endif() - if (MSVC) - target_compile_definitions(tbbmalloc PUBLIC __TBB_NO_IMPLICIT_LINKAGE=1 __TBBMALLOC_NO_IMPLICIT_LINKAGE=1) - endif() - install(TARGETS tbbmalloc - LIBRARY DESTINATION ${TBB_INSTALL_LIBRARY_DIR} - ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR} - RUNTIME DESTINATION ${TBB_INSTALL_RUNTIME_DIR}) - if (UNIX AND NOT APPLE) - target_link_libraries(tbbmalloc PUBLIC pthread dl) - endif() - endif() -endif() - -if(TBB_BUILD_TBBMALLOC_PROXY) - # TBB malloc proxy library - if (TBB_BUILD_STATIC) - add_library(tbbmalloc_proxy_static STATIC ${tbbmalloc_proxy_src}) - set_property(TARGET tbbmalloc_proxy_static APPEND PROPERTY COMPILE_DEFINITIONS "__TBBMALLOC_BUILD=1") - set_property(TARGET tbbmalloc_proxy_static APPEND_STRING PROPERTY COMPILE_FLAGS ${DISABLE_RTTI}) - install(TARGETS tbbmalloc_proxy_static ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR}) - endif() - - if (TBB_BUILD_SHARED) - add_library(tbbmalloc_proxy SHARED ${tbbmalloc_proxy_src}) - set_property(TARGET tbbmalloc_proxy APPEND PROPERTY COMPILE_DEFINITIONS "__TBBMALLOC_BUILD=1") - set_property(TARGET tbbmalloc_proxy APPEND_STRING PROPERTY COMPILE_FLAGS ${DISABLE_RTTI}) - target_link_libraries(tbbmalloc_proxy PUBLIC tbbmalloc) - install(TARGETS tbbmalloc_proxy - LIBRARY DESTINATION ${TBB_INSTALL_LIBRARY_DIR} - ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR} - RUNTIME DESTINATION ${TBB_INSTALL_RUNTIME_DIR}) - if (UNIX AND NOT APPLE) - target_link_libraries(tbbmalloc_proxy PUBLIC pthread dl) - endif() - endif() -endif() - -install(DIRECTORY "${TBB_ROOT_DIR}/include/tbb" DESTINATION ${TBB_INSTALL_INCLUDE_DIR}) - -# version_string.ver -if (UNIX) - execute_process (COMMAND date "+%a, %d %b %Y %H:%M:%S %z" - OUTPUT_VARIABLE _configure_date - OUTPUT_STRIP_TRAILING_WHITESPACE) -elseif (WIN32) - execute_process (COMMAND cmd " /C date /T" - OUTPUT_VARIABLE _configure_date - OUTPUT_STRIP_TRAILING_WHITESPACE) -else () - set (_configure_date "Unknown") -endif() -include_directories (${CMAKE_BINARY_DIR}) -configure_file (extra/version_string.ver.in version_string.ver @ONLY) diff --git a/aten/src/ATen/cpu/tbb/extra/version_string.ver.in b/aten/src/ATen/cpu/tbb/extra/version_string.ver.in deleted file mode 100644 index bb9f96e8f29..00000000000 --- a/aten/src/ATen/cpu/tbb/extra/version_string.ver.in +++ /dev/null @@ -1,11 +0,0 @@ -#define __TBB_VERSION_STRINGS(N) \ -#N": BUILD_HOST @CMAKE_SYSTEM_NAME@" ENDL \ -#N": BUILD_OS @CMAKE_SYSTEM@" ENDL \ -#N": BUILD_KERNEL @CMAKE_SYSTEM_VERSION@" ENDL \ -#N": BUILD_GCC @CMAKE_CXX_COMPILER_ID@" ENDL \ -#N": BUILD_LIBC Unknown" ENDL \ -#N": BUILD_LD Unknown" ENDL \ -#N": BUILD_TARGET Unknown" ENDL \ -#N": BUILD_COMMAND Unknown" ENDL - -#define __TBB_DATETIME "@_configure_date@" diff --git a/aten/src/ATen/cpu/vec256/vec256_base.h b/aten/src/ATen/cpu/vec256/vec256_base.h index 86657989649..e096977e006 100644 --- a/aten/src/ATen/cpu/vec256/vec256_base.h +++ b/aten/src/ATen/cpu/vec256/vec256_base.h @@ -22,7 +22,7 @@ namespace { template struct Vec256 { static constexpr int size = 32 / sizeof(T); - T values[32 / sizeof(T)]; + T values[32 / sizeof(T)] = {0}; Vec256() {} Vec256(T val) { for (int i = 0; i != size; i++) { diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index 02653629e55..ff83c78f455 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ b/aten/src/ATen/native/SoftMax.cpp @@ -23,7 +23,7 @@ void host_softmax(Tensor output, const Tensor& input, const int64_t dim) { int64_t outer_stride = dim_size * dim_stride; scalar_t* input_data_base = input.data(); scalar_t* output_data_base = output.data(); - int64_t grain_size = std::min(internal::TBB_GRAIN_SIZE / dim_size, (int64_t)1); + int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1); parallel_for( 0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) { @@ -80,7 +80,7 @@ void host_softmax_backward( scalar_t* gradInput_data_base = gI.data(); scalar_t* output_data_base = output.data(); scalar_t* gradOutput_data_base = grad.data(); - int64_t grain_size = std::min(internal::TBB_GRAIN_SIZE / dim_size, (int64_t)1); + int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1); parallel_for( 0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) { for (int64_t i = begin; i < end; i++) { diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index 77eb50acae3..7019ccc4f18 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -9,12 +9,6 @@ #include "ATen/cpu/vec256/vec256.h" #include "ATen/optional.h" -#ifdef __PPC64__ -using default_partitioner_type = tbb::simple_partitioner; -#else -using default_partitioner_type = tbb::affinity_partitioner; -#endif - namespace at { namespace native { namespace { using namespace vec256; @@ -23,19 +17,22 @@ static inline int64_t round_down(int64_t a, int64_t m) { return a - (a % m); } -template -static void parallel_for(int64_t end, int64_t step, bool parallelize, F func) { +template +static void _parallel_for(int64_t size, int64_t step, bool parallelize, F func) { if (parallelize) { - tbb::parallel_for(0, end, step, func); + parallel_for(0, size / step, 1, [func, step](int64_t begin, int64_t end) { + int64_t k = begin * step; + for (int64_t i = begin; i < end; i++, k += step) { + func(k); + } + }); } else { - for (int64_t i = 0; i != end; i += step) { + for (int64_t i = 0; i != size; i += step) { func(i); } } } -static default_partitioner_type ap; - // Vectorized reduction defined by reduce operation `Op` with identity `ident`. // The reduction is built on top of reduce128, which reduces down a column // 128 bytes wide (WIDTH scalar elements). The width of 128 bytes is chosen @@ -50,8 +47,6 @@ struct Reduction { using ReduceScalar = Op; static void apply(Tensor& res, const Tensor& self, at::optional dim) { - internal::init_tbb_num_threads(); - auto out = res.data(); auto data = self.data(); auto numel = self.numel(); @@ -71,8 +66,8 @@ struct Reduction { } } int64_t batch = numel / (n * stride); - bool paralellize = batch * n > internal::TBB_GRAIN_SIZE; - parallel_for(batch, 1, paralellize, [=](int64_t b) { + bool paralellize = batch * n > internal::GRAIN_SIZE; + _parallel_for(batch, 1, paralellize, [=](int64_t b) { if (stride == 1) { out[b] = reduce_all(&data[b * n], n); } else { @@ -84,23 +79,17 @@ struct Reduction { static scalar_t reduce_all(const scalar_t* data, int64_t size) { int64_t k = size / WIDTH; - scalar_t sum; - if (size > internal::TBB_GRAIN_SIZE) { - sum = tbb::parallel_reduce( - tbb::blocked_range(0, k, internal::TBB_GRAIN_SIZE / WIDTH), - scalar_t(ident), - [=](const tbb::blocked_range& r, scalar_t init) { - scalar_t buf[WIDTH]; - reduce128(&data[r.begin() * WIDTH], buf, r.end() - r.begin(), WIDTH); - return std::accumulate(buf, buf + WIDTH, init, ReduceScalar()); - }, - ReduceScalar(), - ap); - } else { - scalar_t buf[WIDTH]; - reduce128(data, buf, k, WIDTH); - sum = std::accumulate(buf, buf + WIDTH, scalar_t(ident), ReduceScalar()); - } + scalar_t sum = parallel_reduce( + 0, + k, + internal::GRAIN_SIZE / WIDTH, + (scalar_t)ident, + [data](int64_t begin, int64_t end, scalar_t init) { + scalar_t buf[WIDTH]; + reduce128(&data[begin * WIDTH], buf, end - begin, WIDTH); + return std::accumulate(buf, buf + WIDTH, init, ReduceScalar()); + }, + ReduceScalar()); for (int64_t i = k * WIDTH; i != size; i++) { sum = ReduceScalar()(sum, data[i]); @@ -127,8 +116,8 @@ struct Reduction { // Reduce a 2d matrix down each column. Stores the results in out[0 ... cols-1] static void reduce2d(const scalar_t* data, scalar_t* out, int64_t rows, int64_t cols, int64_t stride) { int64_t cols_rounded = round_down(cols, WIDTH); - bool paralellize = cols * rows > internal::TBB_GRAIN_SIZE; - parallel_for(cols_rounded, WIDTH, paralellize, [=](int64_t col) { + bool paralellize = cols * rows > internal::GRAIN_SIZE; + _parallel_for(cols_rounded, WIDTH, paralellize, [=](int64_t col) { reduce128(&data[col], &out[col], rows, stride); }); diff --git a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp index 2e0a68b1fed..6cfa90ff321 100644 --- a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp +++ b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp @@ -14,7 +14,7 @@ // compiled with AVX/AVX2 This is because of SSE-AVX transitions and a bug in // Glibc2.23 See https://bugs.launchpad.net/ubuntu/+source/glibc/+bug/1663280 // -// On grainsize: The grainsize is chosen to roughly get TBB_GRAIN_SIZE number of +// On grainsize: The grainsize is chosen to roughly get GRAIN_SIZE number of // computations per task. Each task works across dim_size elements. 16 should be // a very rough approximation of the number of computations per dim_size element // by counting simple computations (*, +, -) as 1 and exp or log as 4. @@ -30,7 +30,7 @@ inline void _vec_log_softmax_lastdim( int64_t dim_size) { using Vec = vec256::Vec256; static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size; - int64_t grain_size = internal::TBB_GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE); + int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE); if (grain_size < CHUNK_SIZE) grain_size = CHUNK_SIZE; @@ -93,7 +93,7 @@ inline void _vec_softmax_lastdim( int64_t outer_size, int64_t dim_size) { using Vec = vec256::Vec256; - int64_t grain_size = internal::TBB_GRAIN_SIZE / (16 * dim_size); + int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size); if (grain_size < 1) grain_size = 1; @@ -134,7 +134,7 @@ inline void _vec_host_softmax_backward_lastdim( int64_t outer_size, int64_t dim_size) { using Vec = vec256::Vec256; - int64_t grain_size = internal::TBB_GRAIN_SIZE / (16 * dim_size); + int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size); if (grain_size < 1) grain_size = 1; diff --git a/setup.py b/setup.py index cd0bfec8516..92123bd934f 100644 --- a/setup.py +++ b/setup.py @@ -355,7 +355,6 @@ class build_deps(PytorchCommand): check_file(os.path.join(third_party_path, "nanopb", "CMakeLists.txt")) check_file(os.path.join(third_party_path, "pybind11", "CMakeLists.txt")) check_file(os.path.join(third_party_path, 'cpuinfo', 'CMakeLists.txt')) - check_file(os.path.join(third_party_path, 'tbb', 'Makefile')) check_file(os.path.join(third_party_path, 'catch', 'CMakeLists.txt')) check_file(os.path.join(third_party_path, 'onnx', 'CMakeLists.txt')) diff --git a/third_party/tbb b/third_party/tbb deleted file mode 160000 index 633b01ad27e..00000000000 --- a/third_party/tbb +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 633b01ad27e012e1dc4e392c3230250d1f4967a4 diff --git a/tools/aten_mirror.sh b/tools/aten_mirror.sh index 6c787bbda56..b1408e0658c 100755 --- a/tools/aten_mirror.sh +++ b/tools/aten_mirror.sh @@ -27,7 +27,7 @@ git fetch fullrepo git checkout -b temporary-split-branch fullrepo/master # Cribbed from https://stackoverflow.com/questions/2982055/detach-many-subdirectories-into-a-new-separate-git-repository # and https://stackoverflow.com/questions/42355621/git-filter-branch-moving-a-folder-with-index-filter-does-not-work -git filter-branch -f --index-filter 'git rm --cached -qr --ignore-unmatch -- . && git reset -q $GIT_COMMIT -- aten cmake third_party/tbb third_party/catch third_party/cpuinfo && (git ls-files -s | sed "s-.travis.aten.yml-.travis.yml-" | sed "s-.gitmodules.aten-.gitmodules-" | git update-index --index-info)' +git filter-branch -f --index-filter 'git rm --cached -qr --ignore-unmatch -- . && git reset -q $GIT_COMMIT -- aten cmake third_party/catch third_party/cpuinfo && (git ls-files -s | sed "s-.travis.aten.yml-.travis.yml-" | sed "s-.gitmodules.aten-.gitmodules-" | git update-index --index-info)' git checkout master git merge temporary-split-branch git push