mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-15 21:00:47 +00:00
Remove SLEEF and all dependent code paths (#7268)
Temporarily remove this dependency.
This commit is contained in:
parent
0829d4502d
commit
bcffb5aa1d
9 changed files with 39 additions and 713 deletions
|
|
@ -17,6 +17,8 @@ ENDIF()
|
|||
IF(NOT MSVC)
|
||||
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-ignored-qualifiers")
|
||||
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-ignored-qualifiers")
|
||||
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-absolute-value")
|
||||
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-absolute-value")
|
||||
ENDIF(NOT MSVC)
|
||||
|
||||
########################
|
||||
|
|
@ -366,13 +368,6 @@ if (NOT TARGET cpuinfo)
|
|||
endif()
|
||||
TARGET_LINK_LIBRARIES(ATen cpuinfo)
|
||||
|
||||
# ---[ Configure SLEEF
|
||||
IF(NOT TARGET sleef)
|
||||
add_subdirectory("cpu/sleef")
|
||||
include_directories(SYSTEM ${CMAKE_BINARY_DIR}/include)
|
||||
ENDIF()
|
||||
TARGET_LINK_LIBRARIES(ATen sleef)
|
||||
|
||||
IF(CUDA_FOUND)
|
||||
IF ($ENV{ATEN_STATIC_CUDA})
|
||||
# CuFFT has a complicated static story (especially around CUDA < 9) because it has device callback support
|
||||
|
|
|
|||
|
|
@ -1,71 +0,0 @@
|
|||
IF(MSVC)
|
||||
option(BUILD_SHARED_LIBS "Build shared libs" ON)
|
||||
ELSE(MSVC)
|
||||
option(BUILD_SHARED_LIBS "Build shared libs" OFF)
|
||||
ENDIF(MSVC)
|
||||
option(SLEEF_SHOW_ERROR_LOG "Show cmake error log." OFF)
|
||||
|
||||
set(SLEEF_VERSION_MAJOR 3)
|
||||
set(SLEEF_VERSION_MINOR 2)
|
||||
set(SLEEF_SOVERSION ${SLEEF_VERSION_MAJOR})
|
||||
|
||||
# Sanity check for in-source builds which we do not want to happen
|
||||
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR)
|
||||
message(FATAL_ERROR "SLEEF does not allow in-source builds.
|
||||
You can refer to doc/build-with-cmake.md for instructions on how provide a \
|
||||
separate build directory. Note: Please remove autogenerated file \
|
||||
`CMakeCache.txt` and directory `CMakeFiles` in the current directory.")
|
||||
endif()
|
||||
|
||||
# Set output directories for the library files
|
||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
|
||||
|
||||
foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES})
|
||||
string(TOUPPER ${CONFIG} CONFIG)
|
||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_${CONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_${CONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${PROJECT_BINARY_DIR}/bin)
|
||||
endforeach(CONFIG CMAKE_CONFIGURATION_TYPES)
|
||||
|
||||
# Path for finding cmake modules
|
||||
set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/../third_party/sleef/cmake/Modules)
|
||||
set(SLEEF_SCRIPT_PATH ${PROJECT_SOURCE_DIR}/../third_party/sleef/cmake/Scripts CACHE PATH
|
||||
"Path for finding sleef specific cmake scripts")
|
||||
|
||||
# sleef-config.h.in passes cmake settings to the source code
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/Configure.cmake)
|
||||
configure_file(
|
||||
${PROJECT_SOURCE_DIR}/../third_party/sleef/sleef-config.h.in
|
||||
${PROJECT_BINARY_DIR}/include/sleef-config.h @ONLY)
|
||||
|
||||
# Generates object file (shared library) `libsleef`
|
||||
# Defined in src/libm/CMakeLists.txt via command add_library
|
||||
set(TARGET_LIBSLEEF "sleef")
|
||||
set(TARGET_LIBSLEEFGNUABI "sleefgnuabi")
|
||||
# Generates the sleef.h headers and all the rename headers
|
||||
# Defined in src/libm/CMakeLists.txt via custom commands and a custom target
|
||||
set(TARGET_HEADERS "headers")
|
||||
set(TARGET_MKRENAME "mkrename")
|
||||
set(TARGET_MKRENAME_GNUABI "mkrename_gnuabi")
|
||||
set(TARGET_MKMASKED_GNUABI "mkmasked_gnuabi")
|
||||
set(TARGET_MKDISP "mkdisp")
|
||||
set(TARGET_MKALIAS "mkalias")
|
||||
set(TARGET_LIBCOMMON_OBJ "common")
|
||||
set(TARGET_LIBARRAYMAP_OBJ "arraymap")
|
||||
|
||||
function(add_host_executable TARGETNAME)
|
||||
if (NOT CMAKE_CROSSCOMPILING)
|
||||
add_executable(${TARGETNAME} ${ARGN})
|
||||
else()
|
||||
add_executable(${TARGETNAME} IMPORTED)
|
||||
set_property(TARGET ${TARGETNAME} PROPERTY IMPORTED_LOCATION ${NATIVE_BUILD_DIR}/bin/${TARGETNAME})
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
include_directories("${PROJECT_SOURCE_DIR}/../third_party/sleef/src/common")
|
||||
include_directories("${PROJECT_SOURCE_DIR}/../third_party/sleef/src/arch")
|
||||
|
||||
add_subdirectory("${PROJECT_SOURCE_DIR}/../third_party/sleef/src/libm" "${CMAKE_CURRENT_BINARY_DIR}/sleef/libm")
|
||||
add_subdirectory("${PROJECT_SOURCE_DIR}/../third_party/sleef/src/common" "${CMAKE_CURRENT_BINARY_DIR}/sleef/common")
|
||||
|
|
@ -1,373 +0,0 @@
|
|||
include(CheckCCompilerFlag)
|
||||
include(CheckCSourceCompiles)
|
||||
include(CheckTypeSize)
|
||||
|
||||
# Some toolchains require explicit linking of the following libraries.
|
||||
find_library(LIB_MPFR mpfr)
|
||||
find_library(LIBM m)
|
||||
find_library(LIBGMP gmp)
|
||||
find_library(LIBRT rt)
|
||||
|
||||
find_path(MPFR_INCLUDE_DIR
|
||||
NAMES mpfr.h
|
||||
ONLY_CMAKE_FIND_ROOT_PATH)
|
||||
|
||||
if (NOT LIBM)
|
||||
set(LIBM "")
|
||||
endif()
|
||||
|
||||
if (NOT LIBRT)
|
||||
set(LIBRT "")
|
||||
endif()
|
||||
|
||||
# The library currently supports the following SIMD architectures
|
||||
set(SLEEF_SUPPORTED_EXTENSIONS
|
||||
AVX2 AVX2128 AVX SSE4 SSE2 # x86
|
||||
ADVSIMD SVE # Aarch64
|
||||
NEON32 # Aarch32
|
||||
CACHE STRING "List of SIMD architectures supported by libsleef."
|
||||
)
|
||||
set(SLEEF_SUPPORTED_GNUABI_EXTENSIONS
|
||||
SSE2 AVX AVX2 ADVSIMD SVE
|
||||
CACHE STRING "List of SIMD architectures supported by libsleef for GNU ABI."
|
||||
)
|
||||
|
||||
# Force set default build type if none was specified
|
||||
# Note: some sleef code requires the optimisation flags turned on
|
||||
if(NOT CMAKE_BUILD_TYPE)
|
||||
message(STATUS "Setting build type to 'Release' (required for full support).")
|
||||
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
|
||||
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
|
||||
"Debug" "Release" "RelWithDebInfo" "MinSizeRel")
|
||||
endif()
|
||||
|
||||
# Function used to generate safe command arguments for add_custom_command
|
||||
function(command_arguments PROPNAME)
|
||||
set(quoted_args "")
|
||||
foreach(arg ${ARGN})
|
||||
list(APPEND quoted_args "\"${arg}\"" )
|
||||
endforeach()
|
||||
set(${PROPNAME} ${quoted_args} PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
# PLATFORM DETECTION
|
||||
if((CMAKE_SYSTEM_PROCESSOR MATCHES "x86") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64"))
|
||||
set(SLEEF_ARCH_X86 ON CACHE INTERNAL "True for x86 architecture.")
|
||||
|
||||
set(SLEEF_HEADER_LIST
|
||||
SSE_
|
||||
SSE2
|
||||
SSE4
|
||||
AVX_
|
||||
AVX
|
||||
AVX2
|
||||
AVX2128
|
||||
)
|
||||
command_arguments(HEADER_PARAMS_SSE_ 2 4 __m128d __m128 __m128i __m128i __SSE2__)
|
||||
command_arguments(HEADER_PARAMS_SSE2 2 4 __m128d __m128 __m128i __m128i __SSE2__ sse2)
|
||||
command_arguments(HEADER_PARAMS_SSE4 2 4 __m128d __m128 __m128i __m128i __SSE2__ sse4)
|
||||
command_arguments(HEADER_PARAMS_AVX_ 4 8 __m256d __m256 __m128i "struct { __m128i x, y$<SEMICOLON> }" __AVX__)
|
||||
command_arguments(HEADER_PARAMS_AVX 4 8 __m256d __m256 __m128i "struct { __m128i x, y$<SEMICOLON> }" __AVX__ avx)
|
||||
command_arguments(HEADER_PARAMS_AVX2 4 8 __m256d __m256 __m128i __m256i __AVX__ avx2)
|
||||
command_arguments(HEADER_PARAMS_AVX2128 2 4 __m128d __m128 __m128i __m128i __SSE2__ avx2128)
|
||||
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
|
||||
set(SLEEF_ARCH_AARCH64 ON CACHE INTERNAL "True for Aarch64 architecture.")
|
||||
# Aarch64 requires support for advsimdfma4
|
||||
set(COMPILER_SUPPORTS_ADVSIMD 1)
|
||||
|
||||
set(SLEEF_HEADER_LIST
|
||||
ADVSIMD_
|
||||
ADVSIMD
|
||||
SVE
|
||||
)
|
||||
command_arguments(HEADER_PARAMS_ADVSIMD_ 2 4 float64x2_t float32x4_t int32x2_t int32x4_t __ARM_NEON)
|
||||
command_arguments(HEADER_PARAMS_ADVSIMD 2 4 float64x2_t float32x4_t int32x2_t int32x4_t __ARM_NEON advsimd)
|
||||
command_arguments(HEADER_PARAMS_SVE 2 4 svfloat64_t svfloat32_t svint32_t svint32_t __ARM_FEATURE_SVE sve)
|
||||
|
||||
command_arguments(ALIAS_PARAMS_ADVSIMD_DP 2 float64x2_t int32x2_t n advsimd)
|
||||
command_arguments(ALIAS_PARAMS_ADVSIMD_SP -4 float32x4_t int32x4_t n advsimd)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
|
||||
set(SLEEF_ARCH_AARCH32 ON CACHE INTERNAL "True for Aarch32 architecture.")
|
||||
set(COMPILER_SUPPORTS_NEON32 1)
|
||||
|
||||
set(SLEEF_HEADER_LIST
|
||||
NEON32_
|
||||
NEON32
|
||||
)
|
||||
command_arguments(HEADER_PARAMS_NEON32_ 2 4 - float32x4_t int32x2_t int32x4_t __ARM_NEON__)
|
||||
command_arguments(HEADER_PARAMS_NEON32 2 4 - float32x4_t int32x2_t int32x4_t __ARM_NEON__ neon)
|
||||
|
||||
command_arguments(ALIAS_PARAMS_NEON32_SP -4 float32x4_t int32x4_t - neon)
|
||||
command_arguments(ALIAS_PARAMS_NEON32_DP 0)
|
||||
endif()
|
||||
|
||||
# MKRename arguments per type
|
||||
command_arguments(RENAME_PARAMS_SSE2 2 4 sse2)
|
||||
command_arguments(RENAME_PARAMS_SSE4 2 4 sse4)
|
||||
command_arguments(RENAME_PARAMS_AVX 4 8 avx)
|
||||
command_arguments(RENAME_PARAMS_AVX2 4 8 avx2)
|
||||
command_arguments(RENAME_PARAMS_AVX2128 2 4 avx2128)
|
||||
command_arguments(RENAME_PARAMS_ADVSIMD 2 4 advsimd)
|
||||
command_arguments(RENAME_PARAMS_NEON32 2 4 neon)
|
||||
# The vector length parameters in SVE, for SP and DP, are chosen for
|
||||
# the smallest SVE vector size (128-bit). The name is generated using
|
||||
# the "x" token of VLA SVE vector functions.
|
||||
command_arguments(RENAME_PARAMS_SVE 2 4 sve)
|
||||
|
||||
command_arguments(RENAME_PARAMS_GNUABI_SSE2 sse2 b 2 4 _mm128d _mm128 _mm128i _mm128i __SSE2__)
|
||||
command_arguments(RENAME_PARAMS_GNUABI_AVX avx c 4 8 __m256d __m256 __m128i "struct { __m128i x, y$<SEMICOLON> }" __AVX__)
|
||||
command_arguments(RENAME_PARAMS_GNUABI_AVX2 avx2 d 4 8 __m256d __m256 __m128i __m256i __AVX2__)
|
||||
command_arguments(RENAME_PARAMS_GNUABI_ADVSIMD advsimd n 2 4 float64x2_t float32x4_t int32x2_t int32x4_t __ARM_NEON)
|
||||
# The vector length parameters in SVE, for SP and DP, are chosen for
|
||||
# the smallest SVE vector size (128-bit). The name is generated using
|
||||
# the "x" token of VLA SVE vector functions.
|
||||
command_arguments(RENAME_PARAMS_GNUABI_SVE sve s 2 4 svfloat64_t svfloat32_t svint32_t svint32_t __ARM_SVE)
|
||||
|
||||
|
||||
command_arguments(MKMASKED_PARAMS_GNUABI_SVE_dp sve s 2)
|
||||
command_arguments(MKMASKED_PARAMS_GNUABI_SVE_sp sve s -4)
|
||||
|
||||
# COMPILER DETECTION
|
||||
|
||||
# Detect CLANG executable path (on both Windows and Linux/OSX)
|
||||
if(NOT CLANG_EXE_PATH)
|
||||
# If the current compiler used by CMAKE is already clang, use this one directly
|
||||
if(CMAKE_C_COMPILER MATCHES "clang")
|
||||
set(CLANG_EXE_PATH ${CMAKE_C_COMPILER})
|
||||
else()
|
||||
# Else we may find clang on the path?
|
||||
find_program(CLANG_EXE_PATH NAMES clang "clang-5.0" "clang-4.0" "clang-3.9")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Define the GCC/Clang flags here
|
||||
# As we might compile the lib with MSVC, but generate bitcode with Clang
|
||||
# Intel vector extensions.
|
||||
set(CLANG_FLAGS_ENABLE_SSE2 "-msse2")
|
||||
set(CLANG_FLAGS_ENABLE_SSE4 "-msse4.1")
|
||||
set(CLANG_FLAGS_ENABLE_AVX "-mavx")
|
||||
set(CLANG_FLAGS_ENABLE_AVX2 "-mavx2;-mfma")
|
||||
set(CLANG_FLAGS_ENABLE_AVX2128 "-mavx2;-mfma")
|
||||
set(CLANG_FLAGS_ENABLE_NEON32 "--target=arm-linux-gnueabihf;-mcpu=cortex-a8")
|
||||
# Arm AArch64 vector extensions.
|
||||
set(CLANG_FLAGS_ENABLE_ADVSIMD "-march=armv8-a+simd")
|
||||
set(CLANG_FLAGS_ENABLE_SVE "-march=armv8-a+sve")
|
||||
|
||||
# All variables storing compiler flags should be prefixed with FLAGS_
|
||||
if(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang)")
|
||||
# Always compile sleef with -ffp-contract=off.
|
||||
set(FLAGS_STRICTMATH "-ffp-contract=off")
|
||||
set(FLAGS_FASTMATH "-ffast-math")
|
||||
|
||||
# Without the options below, gcc generates calls to libm
|
||||
set(FLAGS_NO_ERRNO "-fno-math-errno -fno-trapping-math")
|
||||
|
||||
# Vector extension flags for all entries of SLEEF_SUPPORTED_EXTENSIONS.
|
||||
foreach(SIMD ${SLEEF_SUPPORTED_EXTENSIONS})
|
||||
set(FLAGS_ENABLE_${SIMD} ${CLANG_FLAGS_ENABLE_${SIMD}})
|
||||
endforeach()
|
||||
|
||||
# Warning flags.
|
||||
set(FLAGS_WALL "-Wall -Wno-unused -Wno-attributes -Wno-unused-result")
|
||||
if(CMAKE_C_COMPILER_ID MATCHES "GNU")
|
||||
# The following compiler option is needed to suppress the warning
|
||||
# "AVX vector return without AVX enabled changes the ABI" at
|
||||
# src/arch/helpervecext.h:88
|
||||
string(CONCAT FLAGS_WALL ${FLAGS_WALL} " -Wno-psabi")
|
||||
set(FLAGS_ENABLE_NEON32 "-mfpu=neon")
|
||||
endif(CMAKE_C_COMPILER_ID MATCHES "GNU")
|
||||
elseif(MSVC)
|
||||
# Intel vector extensions.
|
||||
set(FLAGS_ENABLE_SSE2 /D__SSE2__)
|
||||
set(FLAGS_ENABLE_SSE4 /D__SSE2__ /D__SSE3__ /D__SSE4_1__)
|
||||
set(FLAGS_ENABLE_AVX /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /arch:AVX)
|
||||
set(FLAGS_ENABLE_AVX2 /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /arch:AVX2)
|
||||
set(FLAGS_ENABLE_AVX2128 /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /arch:AVX2)
|
||||
set(FLAGS_WALL "/D_CRT_SECURE_NO_WARNINGS")
|
||||
set(FLAGS_NO_ERRNO "")
|
||||
elseif(CMAKE_C_COMPILER_ID MATCHES "Intel")
|
||||
set(FLAGS_ENABLE_SSE2 "-msse2")
|
||||
set(FLAGS_ENABLE_SSE4 "-msse4.1")
|
||||
set(FLAGS_ENABLE_AVX "-mavx")
|
||||
set(FLAGS_ENABLE_AVX2 "-march=core-avx2")
|
||||
set(FLAGS_ENABLE_AVX2128 "-march=core-avx2")
|
||||
set(FLAGS_STRICTMATH "-fp-model strict -Qoption,cpp,--extended_float_type -qoverride-limits")
|
||||
set(FLAGS_FASTMATH "-fp-model fast=2 -Qoption,cpp,--extended_float_type -qoverride-limits")
|
||||
set(FLAGS_WALL "-fmax-errors=3 -Wall -Wno-unused -Wno-attributes")
|
||||
set(FLAGS_NO_ERRNO "")
|
||||
endif()
|
||||
|
||||
set(SLEEF_C_FLAGS "${FLAGS_WALL} ${FLAGS_STRICTMATH} ${FLAGS_NO_ERRNO}")
|
||||
if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 6.99)
|
||||
set(DFT_C_FLAGS "${FLAGS_WALL}")
|
||||
else()
|
||||
set(DFT_C_FLAGS "${FLAGS_WALL} ${FLAGS_FASTMATH}")
|
||||
endif()
|
||||
|
||||
if(CYGWIN OR MINGW)
|
||||
set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -fno-asynchronous-unwind-tables")
|
||||
set(DFT_C_FLAGS "${DFT_C_FLAGS} -fno-asynchronous-unwind-tables")
|
||||
endif()
|
||||
|
||||
# FEATURE DETECTION
|
||||
|
||||
CHECK_TYPE_SIZE("long double" LD_SIZE)
|
||||
if(LD_SIZE GREATER "9")
|
||||
# This check is needed because an internal compiler error occurs with gcc 4.x
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
typedef long double vlongdouble __attribute__((vector_size(sizeof(long double)*2)));
|
||||
vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d }; }
|
||||
int main() { vlongdouble vld = vcast_vl_l(0);
|
||||
}" COMPILER_SUPPORTS_LONG_DOUBLE)
|
||||
endif()
|
||||
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
int main() { __float128 r = 1;
|
||||
}" COMPILER_SUPPORTS_FLOAT128)
|
||||
|
||||
# Detect if sleef supported architectures are also supported by the compiler
|
||||
|
||||
set (CMAKE_REQUIRED_FLAGS ${FLAGS_ENABLE_SSE2})
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
int main() {
|
||||
__m128d r = _mm_mul_pd(_mm_set1_pd(1), _mm_set1_pd(2)); }"
|
||||
COMPILER_SUPPORTS_SSE2)
|
||||
|
||||
set (CMAKE_REQUIRED_FLAGS ${FLAGS_ENABLE_SSE4})
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
int main() {
|
||||
__m128d r = _mm_floor_sd(_mm_set1_pd(1), _mm_set1_pd(2)); }"
|
||||
COMPILER_SUPPORTS_SSE4)
|
||||
|
||||
set (CMAKE_REQUIRED_FLAGS ${FLAGS_ENABLE_AVX})
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
int main() {
|
||||
__m256d r = _mm256_add_pd(_mm256_set1_pd(1), _mm256_set1_pd(2));
|
||||
}" COMPILER_SUPPORTS_AVX)
|
||||
|
||||
set (CMAKE_REQUIRED_FLAGS ${FLAGS_ENABLE_AVX2})
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
int main() {
|
||||
__m256i r = _mm256_abs_epi32(_mm256_set1_epi32(1)); }"
|
||||
COMPILER_SUPPORTS_AVX2)
|
||||
|
||||
set (CMAKE_REQUIRED_FLAGS ${FLAGS_ENABLE_SVE})
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#include <arm_sve.h>
|
||||
int main() {
|
||||
svint32_t r = svdup_n_s32(1); }"
|
||||
COMPILER_SUPPORTS_SVE)
|
||||
|
||||
# AVX2 implies AVX2128
|
||||
if(COMPILER_SUPPORTS_AVX2)
|
||||
set(COMPILER_SUPPORTS_AVX2128 1)
|
||||
endif()
|
||||
|
||||
# Check if compilation with OpenMP really succeeds
|
||||
# It does not succeed on Travis even though find_package(OpenMP) succeeds.
|
||||
find_package(OpenMP)
|
||||
if(OPENMP_FOUND)
|
||||
set (CMAKE_REQUIRED_FLAGS "${OpenMP_C_FLAGS}")
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#include <stdio.h>
|
||||
int main() {
|
||||
int i;
|
||||
#pragma omp parallel for
|
||||
for(i=0;i < 10;i++) { putchar(0); }
|
||||
}"
|
||||
COMPILER_SUPPORTS_OPENMP)
|
||||
endif(OPENMP_FOUND)
|
||||
|
||||
# Check weak aliases are supported.
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#if defined(__CYGWIN__)
|
||||
#define EXPORT __stdcall __declspec(dllexport)
|
||||
#else
|
||||
#define EXPORT
|
||||
#endif
|
||||
EXPORT int f(int a) {
|
||||
return a + 2;
|
||||
}
|
||||
EXPORT int g(int a) __attribute__((weak, alias(\"f\")));
|
||||
int main(void) {
|
||||
return g(2);
|
||||
}"
|
||||
COMPILER_SUPPORTS_WEAK_ALIASES)
|
||||
if (COMPILER_SUPPORTS_WEAK_ALIASES AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm" AND NOT MINGW)
|
||||
set(ENABLE_GNUABI ${COMPILER_SUPPORTS_WEAK_ALIASES})
|
||||
endif()
|
||||
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
int main(void) {
|
||||
double a = __builtin_sqrt (2);
|
||||
float b = __builtin_sqrtf(2);
|
||||
}"
|
||||
COMPILER_SUPPORTS_BUILTIN_MATH)
|
||||
|
||||
# Reset used flags
|
||||
set(CMAKE_REQUIRED_FLAGS)
|
||||
|
||||
# Save the default C flags
|
||||
# Dereference and quote CMAKE_C_FLAGS so the saved value is the flag string
# itself (possibly empty), not the literal variable name "CMAKE_C_FLAGS".
set(ORG_CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
||||
|
||||
# Check if sde64 command is available
|
||||
|
||||
find_program(SDE_COMMAND sde64)
|
||||
if (NOT SDE_COMMAND)
|
||||
find_program(SDE_COMMAND sde)
|
||||
endif()
|
||||
|
||||
# Check if armie command is available
|
||||
|
||||
find_program(ARMIE_COMMAND armie)
|
||||
if (NOT SVE_VECTOR_BITS)
|
||||
set(SVE_VECTOR_BITS 128)
|
||||
endif()
|
||||
##
|
||||
|
||||
if(SLEEF_SHOW_ERROR_LOG)
|
||||
if (EXISTS ${PROJECT_BINARY_DIR}/CMakeFiles/CMakeError.log)
|
||||
file(READ ${PROJECT_BINARY_DIR}/CMakeFiles/CMakeError.log FILE_CONTENT)
|
||||
message("${FILE_CONTENT}")
|
||||
endif()
|
||||
endif(SLEEF_SHOW_ERROR_LOG)
|
||||
|
||||
# Detect if cmake is running on Travis
|
||||
string(COMPARE NOTEQUAL "" "$ENV{TRAVIS}" RUNNING_ON_TRAVIS)
|
||||
|
||||
if (${RUNNING_ON_TRAVIS} AND CMAKE_C_COMPILER_ID MATCHES "Clang")
|
||||
# Report that the Travis-specific workaround branch was taken.
message("Travis bug workaround turned on")
|
||||
set(COMPILER_SUPPORTS_OPENMP FALSE) # Workaround for https://github.com/travis-ci/travis-ci/issues/8613
|
||||
set(COMPILER_SUPPORTS_FLOAT128 FALSE) # Compilation on unroll_0_vecextqp.c does not finish on Travis
|
||||
endif()
|
||||
|
||||
# Set common definitions
|
||||
|
||||
if (NOT BUILD_SHARED_LIBS)
|
||||
set(COMMON_TARGET_DEFINITIONS SLEEF_STATIC_LIBS=1)
|
||||
endif()
|
||||
|
||||
if (COMPILER_SUPPORTS_WEAK_ALIASES)
|
||||
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} ENABLE_ALIAS=1)
|
||||
endif()
|
||||
|
|
@ -2,7 +2,6 @@
|
|||
|
||||
#include "intrinsics.h"
|
||||
#include "vec256_base.h"
|
||||
#include <sleef.h>
|
||||
|
||||
namespace at {
|
||||
namespace vec256 {
|
||||
|
|
@ -55,36 +54,6 @@ public:
|
|||
auto mask = _mm256_set1_pd(-0.f);
|
||||
return _mm256_andnot_pd(mask, values);
|
||||
}
|
||||
Vec256<double> acos() const {
|
||||
return Vec256<double>(Sleef_acosd4_u10(values));
|
||||
}
|
||||
Vec256<double> asin() const {
|
||||
return Vec256<double>(Sleef_asind4_u10(values));
|
||||
}
|
||||
Vec256<double> atan() const {
|
||||
return Vec256<double>(Sleef_atand4_u10(values));
|
||||
}
|
||||
Vec256<double> erf() const {
|
||||
return Vec256<double>(Sleef_erfd4_u10(values));
|
||||
}
|
||||
Vec256<double> exp() const {
|
||||
return Vec256<double>(Sleef_expd4_u10(values));
|
||||
}
|
||||
Vec256<double> expm1() const {
|
||||
return Vec256<double>(Sleef_expm1d4_u10(values));
|
||||
}
|
||||
Vec256<double> log() const {
|
||||
return Vec256<double>(Sleef_logd4_u10(values));
|
||||
}
|
||||
Vec256<double> log2() const {
|
||||
return Vec256<double>(Sleef_log2d4_u10(values));
|
||||
}
|
||||
Vec256<double> log10() const {
|
||||
return Vec256<double>(Sleef_log10d4_u10(values));
|
||||
}
|
||||
Vec256<double> log1p() const {
|
||||
return Vec256<double>(Sleef_log1pd4_u10(values));
|
||||
}
|
||||
Vec256<double> sin() const {
|
||||
return map(std::sin);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@
|
|||
|
||||
#include "intrinsics.h"
|
||||
#include "vec256_base.h"
|
||||
#include <sleef.h>
|
||||
#include <iostream>
|
||||
|
||||
namespace at {
|
||||
|
|
@ -56,36 +55,6 @@ public:
|
|||
auto mask = _mm256_set1_ps(-0.f);
|
||||
return _mm256_andnot_ps(mask, values);
|
||||
}
|
||||
Vec256<float> acos() const {
|
||||
return Vec256<float>(Sleef_acosf8_u10(values));
|
||||
}
|
||||
Vec256<float> asin() const {
|
||||
return Vec256<float>(Sleef_asinf8_u10(values));
|
||||
}
|
||||
Vec256<float> atan() const {
|
||||
return Vec256<float>(Sleef_atanf8_u10(values));
|
||||
}
|
||||
Vec256<float> erf() const {
|
||||
return Vec256<float>(Sleef_erff8_u10(values));
|
||||
}
|
||||
Vec256<float> exp() const {
|
||||
return Vec256<float>(Sleef_expf8_u10(values));
|
||||
}
|
||||
Vec256<float> expm1() const {
|
||||
return Vec256<float>(Sleef_expm1f8_u10(values));
|
||||
}
|
||||
Vec256<float> log() const {
|
||||
return Vec256<float>(Sleef_logf8_u10(values));
|
||||
}
|
||||
Vec256<float> log2() const {
|
||||
return Vec256<float>(Sleef_log2f8_u10(values));
|
||||
}
|
||||
Vec256<float> log10() const {
|
||||
return Vec256<float>(Sleef_log10f8_u10(values));
|
||||
}
|
||||
Vec256<float> log1p() const {
|
||||
return Vec256<float>(Sleef_log1pf8_u10(values));
|
||||
}
|
||||
Vec256<float> sin() const {
|
||||
return map(std::sin);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,7 +6,6 @@
|
|||
|
||||
#include "ATen/CPUApplyUtils.h"
|
||||
#include "ATen/Parallel.h"
|
||||
#include "ATen/native/cpu/UnaryOpsKernel.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
|
@ -66,36 +65,6 @@ Tensor& fill_(Tensor& self, const Tensor& value) {
|
|||
return result; \
|
||||
}
|
||||
|
||||
#define IMPLEMENT_UNARY_OP_VEC(op, opfn) \
|
||||
Tensor& _##op##__cpu(Tensor& self_) { \
|
||||
if (self_.numel() > 0) { \
|
||||
Tensor self = sort_strides(self_); \
|
||||
if (self.is_contiguous()) { \
|
||||
op##Impl(self, self); \
|
||||
} else { \
|
||||
AT_DISPATCH_FLOATING_TYPES(self.type(), op, [&] { \
|
||||
CPU_tensor_parallel_apply1<scalar_t>( \
|
||||
self, [](scalar_t& y) { y = opfn(y); }); \
|
||||
}); \
|
||||
} \
|
||||
} \
|
||||
return self_; \
|
||||
} \
|
||||
Tensor& _##op##_out_cpu(Tensor& result, const Tensor& self) { \
|
||||
result.resize_(self.sizes()); \
|
||||
if (result.numel() > 0) { \
|
||||
if (result.is_contiguous() && self.is_contiguous()) { \
|
||||
op##Impl(result, self); \
|
||||
} else { \
|
||||
AT_DISPATCH_FLOATING_TYPES(self.type(), op, [&] { \
|
||||
CPU_tensor_parallel_apply2<scalar_t, scalar_t>( \
|
||||
result, self, [](scalar_t& y, scalar_t& x) { y = opfn(x); }); \
|
||||
}); \
|
||||
} \
|
||||
} \
|
||||
return result; \
|
||||
}
|
||||
|
||||
IMPLEMENT_UNARY_OP_PREQUEL(abs)
|
||||
IMPLEMENT_UNARY_OP_PREQUEL(acos)
|
||||
IMPLEMENT_UNARY_OP_PREQUEL(asin)
|
||||
|
|
@ -130,28 +99,48 @@ Tensor& _tanh_out_cuda(Tensor& result, const Tensor& self) {
|
|||
return at::_th_tanh_out(result, self);
|
||||
}
|
||||
|
||||
IMPLEMENT_UNARY_OP_VEC(abs, std::abs)
|
||||
IMPLEMENT_UNARY_OP_VEC(acos, std::acos)
|
||||
IMPLEMENT_UNARY_OP_VEC(asin, std::asin)
|
||||
IMPLEMENT_UNARY_OP_VEC(atan, std::atan)
|
||||
IMPLEMENT_UNARY_OP_VEC(ceil, std::ceil)
|
||||
Tensor& _abs__cpu(Tensor& self_) {
|
||||
if (self_.numel() > 0) {
|
||||
Tensor self = sort_strides(self_);
|
||||
AT_DISPATCH_ALL_TYPES(self.type(), abs, [&] {
|
||||
CPU_tensor_parallel_apply1<scalar_t>(
|
||||
self, [](scalar_t& y) { y = std::abs(y); });
|
||||
});
|
||||
}
|
||||
return self_;
|
||||
}
|
||||
Tensor& _abs_out_cpu(Tensor& result, const Tensor& self) {
|
||||
result.resize_(self.sizes());
|
||||
if (result.numel() > 0) {
|
||||
AT_DISPATCH_ALL_TYPES(self.type(), abs, [&] {
|
||||
CPU_tensor_parallel_apply2<scalar_t, scalar_t>(
|
||||
result, self, [](scalar_t& y, scalar_t& x) { y = std::abs(x); });
|
||||
});
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(acos, std::acos)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(asin, std::asin)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(atan, std::atan)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(ceil, std::ceil)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(cos, std::cos)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(cosh, std::cosh)
|
||||
IMPLEMENT_UNARY_OP_VEC(erf, std::erf)
|
||||
IMPLEMENT_UNARY_OP_VEC(exp, std::exp)
|
||||
IMPLEMENT_UNARY_OP_VEC(expm1, std::expm1)
|
||||
IMPLEMENT_UNARY_OP_VEC(floor, std::floor)
|
||||
IMPLEMENT_UNARY_OP_VEC(log, std::log)
|
||||
IMPLEMENT_UNARY_OP_VEC(log10, std::log10)
|
||||
IMPLEMENT_UNARY_OP_VEC(log1p, std::log1p)
|
||||
IMPLEMENT_UNARY_OP_VEC(log2, std::log2)
|
||||
IMPLEMENT_UNARY_OP_VEC(round, std::round)
|
||||
IMPLEMENT_UNARY_OP_VEC(rsqrt, 1 / std::sqrt)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(erf, std::erf)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(exp, std::exp)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(expm1, std::expm1)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(floor, std::floor)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(log, std::log)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(log10, std::log10)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(log1p, std::log1p)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(log2, std::log2)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(round, std::round)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(rsqrt, 1 / std::sqrt)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(sin, std::sin)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(sinh, std::sinh)
|
||||
IMPLEMENT_UNARY_OP_VEC(sqrt, std::sqrt)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(sqrt, std::sqrt)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(tan, std::tan)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(tanh, std::tanh)
|
||||
IMPLEMENT_UNARY_OP_VEC(trunc, std::trunc)
|
||||
IMPLEMENT_UNARY_OP_FLOAT_CMATH(trunc, std::trunc)
|
||||
}
|
||||
} // namespace at
|
||||
|
|
|
|||
|
|
@ -1,102 +0,0 @@
|
|||
#include "ATen/native/cpu/UnaryOpsKernel.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include "ATen/Dispatch.h"
|
||||
#include "ATen/Parallel.h"
|
||||
#include "ATen/cpu/vec256/vec256.h"
|
||||
#include "ATen/native/cpu/CapabilityDispatch.h"
|
||||
|
||||
namespace at { namespace native {
|
||||
namespace {
|
||||
|
||||
using namespace vec256;
|
||||
|
||||
template <typename scalar_t, typename F>
|
||||
static void
|
||||
unary_kernel(scalar_t* arr_out, const scalar_t* arr_in, int64_t size, F func) {
|
||||
using Vec = Vec256<scalar_t>;
|
||||
int64_t size_rounded = size - (size % Vec::size);
|
||||
int64_t k = 0;
|
||||
for (; k != size_rounded; k += Vec::size) {
|
||||
auto value = func(Vec::s_load(arr_in + k));
|
||||
value.store(arr_out + k);
|
||||
}
|
||||
auto leftover = size - k;
|
||||
if (leftover > 0) {
|
||||
Vec a;
|
||||
a.load_partial(arr_in + k, leftover);
|
||||
func(a).store_partial(arr_out + k, leftover);
|
||||
}
|
||||
}
|
||||
|
||||
template <class scalar_t, class F>
|
||||
static void parallel_apply(Tensor& result, const Tensor& self, F f) {
|
||||
internal::init_tbb_num_threads();
|
||||
|
||||
static tbb::affinity_partitioner ap;
|
||||
|
||||
auto arr_out = result.data<scalar_t>();
|
||||
auto arr_in = self.data<scalar_t>();
|
||||
int64_t size = self.numel();
|
||||
if (size < internal::TBB_GRAIN_SIZE) {
|
||||
unary_kernel(arr_out, arr_in, size, f);
|
||||
} else {
|
||||
tbb::parallel_for(
|
||||
tbb::blocked_range<int64_t>(0, size, internal::TBB_GRAIN_SIZE),
|
||||
[&](const tbb::blocked_range<int64_t>& r) {
|
||||
auto size = r.end() - r.begin();
|
||||
unary_kernel(arr_out + r.begin(), arr_in + r.begin(), size, f);
|
||||
},
|
||||
ap);
|
||||
}
|
||||
}
|
||||
|
||||
static void abs_kernel(Tensor& result, const Tensor& self) {
|
||||
AT_DISPATCH_ALL_TYPES(self.type(), "abs", [&] {
|
||||
parallel_apply<scalar_t>(
|
||||
result,
|
||||
self,
|
||||
[](const Vec256<scalar_t>& x) { return x.abs(); }); });
|
||||
}
|
||||
|
||||
static void rsqrt_kernel(Tensor& result, const Tensor& self) {
|
||||
AT_DISPATCH_FLOATING_TYPES(self.type(), "rsqrt", [&] {
|
||||
parallel_apply<scalar_t>(
|
||||
result,
|
||||
self,
|
||||
[](const Vec256<scalar_t>& x) { return Vec256<scalar_t>((scalar_t)(1)) / x.sqrt(); }); });
|
||||
}
|
||||
|
||||
#define IMPLEMENT_FLOAT_KERNEL(op) \
|
||||
static void op##_kernel(Tensor& result, const Tensor& self) { \
|
||||
AT_DISPATCH_FLOATING_TYPES(self.type(), #op, [&] { \
|
||||
parallel_apply<scalar_t>( \
|
||||
result, self, [](const Vec256<scalar_t>& x) { return x.op(); }); \
|
||||
}); \
|
||||
} \
|
||||
REGISTER_DISPATCH(op##Impl, &op##_kernel)
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
|
||||
REGISTER_DISPATCH(absImpl, &abs_kernel);
|
||||
REGISTER_DISPATCH(rsqrtImpl, &rsqrt_kernel);
|
||||
|
||||
IMPLEMENT_FLOAT_KERNEL(acos)
|
||||
IMPLEMENT_FLOAT_KERNEL(asin)
|
||||
IMPLEMENT_FLOAT_KERNEL(atan)
|
||||
IMPLEMENT_FLOAT_KERNEL(erf)
|
||||
IMPLEMENT_FLOAT_KERNEL(exp)
|
||||
IMPLEMENT_FLOAT_KERNEL(expm1)
|
||||
IMPLEMENT_FLOAT_KERNEL(log)
|
||||
IMPLEMENT_FLOAT_KERNEL(log10)
|
||||
IMPLEMENT_FLOAT_KERNEL(log1p)
|
||||
IMPLEMENT_FLOAT_KERNEL(log2)
|
||||
IMPLEMENT_FLOAT_KERNEL(ceil)
|
||||
IMPLEMENT_FLOAT_KERNEL(floor)
|
||||
IMPLEMENT_FLOAT_KERNEL(round)
|
||||
IMPLEMENT_FLOAT_KERNEL(sqrt)
|
||||
IMPLEMENT_FLOAT_KERNEL(trunc)
|
||||
|
||||
}} // namespace at::native
|
||||
|
|
@ -1,49 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <stdexcept>
|
||||
#include "CapabilityDispatch.h"
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
using unary_fn = void(*)(Tensor&, const Tensor&);
|
||||
|
||||
extern DispatchStub<unary_fn> absImpl;
|
||||
extern DispatchStub<unary_fn> acosImpl;
|
||||
extern DispatchStub<unary_fn> asinImpl;
|
||||
extern DispatchStub<unary_fn> atanImpl;
|
||||
extern DispatchStub<unary_fn> ceilImpl;
|
||||
extern DispatchStub<unary_fn> erfImpl;
|
||||
extern DispatchStub<unary_fn> expImpl;
|
||||
extern DispatchStub<unary_fn> expm1Impl;
|
||||
extern DispatchStub<unary_fn> fracImpl;
|
||||
extern DispatchStub<unary_fn> floorImpl;
|
||||
extern DispatchStub<unary_fn> logImpl;
|
||||
extern DispatchStub<unary_fn> log10Impl;
|
||||
extern DispatchStub<unary_fn> log1pImpl;
|
||||
extern DispatchStub<unary_fn> log2Impl;
|
||||
extern DispatchStub<unary_fn> roundImpl;
|
||||
extern DispatchStub<unary_fn> rsqrtImpl;
|
||||
extern DispatchStub<unary_fn> sqrtImpl;
|
||||
extern DispatchStub<unary_fn> truncImpl;
|
||||
|
||||
|
||||
// Missing unary functions
|
||||
// digamma
|
||||
// lgamma
|
||||
|
||||
// TODO: See below
|
||||
// erfinv
|
||||
// fill
|
||||
// frac
|
||||
// clone
|
||||
// contiguous
|
||||
// clamp/_min/_max
|
||||
// neg
|
||||
// reciprocal
|
||||
// sigmoid
|
||||
// sign
|
||||
// zero
|
||||
|
||||
|
||||
}} // namespace at::native
|
||||
1
third_party/sleef
vendored
1
third_party/sleef
vendored
|
|
@ -1 +0,0 @@
|
|||
Subproject commit e4217b4fdcfc47b0b073d490c0ddeef5f0eb5fc9
|
||||
Loading…
Reference in a new issue