pytorch/cmake/Codegen.cmake
Abdelrauf 95a1725a4a Vsx initial support issue27678 (#41541)
Summary:
### PyTorch Vec256 ppc64le support
Implemented types:

- double
- float
- int16
- int32
- int64
- qint32
- qint8
- quint8
- complex_float
- complex_double

Notes:
All basic vector operations are implemented. There are a few known problems:
- minimum/maximum NaN propagation for ppc64le is missing and was not checked
- complex multiplication, division, sqrt, and abs are implemented the same way as on PyTorch x86; they can overflow and have worse precision than the std:: versions (see the formula after this list). That's why they were either excluded from testing or tested over a smaller domain
- the precision of the implemented float math functions is limited
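
For context, the overflow in complex division comes from the textbook expansion (a sketch of the failure mode, not necessarily the exact code path):

$$\frac{a+bi}{c+di}=\frac{ac+bd}{c^{2}+d^{2}}+\frac{bc-ad}{c^{2}+d^{2}}\,i$$

The denominator $c^2+d^2$ already overflows to infinity once $|c|$ or $|d|$ exceeds roughly $\sqrt{\mathrm{FLT\_MAX}}\approx 1.8\times 10^{19}$ for float, whereas std:: implementations typically scale the operands first; hence the restricted test domains.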

~~Besides, I added a CPU_CAPABILITY for Power, but because of quantization errors for DEFAULT I had to undef it and use VSX for DEFAULT too~~
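
For reference, this is how the VSX capability ends up registered alongside the x86 ones in `cmake/Codegen.cmake` (excerpt from the file shown below):

```cmake
if(CXX_VSX_FOUND)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_VSX_CPU_DEFINITION")
  list(APPEND CPU_CAPABILITY_NAMES "VSX")
  list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} ${CXX_VSX_FLAGS}")
endif(CXX_VSX_FOUND)
```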

#### Details
##### Supported math functions

Legend:

- `+` means vectorized; `-` means missing. Implementation notes are added inside parentheses. Example: `-(both)` means it is also missing on the x86 side
- `f(func_name)` means the vectorization is implemented in terms of `func_name`
- `sleef` means the call is redirected to the Sleef library
- `unsupported` means the operation is not supported for that type

| function_name | float | double | complex float | complex double |
| -- | -- | -- | -- | -- |
acos | sleef | sleef | f(asin) | f(asin)
asin | sleef | sleef | +(pytorch impl) | +(pytorch impl)
atan | sleef | sleef | f(log) | f(log)
atan2 | sleef | sleef | unsupported | unsupported
cos | +(ppc64le: avx_mathfun) | sleef | -(both) | -(both)
cosh | f(exp)   | -(both) | -(both) |
erf | sleef | sleef | unsupported | unsupported
erfc | sleef | sleef | unsupported | unsupported
erfinv | - (both) | - (both) | unsupported | unsupported
exp | + | sleef | - (x86:f()) | - (x86:f())
expm1 | f(exp)  | sleef | unsupported | unsupported
lgamma | sleef | sleef |   |
log | +  | sleef | -(both) | -(both)
log10 | f(log)  | sleef | f(log) | f(log)
log1p | f(log)  | sleef | unsupported | unsupported
log2 | f(log)  | sleef | f(log) | f(log)
pow | + f(exp)  | sleef | -(both) | -(both)
sin | +(ppc64le: avx_mathfun) | sleef | -(both) | -(both)
sinh | f(exp)  | sleef | -(both) | -(both)
tan | sleef | sleef | -(both) | -(both)
tanh | f(exp)  | sleef | -(both) | -(both)
hypot | sleef | sleef | -(both) | -(both)
nextafter | sleef  | sleef | -(both) | -(both)
fmod | sleef | sleef | -(both) | -(both)

[Vec256 test cases PR #42685](https://github.com/pytorch/pytorch/pull/42685)
Current list:

- [x] Blends
- [x] Memory: UnAlignedLoadStore
- [x] Arithmetics: Plus, Minus, Multiplication, Division
- [x] Bitwise: BitAnd, BitOr, BitXor
- [x] Comparison: Equal, NotEqual, Greater, Less, GreaterEqual, LessEqual
- [x] MinMax: Minimum, Maximum, ClampMin, ClampMax, Clamp
- [x] SignManipulation: Absolute, Negate
- [x] Interleave: Interleave, DeInterleave
- [x] Rounding: Round, Ceil, Floor, Trunc
- [x] Mask: ZeroMask
- [x] SqrtAndReciprocal: Sqrt, RSqrt, Reciprocal
- [x] Trigonometric: Sin, Cos, Tan
- [x] Hyperbolic: Tanh, Sinh, Cosh
- [x] InverseTrigonometric: Asin, ACos, ATan, ATan2
- [x] Logarithm: Log, Log2, Log10, Log1p
- [x] Exponents: Exp, Expm1
- [x] ErrorFunctions: Erf, Erfc, Erfinv
- [x] Pow: Pow
- [x] LGamma: LGamma
- [x] Quantization: quantize, dequantize, requantize_from_int
- [x] Quantization: widening_subtract, relu, relu6
Missing:
- [ ] Constructors, initializations
- [ ] Conversion, Cast
- [ ] Additional: imag, conj, angle (note: imag and conj only checked for float complex)

#### Notes on tests and testing framework
- some math functions are tested only within a restricted domain range
- the testing framework mostly tests randomly against the std implementation, within the full domain or, for some math functions, within the implementation's domain
- some functions are tested against a local reference version. ~~For example, std::round and the vector version of round differ, so round was tested against the local version~~
- round was tested against PyTorch's at::native::round_impl. ~~For the double type on **VSX, vec_round failed for (even)+0.5 values**~~; this was solved by using vec_rint
- ~~**complex types are not tested**~~ **After enabling complex testing, some of the complex functions failed for VSX and for x86 AVX as well, due to precision and domain issues. I will either test them against a local implementation or check them within the accepted domain**
- ~~quantizations are not tested~~ Added tests for the quantize, dequantize, requantize_from_int, relu, relu6, and widening_subtract functions
- the testing framework should be improved further
- ~~For now `-DBUILD_MOBILE_TEST=ON` will be used for Vec256Test too~~ Vec256 test cases will be built for each CPU_CAPABILITY

Pull Request resolved: https://github.com/pytorch/pytorch/pull/41541

Reviewed By: zhangguanheng66

Differential Revision: D23922049

Pulled By: VitalyFedyunin

fbshipit-source-id: bca25110afccecbb362cea57c705f3ce02f26098
2020-12-10 13:42:39 -08:00


# This ill-named file does a number of things:
# - Installs Caffe2 header files (this has nothing to do with code generation)
# - Configures caffe2/core/macros.h
# - Creates an ATen target for its generated C++ files and adds it
# as a dependency
# - Reads build lists defined in build_variables.bzl
################################################################################
# Helper functions
################################################################################
function(filter_list output input)
  unset(result)
  foreach(filename ${${input}})
    foreach(pattern ${ARGN})
      if("${filename}" MATCHES "${pattern}")
        list(APPEND result "${filename}")
      endif()
    endforeach()
  endforeach()
  set(${output} ${result} PARENT_SCOPE)
endfunction()
function(filter_list_exclude output input)
  unset(result)
  foreach(filename ${${input}})
    foreach(pattern ${ARGN})
      if(NOT "${filename}" MATCHES "${pattern}")
        list(APPEND result "${filename}")
      endif()
    endforeach()
  endforeach()
  set(${output} ${result} PARENT_SCOPE)
endfunction()
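# Example (hypothetical) usage of the helpers above: given
#   set(my_srcs "a.cpp;b.h;c.cpp")
# then
#   filter_list(cpp_only my_srcs "\\.cpp$")          # cpp_only = "a.cpp;c.cpp"
#   filter_list_exclude(no_cpp my_srcs "\\.cpp$")    # no_cpp   = "b.h"
# The patterns are CMake regular expressions matched against each filename.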
################################################################################
# ---[ Write the macros file
configure_file(
    ${CMAKE_CURRENT_LIST_DIR}/../caffe2/core/macros.h.in
    ${CMAKE_BINARY_DIR}/caffe2/core/macros.h)
# ---[ Installing the header files
install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../caffe2
        DESTINATION include
        FILES_MATCHING PATTERN "*.h")
if(NOT INTERN_BUILD_ATEN_OPS)
  install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core
          DESTINATION include/ATen
          FILES_MATCHING PATTERN "*.h")
endif()
install(FILES ${CMAKE_BINARY_DIR}/caffe2/core/macros.h
        DESTINATION include/caffe2/core)
# ---[ ATen specific
if(INTERN_BUILD_ATEN_OPS)
  if(MSVC)
    set(OPT_FLAG "/fp:strict ")
  else(MSVC)
    set(OPT_FLAG "-O3 ")
    if("${CMAKE_BUILD_TYPE}" MATCHES "Debug")
      set(OPT_FLAG " ")
    endif()
  endif(MSVC)

  if(C_AVX_FOUND)
    if(MSVC)
      set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../aten/src/TH/vector/AVX.cpp PROPERTIES COMPILE_FLAGS "${OPT_FLAG}/arch:AVX ${CXX_AVX_FLAGS}")
    else(MSVC)
      set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../aten/src/TH/vector/AVX.cpp PROPERTIES COMPILE_FLAGS "${OPT_FLAG} ${CXX_AVX_FLAGS}")
    endif(MSVC)
  endif(C_AVX_FOUND)

  if(NOT MSVC AND NOT "${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
    set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../aten/src/TH/THAllocator.cpp PROPERTIES COMPILE_FLAGS "-fno-openmp")
  endif()
  file(GLOB cpu_kernel_cpp_in "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/cpu/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/quantized/cpu/kernels/*.cpp")

  list(APPEND CPU_CAPABILITY_NAMES "DEFAULT")
  list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}")

  if(CXX_AVX_FOUND)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX_CPU_DEFINITION")
    list(APPEND CPU_CAPABILITY_NAMES "AVX")
    if(MSVC)
      list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX")
    else(MSVC)
      list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx")
    endif(MSVC)
  endif(CXX_AVX_FOUND)

  if(CXX_AVX2_FOUND)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX2_CPU_DEFINITION")

    # Some versions of GCC pessimistically split unaligned load and store
    # instructions when using the default tuning. This is a bad choice on
    # new Intel and AMD processors so we disable it when compiling with AVX2.
    # See https://stackoverflow.com/questions/52626726/why-doesnt-gcc-resolve-mm256-loadu-pd-as-single-vmovupd#tab-top
    check_cxx_compiler_flag("-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store" COMPILER_SUPPORTS_NO_AVX256_SPLIT)
    if(COMPILER_SUPPORTS_NO_AVX256_SPLIT)
      set(CPU_NO_AVX256_SPLIT_FLAGS "-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store")
    endif(COMPILER_SUPPORTS_NO_AVX256_SPLIT)

    list(APPEND CPU_CAPABILITY_NAMES "AVX2")
    if(MSVC)
      list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX2")
    else(MSVC)
      list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx2 -mfma ${CPU_NO_AVX256_SPLIT_FLAGS}")
    endif(MSVC)
  endif(CXX_AVX2_FOUND)

  if(CXX_VSX_FOUND)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_VSX_CPU_DEFINITION")
    list(APPEND CPU_CAPABILITY_NAMES "VSX")
    list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} ${CXX_VSX_FLAGS}")
  endif(CXX_VSX_FOUND)
  list(LENGTH CPU_CAPABILITY_NAMES NUM_CPU_CAPABILITY_NAMES)
  math(EXPR NUM_CPU_CAPABILITY_NAMES "${NUM_CPU_CAPABILITY_NAMES}-1")

  # The sources list might get reordered later based on the capabilities.
  # See NOTE [ Linking AVX and non-AVX files ]
  foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES})
    foreach(IMPL ${cpu_kernel_cpp_in})
      string(REPLACE "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/" "" NAME ${IMPL})
      list(GET CPU_CAPABILITY_NAMES ${i} CPU_CAPABILITY)
      set(NEW_IMPL ${CMAKE_BINARY_DIR}/aten/src/ATen/${NAME}.${CPU_CAPABILITY}.cpp)
      configure_file(${IMPL} ${NEW_IMPL} COPYONLY)
      set(cpu_kernel_cpp ${NEW_IMPL} ${cpu_kernel_cpp}) # Create list of copies
      list(GET CPU_CAPABILITY_FLAGS ${i} FLAGS)
      if(MSVC)
        set(EXTRA_FLAGS "/DCPU_CAPABILITY=${CPU_CAPABILITY} /DCPU_CAPABILITY_${CPU_CAPABILITY}")
      else(MSVC)
        set(EXTRA_FLAGS "-DCPU_CAPABILITY=${CPU_CAPABILITY} -DCPU_CAPABILITY_${CPU_CAPABILITY}")
      endif(MSVC)
      # Disable certain warnings for GCC-9.X
      if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0))
        if(("${NAME}" STREQUAL "native/cpu/GridSamplerKernel.cpp") AND ("${CPU_CAPABILITY}" STREQUAL "DEFAULT"))
          # See https://github.com/pytorch/pytorch/issues/38855
          set(EXTRA_FLAGS "${EXTRA_FLAGS} -Wno-uninitialized")
        endif()
        if("${NAME}" STREQUAL "native/quantized/cpu/kernels/QuantizedOpKernels.cpp")
          # See https://github.com/pytorch/pytorch/issues/38854
          set(EXTRA_FLAGS "${EXTRA_FLAGS} -Wno-deprecated-copy")
        endif()
      endif()
      set_source_files_properties(${NEW_IMPL} PROPERTIES COMPILE_FLAGS "${FLAGS} ${EXTRA_FLAGS}")
    endforeach()
  endforeach()
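  # Illustration with a hypothetical kernel name: native/cpu/SomeKernel.cpp
  # and capabilities DEFAULT;AVX;AVX2 yield three copies, e.g.
  #   ${CMAKE_BINARY_DIR}/aten/src/ATen/native/cpu/SomeKernel.cpp.AVX2.cpp
  # compiled with flags like "-mavx2 -mfma -DCPU_CAPABILITY=AVX2 -DCPU_CAPABILITY_AVX2".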
  list(APPEND ATen_CPU_SRCS ${cpu_kernel_cpp})

  file(GLOB_RECURSE all_python "${CMAKE_CURRENT_LIST_DIR}/../tools/codegen/*.py")

  set(GEN_ROCM_FLAG)
  if(USE_ROCM)
    set(GEN_ROCM_FLAG --rocm)
  endif()

  set(CUSTOM_BUILD_FLAGS)
  if(INTERN_BUILD_MOBILE)
    if(USE_VULKAN)
      list(APPEND CUSTOM_BUILD_FLAGS --backend_whitelist CPU QuantizedCPU Vulkan)
    else()
      list(APPEND CUSTOM_BUILD_FLAGS --backend_whitelist CPU QuantizedCPU)
    endif()
  endif()
  if(SELECTED_OP_LIST)
    if(NOT OP_DEPENDENCY)
      message(STATUS "Using the default op dependency graph .yaml file for custom build with dynamic dispatch.")
      set(OP_DEPENDENCY ${CMAKE_CURRENT_LIST_DIR}/../tools/code_analyzer/default_op_deps.yaml)
    endif()
    execute_process(
      COMMAND
        "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_LIST_DIR}/../tools/code_analyzer/gen_op_registration_allowlist.py
        --op-dependency "${OP_DEPENDENCY}"
        --root-ops "${SELECTED_OP_LIST}"
      OUTPUT_VARIABLE OP_REGISTRATION_WHITELIST
    )
    separate_arguments(OP_REGISTRATION_WHITELIST)
    message(STATUS "Custom build with op registration whitelist: ${OP_REGISTRATION_WHITELIST}")
    list(APPEND CUSTOM_BUILD_FLAGS
      --force_schema_registration
      --op_registration_whitelist ${OP_REGISTRATION_WHITELIST})
  endif()
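  # For illustration (hypothetical root op): selecting just aten::add would
  # produce flags along the lines of
  #   --force_schema_registration --op_registration_whitelist aten::add <its transitive deps>
  # where the extra ops come from the dependency-graph expansion above.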
  set(GEN_COMMAND
      "${PYTHON_EXECUTABLE}" -m tools.codegen.gen
      --source-path ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen
      --install_dir ${CMAKE_BINARY_DIR}/aten/src/ATen
      ${GEN_ROCM_FLAG}
      ${CUSTOM_BUILD_FLAGS}
      ${GEN_VULKAN_FLAGS}
  )
  execute_process(
    COMMAND ${GEN_COMMAND}
      --output-dependencies ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt
    RESULT_VARIABLE RETURN_VALUE
    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/..
  )
  if(NOT RETURN_VALUE EQUAL 0)
    message(STATUS ${generated_cpp})
    message(FATAL_ERROR "Failed to get generated_cpp list")
  endif()
  # FIXME: the file/variable names say cpp, but these lists contain both .cpp and .h files
  file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt generated_cpp)
  file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt-cuda cuda_generated_cpp)
  file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt-core core_generated_cpp)

  file(GLOB_RECURSE all_templates "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/templates/*")

  file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/aten/src/ATen)
  file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/aten/src/ATen/core)

  add_custom_command(OUTPUT ${generated_cpp} ${cuda_generated_cpp} ${core_generated_cpp}
    COMMAND ${GEN_COMMAND}
    DEPENDS ${all_python} ${all_templates}
      ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/native_functions.yaml
    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/..
  )
  # Generated headers used from a CUDA (.cu) file are
  # not tracked correctly in CMake. We make libATen.so depend explicitly
  # on building the generated ATen files as a workaround.
  add_custom_target(ATEN_CPU_FILES_GEN_TARGET DEPENDS ${generated_cpp} ${core_generated_cpp})
  add_custom_target(ATEN_CUDA_FILES_GEN_TARGET DEPENDS ${cuda_generated_cpp})
  add_library(ATEN_CPU_FILES_GEN_LIB INTERFACE)
  add_library(ATEN_CUDA_FILES_GEN_LIB INTERFACE)
  add_dependencies(ATEN_CPU_FILES_GEN_LIB ATEN_CPU_FILES_GEN_TARGET)
  add_dependencies(ATEN_CUDA_FILES_GEN_LIB ATEN_CUDA_FILES_GEN_TARGET)
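  # Consumers opt in by linking against these INTERFACE libraries, which, via
  # the add_dependencies above, guarantees codegen runs first. Hypothetical usage:
  #   target_link_libraries(my_aten_consumer PRIVATE ATEN_CPU_FILES_GEN_LIB)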
endif()
function(append_filelist name outputvar)
  set(_rootdir "${${CMAKE_PROJECT_NAME}_SOURCE_DIR}/")
  # configure_file adds its input to the list of CMAKE_RERUN dependencies
  configure_file(
      ${PROJECT_SOURCE_DIR}/tools/build_variables.bzl
      ${PROJECT_BINARY_DIR}/caffe2/build_variables.bzl)
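  # The one-liner below exec()s build_variables.bzl as Python and prints the
  # requested list joined with ';' (CMake's list separator), prefixing each
  # entry with the source root so the resulting paths are absolute.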
  execute_process(
    COMMAND "${PYTHON_EXECUTABLE}" -c
      "exec(open('${PROJECT_SOURCE_DIR}/tools/build_variables.bzl').read());print(';'.join(['${_rootdir}' + x for x in ${name}]))"
    WORKING_DIRECTORY "${_rootdir}"
    RESULT_VARIABLE _retval
    OUTPUT_VARIABLE _tempvar)
  if(NOT _retval EQUAL 0)
    message(FATAL_ERROR "Failed to fetch filelist ${name} from build_variables.bzl")
  endif()
  string(REPLACE "\n" "" _tempvar "${_tempvar}")
  list(APPEND ${outputvar} ${_tempvar})
  set(${outputvar} "${${outputvar}}" PARENT_SCOPE)
endfunction()
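# Example usage (hypothetical list name): append every path from the
# my_core_sources list defined in build_variables.bzl, prefixed with the
# source root, to ATen_CPU_SRCS:
#   append_filelist("my_core_sources" ATen_CPU_SRCS)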
set(NUM_CPU_CAPABILITY_NAMES ${NUM_CPU_CAPABILITY_NAMES} PARENT_SCOPE)
set(CPU_CAPABILITY_FLAGS ${CPU_CAPABILITY_FLAGS} PARENT_SCOPE)