diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh
index c758b60a577..4d0d34db920 100755
--- a/.jenkins/pytorch/build.sh
+++ b/.jenkins/pytorch/build.sh
@@ -23,6 +23,21 @@ gcc --version
 # TODO: Don't run this...
 pip install -r requirements.txt || true
 
+if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
+  export HCC_AMDGPU_TARGET=gfx900
+
+  # TODO: Install pyHIPIFY in the docker image
+  rm -rf pyHIPIFY || true
+  git clone https://github.com/ROCm-Developer-Tools/pyHIPIFY.git
+  chmod a+x pyHIPIFY/*.py
+  sudo cp -p pyHIPIFY/*.py /opt/rocm/bin
+  rm -rf "$(dirname "${BASH_SOURCE[0]}")/../../../pytorch_amd/" || true
+  python "$(dirname "${BASH_SOURCE[0]}")/../../tools/amd_build/build_pytorch_amd.py"
+  HIPCC_VERBOSE=1 VERBOSE=1 WITH_ROCM=1 python setup.py install
+  exit
+fi
+
+# TODO: Don't install this here
 if ! which conda; then
   pip install mkl mkl-devel
 fi
diff --git a/.jenkins/pytorch/enabled-configs.txt b/.jenkins/pytorch/enabled-configs.txt
index 763e95748b7..888e11e27bc 100644
--- a/.jenkins/pytorch/enabled-configs.txt
+++ b/.jenkins/pytorch/enabled-configs.txt
@@ -36,3 +36,4 @@ pytorch-macos-10.13-cuda9.2-cudnn7-py3-build
 pytorch-docker-build-test
 short-perf-test-cpu
 short-perf-test-gpu
+py2-clang3.8-rocmnightly-ubuntu16.04-build
diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
index f5ce8080c60..38de24ebca0 100644
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@@ -203,7 +203,7 @@ if(NOT MSVC)
   set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE)
 endif()
 
-IF(USE_CUDA)
+IF(USE_CUDA AND NOT USE_ROCM)
   IF ($ENV{ATEN_STATIC_CUDA})
     # CuFFT has a complicated static story (especially around CUDA < 9) because it has device callback support
     # we first have to build a fake lib that links with no device callbacks,
@@ -250,6 +250,10 @@ IF(USE_CUDA)
       ${CUDA_LIBRARIES}
       ${CUDA_cusparse_LIBRARY}
       ${CUDA_curand_LIBRARY})
+
+  # Set necessary HIPCC/CXX Flags
+  SET(HIP_HCC_FLAGS "-DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${HIP_HCC_FLAGS}")
+  SET(CMAKE_CXX_FLAGS "-DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${CMAKE_CXX_FLAGS}")
 ENDIF()
 
 if(CUDNN_FOUND)
@@ -271,7 +275,7 @@ ENDIF()
 IF(USE_ROCM)
   ### Link in the ROCm libraries BLAS / RNG.
   FIND_LIBRARY(HIPBLAS_LIBRARY hipblas HINTS ${HIPBLAS_PATH}/lib)
-  FIND_LIBRARY(HIPRNG_LIBRARY hiprng HINTS ${HIPRNG_PATH}/lib)
+  FIND_LIBRARY(HIPRNG_LIBRARY hcrng HINTS ${HIPRNG_PATH}/lib)
 
   list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${HIPBLAS_LIBRARY} ${HIPRNG_LIBRARY})
 ENDIF()
diff --git a/aten/src/THC/THCAtomics.cuh b/aten/src/THC/THCAtomics.cuh
index 9e54c56dc45..e89bf424e30 100644
--- a/aten/src/THC/THCAtomics.cuh
+++ b/aten/src/THC/THCAtomics.cuh
@@ -103,7 +103,7 @@ static inline __device__ void atomicAdd(half *address, half val) {
 
   do {
     assumed = old;
-#if CUDA_VERSION < 9000
+#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)
    half hsum;
    hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
    hsum = THCNumerics<half>::add(hsum, val);
@@ -135,7 +135,7 @@ static inline __device__ void atomicAdd(double *address, double val) {
   // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
   } while (assumed != old);
 }
-#elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000)
+#elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000) || defined(__HIP_PLATFORM_HCC__)
 // This needs to be defined for the host side pass
 static inline __device__ void atomicAdd(double *address, double val) { }
 #endif
diff --git a/aten/src/THC/THCHalf.h b/aten/src/THC/THCHalf.h
index 43ffd76e316..d9b8cba72d2 100644
--- a/aten/src/THC/THCHalf.h
+++ b/aten/src/THC/THCHalf.h
@@ -4,22 +4,16 @@
 #include "THCGeneral.h"
 
 /* We compile with CudaHalfTensor support if we have this: */
-#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16
+#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16 || defined(__HIP_PLATFORM_HCC__)
 #define CUDA_HALF_TENSOR 1
 #endif
 
-/* For HIP, rely on the half instructions as well.*/
-#if defined(__HIP_PLATFORM_HCC__)
-#define CUDA_HALF_TENSOR 1
-#define CUDA_HALF_INSTRUCTIONS 1
-#endif
-
 #ifdef CUDA_HALF_TENSOR
 
 #include <cuda_fp16.h>
 #include <stdint.h>
 
-#if CUDA_VERSION >= 9000
+#if CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__)
 #ifndef __cplusplus
 typedef __half_raw half;
 #endif
diff --git a/aten/src/THC/THCTensorMathPointwise.cuh b/aten/src/THC/THCTensorMathPointwise.cuh
index 67dfaf06a43..26389c3a990 100644
--- a/aten/src/THC/THCTensorMathPointwise.cuh
+++ b/aten/src/THC/THCTensorMathPointwise.cuh
@@ -37,7 +37,7 @@ struct TensorSigmoidOp {
   __device__ __forceinline__ void operator()(half* out, half* in) const {
 #ifdef CUDA_HALF_INSTRUCTIONS
     half one = ScalarConvert<int, half>::to(1);
-    *out = hdiv(one, __hadd(one, hexp(__hneg(*in))));
+    *out = __hdiv(one, __hadd(one, hexp(__hneg(*in))));
 #else
     float fin = __half2float(*in);
     *out = __float2half(1.0f / (1.0f + expf(- fin)));
@@ -47,7 +47,7 @@ struct TensorSigmoidOp {
   __device__ __forceinline__ void operator()(half* v) const {
 #ifdef CUDA_HALF_INSTRUCTIONS
     half one = ScalarConvert<int, half>::to(1);
-    *v = hdiv(one, __hadd(one, hexp(__hneg(*v))));
+    *v = __hdiv(one, __hadd(one, hexp(__hneg(*v))));
 #else
     float fv = __half2float(*v);
     *v = __float2half(1.0f / (1.0f + expf(- fv)));
diff --git a/aten/src/THC/THCTensorMathReduce.cuh b/aten/src/THC/THCTensorMathReduce.cuh
index 1002c4a926c..d3f290de1e3 100644
--- a/aten/src/THC/THCTensorMathReduce.cuh
+++ b/aten/src/THC/THCTensorMathReduce.cuh
@@ -80,7 +80,11 @@ struct ReduceMin {
 template <typename T>
 struct ReduceMax {
   inline __device__ T operator()(T a, T b) const {
+#if defined(__HIP_PLATFORM_HCC__)
+    return (static_cast<int>(THCNumerics<T>::sub(a, b)) > 0 || THCNumerics<T>::isnan(a)) ? a : b;
+#else
     return (THCNumerics<T>::gt(a, b) || THCNumerics<T>::isnan(a)) ? a : b;
+#endif
   }
 };
 
@@ -117,6 +121,7 @@ __global__ void THCTensor_kernel_renorm(T *data,
   buffer[tx] = scalar_cast<AccT>(0);
   AccT norm;
 
+#if !defined(__HIP_DEVICE_COMPILE__)
   if (THCNumerics<AccT>::eq(value, scalar_cast<AccT>(INFINITY))) {
     // get norm of axis
     for (ptrdiff_t i = tx; i < size; i += step) {
@@ -163,6 +168,7 @@
       row[i] = scalar_cast<T>(THCNumerics<AccT>::mul(val, norm));
     }
   }
+#endif
 }
 
 template <typename T>
diff --git a/aten/src/THC/THCTensorRandom.cu b/aten/src/THC/THCTensorRandom.cu
index 432138493dd..d05af2dbf59 100644
--- a/aten/src/THC/THCTensorRandom.cu
+++ b/aten/src/THC/THCTensorRandom.cu
@@ -170,6 +170,7 @@ __global__ void generate_bernoulli_tensor(curandStateMtgp32 *state, int size,
 
 // NOTE: curand_uniform is (0, 1] and we want [a, b)
 GENERATE_KERNEL2(generate_uniform, float, float a, float b, float, curand_uniform, reverse_bounds(x) * (b-a) + a)
+GENERATE_KERNEL2(generate_uniform, float, double a, double b, float, curand_uniform, reverse_bounds(x) * (b-a) + a)
 GENERATE_KERNEL2(generate_uniform, double, double a, double b, double, curand_uniform_double, reverse_bounds(x) * (b-a) + a)
 
 GENERATE_KERNEL2(generate_normal, float, double mean, double stdv, float, curand_normal, (x * stdv) + mean)
diff --git a/aten/src/THC/THCTensorTypeUtils.cuh b/aten/src/THC/THCTensorTypeUtils.cuh
index 3b52a02e763..99266b3b4b9 100644
--- a/aten/src/THC/THCTensorTypeUtils.cuh
+++ b/aten/src/THC/THCTensorTypeUtils.cuh
@@ -91,7 +91,7 @@ struct ScalarNegate {
     return __float2half(-__half2float(v));
 #endif
 #else
-#if CUDA_VERSION < 9000
+#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)
     half out = v;
 #else
     __half_raw out = __half_raw(v);
@@ -116,7 +116,7 @@ struct ScalarInv {
 };
 
 inline bool operator==(half a, half b) {
-#if CUDA_VERSION < 9000
+#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)
   return a.x == b.x;
 #else
   __half_raw araw, braw;
@@ -127,7 +127,7 @@ inline bool operator==(half a, half b) {
 }
 
 inline bool operator!=(half a, half b) {
-#if CUDA_VERSION < 9000
+#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)
   return a.x != b.x;
 #else
   __half_raw araw, braw;
diff --git a/aten/src/THCUNN/FusedRNNKernel.cu b/aten/src/THCUNN/FusedRNNKernel.cu
index 58e22f8cfb0..d8b594ab046 100644
--- a/aten/src/THCUNN/FusedRNNKernel.cu
+++ b/aten/src/THCUNN/FusedRNNKernel.cu
@@ -23,7 +23,7 @@ struct TensorSigmoidOp {
   __device__ __forceinline__ void operator()(half* out, half* in) const {
 #ifdef CUDA_HALF_INSTRUCTIONS
     half one = ScalarConvert<int, half>::to(1);
-    *out = hdiv(one, __hadd(one, hexp(__hneg(*in))));
+    *out = __hdiv(one, __hadd(one, hexp(__hneg(*in))));
 #else
     float fin = ScalarConvert<half, float>::to(*in);
     *out = ScalarConvert<float, half>::to(1.0f / (1.0f + expf(- fin)));
@@ -33,7 +33,7 @@ struct TensorSigmoidOp {
   __device__ __forceinline__ void operator()(half* v) const {
 #ifdef CUDA_HALF_INSTRUCTIONS
     half one = ScalarConvert<int, half>::to(1);
-    *v = hdiv(one, __hadd(one, hexp(__hneg(*v))));
+    *v = __hdiv(one, __hadd(one, hexp(__hneg(*v))));
 #else
     float fv = ScalarConvert<half, float>::to(*v);
     *v = ScalarConvert<float, half>::to(1.0f / (1.0f + expf(- fv)));
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index cc21ed93a19..722e6bf4311 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -42,6 +42,12 @@ if(BUILD_ATEN)
   list(APPEND Caffe2_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS})
   list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS})
   list(APPEND Caffe2_DEPENDENCY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE})
+
+  IF(USE_ROCM)
+    # Set the HIP Variables
+    set(Caffe2_HIP_SRCS ${ATen_CUDA_SRCS})
+    set(Caffe2_HIP_INCLUDES ${Caffe2_HIP_INCLUDES} ${Caffe2_GPU_INCLUDE})
+  ENDIF(USE_ROCM)
 endif()
 
 # ---[ Caffe2 build
@@ -205,7 +211,7 @@ caffe2_interface_library(caffe2 caffe2_library)
 list(APPEND Caffe2_MAIN_LIBS caffe2_library)
 
 # ---[ CUDA library.
-if(USE_CUDA OR (USE_ROCM AND NOT BUILD_CAFFE2))
+if(USE_CUDA)
   # A hack to deal with cuda library dependencies and modern CMake: the
   # CUDA_ADD_LIBRARY includes a target_link_libraries, and as a result,
   # one cannot use PUBLIC/PRIVATE/INTERFACE for the target anymore. This
@@ -243,31 +249,46 @@ if(USE_CUDA OR (USE_ROCM AND NOT BUILD_CAFFE2))
 endif()
 
 # ---[ Caffe2 HIP sources.
-if(BUILD_CAFFE2)
-  if(USE_ROCM)
-    HIP_ADD_LIBRARY(caffe2_hip ${Caffe2_HIP_SRCS})
-    set_target_properties(caffe2_hip PROPERTIES COMPILE_FLAGS ${Caffe2_HIP_CXX_FLAGS})
+if(USE_ROCM)
+  HIP_INCLUDE_DIRECTORIES(${Caffe2_HIP_INCLUDES})
 
-    target_include_directories(
-        caffe2_hip PUBLIC ${Caffe2_HIP_INCLUDES})
-    target_include_directories(
-        caffe2_hip INTERFACE $<INSTALL_INTERFACE:include>)
+  IF(BUILD_ATEN)
+    # Set necessary HIPCC Flags
+    SET(HIP_HCC_FLAGS "-fPIC -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${HIP_HCC_FLAGS}")
+    SET(Caffe2_HIP_CXX_FLAGS "-fPIC -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${Caffe2_HIP_CXX_FLAGS}")
+  ENDIF()
 
-    target_link_libraries(caffe2_hip PUBLIC caffe2)
-    target_link_libraries(caffe2_hip PUBLIC ${Caffe2_HIP_DEPENDENCY_LIBS})
+  # Since HIP_ADD_LIBRARY is a MACRO, we need to set HIP_HCC_FLAGS prior to calling it.
+  # Also, since HIP_INCLUDE_DIRECTORIES is a MACRO, we must call it before HIP_ADD_LIBRARY.
+  HIP_ADD_LIBRARY(caffe2_hip ${Caffe2_HIP_SRCS})
 
-    set_target_properties(caffe2_hip PROPERTIES LINKER_LANGUAGE HIP)
+  set_target_properties(caffe2_hip PROPERTIES COMPILE_FLAGS ${Caffe2_HIP_CXX_FLAGS})
 
-    caffe2_interface_library(caffe2_hip caffe2_hip_library)
-    list(APPEND Caffe2_MAIN_LIBS caffe2_hip_library)
-    install(TARGETS caffe2_hip EXPORT Caffe2Targets DESTINATION lib)
-  endif()
+  target_include_directories(
+      caffe2_hip PRIVATE ${Caffe2_HIP_INCLUDES})
+  target_include_directories(
+      caffe2_hip INTERFACE $<INSTALL_INTERFACE:include>)
+
+  IF(BUILD_ATEN)
+    aten_set_target_props(caffe2_hip)
+  ENDIF()
+
+  target_link_libraries(caffe2_hip PUBLIC caffe2)
+  target_link_libraries(caffe2_hip PUBLIC ${Caffe2_HIP_DEPENDENCY_LIBS})
+
+  # https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_faq.md#what-if-hip-generates-error-of-symbol-multiply-defined-only-on-amd-machine
+  # To avoid having to do above, keep this commented.
+  set_target_properties(caffe2_hip PROPERTIES LINKER_LANGUAGE HIP)
+
+  caffe2_interface_library(caffe2_hip caffe2_hip_library)
+  list(APPEND Caffe2_MAIN_LIBS caffe2_hip_library)
+  install(TARGETS caffe2_hip EXPORT Caffe2Targets DESTINATION lib)
 endif()
 
 # ---[ Check if warnings should be errors.
 if ($ENV{WERROR})
   target_compile_options(caffe2 PRIVATE -Werror)
-  if(USE_CUDA OR USE_ROCM)
+  if(USE_CUDA)
     target_compile_options(caffe2_gpu PRIVATE -Werror)
   endif()
 endif()
@@ -315,7 +336,8 @@ if(BUILD_CAFFE2)
   # Aten tests should only run when Caffe2 is not built
   set(__aten_test_dir "test/aten")
 endif()
-if(BUILD_ATEN)
+# Todo - Set up ATen tests for ROCm in an upcoming PR
+if(BUILD_ATEN AND NOT USE_ROCM)
   foreach(test_src ${ATen_CPU_TEST_SRCS})
     get_filename_component(test_name ${test_src} NAME_WE)
     add_executable(${test_name} "${test_src}")
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index f722841db07..3846ff30904 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -442,8 +442,8 @@ if(USE_CUDA)
 endif()
 
 # ---[ HIP
-if(BUILD_CAFFE2)
-  include(cmake/public/LoadHIP.cmake)
+if(BUILD_CAFFE2 OR BUILD_ATEN)
+  include(${CMAKE_CURRENT_LIST_DIR}/public/LoadHIP.cmake)
   if(PYTORCH_FOUND_HIP)
     message(INFO "Compiling with HIP for AMD.")
     caffe2_update_option(USE_ROCM ON)
@@ -452,8 +452,11 @@ if(BUILD_CAFFE2)
     set(Caffe2_HIP_INCLUDES
       ${hip_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${Caffe2_HIP_INCLUDES} ${thrust_INCLUDE_DIRS})
     set(Caffe2_HIP_DEPENDENCY_LIBS
-      ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES})
-
+      ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipblas_LIBRARIES})
+    # Additional libraries required by PyTorch AMD that aren't used by Caffe2 (not in Caffe2's docker image)
+    if(BUILD_ATEN)
+      set(Caffe2_HIP_DEPENDENCY_LIBS ${Caffe2_HIP_DEPENDENCY_LIBS} ${hipsparse_LIBRARIES} ${hiprng_LIBRARIES})
+    endif()
     # TODO: There is a bug in rocblas's cmake files that exports the wrong targets name in ${rocblas_LIBRARIES}
     list(APPEND Caffe2_HIP_DEPENDENCY_LIBS
       roc::rocblas)
@@ -475,7 +478,7 @@ if(USE_ROCM AND NOT BUILD_CAFFE2)
   EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS)
 
   # Link with HIPCC https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_porting_guide.md#linking-with-hipcc
-  SET(CMAKE_CXX_LINK_EXECUTABLE ${HIP_HIPCC_EXECUTABLE})
+  # SET(CMAKE_CXX_LINK_EXECUTABLE ${HIP_HIPCC_EXECUTABLE})
 
   # Show message that we're using ROCm.
   MESSAGE(STATUS "ROCM TRUE:")
diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
index 7d813cc22de..c6b98143ddc 100644
--- a/cmake/public/LoadHIP.cmake
+++ b/cmake/public/LoadHIP.cmake
@@ -103,11 +103,15 @@ IF(HIP_FOUND)
   set(hiprand_DIR ${HIPRAND_PATH}/lib/cmake/hiprand)
   set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas)
   set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen)
+  set(hipblas_DIR ${HIPBLAS_PATH}/lib/cmake/hipblas)
+  set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse)
 
   find_package(rocrand REQUIRED)
   find_package(hiprand REQUIRED)
   find_package(rocblas REQUIRED)
   find_package(miopen REQUIRED)
+  #find_package(hipblas REQUIRED) There's a bug with the CMake file in the Hipblas package.
+  #find_package(hipsparse REQUIRED)
 
   # TODO: hip_hcc has an interface include flag "-hc" which is only
   # recognizable by hcc, but not gcc and clang.  Right now in our
@@ -117,6 +121,10 @@ IF(HIP_FOUND)
   # TODO: miopen_LIBRARIES should return fullpath to the library file,
   # however currently it's just the lib name
   FIND_LIBRARY(PYTORCH_MIOPEN_LIBRARIES ${miopen_LIBRARIES} HINTS ${MIOPEN_PATH}/lib)
+  FIND_LIBRARY(hiprand_LIBRARIES hiprand HINTS ${HIPRAND_PATH}/lib)
+  FIND_LIBRARY(hiprng_LIBRARIES hcrng HINTS ${HIPRNG_PATH}/lib)
+  FIND_LIBRARY(hipblas_LIBRARIES hipblas HINTS ${HIPBLAS_PATH}/lib)
+  FIND_LIBRARY(hipsparse_LIBRARIES hipsparse HINTS ${HIPSPARSE_PATH}/lib)
 
   set(thrust_INCLUDE_DIRS ${THRUST_PATH} ${THRUST_PATH}/thrust/system/cuda/detail/cub-hip)
diff --git a/docker/caffe2/jenkins/build.sh b/docker/caffe2/jenkins/build.sh
index 7531136c1b9..357a79549da 100755
--- a/docker/caffe2/jenkins/build.sh
+++ b/docker/caffe2/jenkins/build.sh
@@ -34,8 +34,10 @@ if [[ "$image" == *cuda* ]]; then
   DOCKERFILE="${OS}-cuda/Dockerfile"
 fi
 
+# TODO: the version number here actually doesn't do anything at the
+# moment
 if [[ "$image" == *rocm* ]]; then
-  ROCM_VERSION="$(echo "${image}" | perl -n -e'/rocm(\d+\.\d+.\d+)/ && print $1')"
+  ROCM_VERSION="$(echo "${image}" | perl -n -e'/rocm(\d+\.\d+\.\d+|nightly)/ && print $1')"
   DOCKERFILE="${OS}-rocm/Dockerfile"
 fi
diff --git a/docker/caffe2/jenkins/common/install_rocm.sh b/docker/caffe2/jenkins/common/install_rocm.sh
index 5376e2701a7..0dbc382a980 100644
--- a/docker/caffe2/jenkins/common/install_rocm.sh
+++ b/docker/caffe2/jenkins/common/install_rocm.sh
@@ -2,8 +2,25 @@
 
 set -ex
 
+# TODO: This script should install a SPECIFIC ROCM_VERSION, but actually
+# it ignores all values of ROCM_VERSION which are not nightly. Ugh!
 [ -n "$ROCM_VERSION" ]
 
+install_hip_nightly() {
+  git clone https://github.com/ROCm-Developer-Tools/HIP.git
+  pushd HIP
+  export HIP_PLATFORM=hcc
+  yes | ./install.sh --install
+  popd
+  rm -rf HIP
+
+  git clone https://github.com/ROCmSoftwarePlatform/hipBLAS.git
+  pushd hipBLAS
+  yes | ./install.sh --install
+  popd
+  rm -rf hipBLAS
+}
+
 install_ubuntu() {
   apt-get update
   apt-get install -y wget
@@ -36,11 +53,28 @@ install_centos() {
 
 install_hip_thrust() {
   # Needed for now, will be replaced soon
   git clone --recursive https://github.com/ROCmSoftwarePlatform/Thrust.git /data/Thrust
+  pushd /data/Thrust
+  curl https://patch-diff.githubusercontent.com/raw/ROCmSoftwarePlatform/Thrust/pull/12.patch -o 12.patch
+  patch -p1 < 12.patch
+  rm 12.patch
+  popd
   rm -rf /data/Thrust/thrust/system/cuda/detail/cub-hip
   git clone --recursive https://github.com/ROCmSoftwarePlatform/cub-hip.git /data/Thrust/thrust/system/cuda/detail/cub-hip
   cd /data/Thrust/thrust/system/cuda/detail/cub-hip && git checkout hip_port_1.7.4_caffe2 && cd -
 }
 
+install_hcrng() {
+  mkdir -p /opt/rocm/debians
+  curl https://s3.amazonaws.com/ossci-linux/hcrng-master-a8c6a0b-Linux.deb -o /opt/rocm/debians/hcrng.deb
+  dpkg -i /opt/rocm/debians/hcrng.deb
+}
+
+install_hcsparse() {
+  mkdir -p /opt/rocm/debians
+  curl https://s3.amazonaws.com/ossci-linux/hcsparse-master-907a505-Linux.deb -o /opt/rocm/debians/hcsparse.deb
+  dpkg -i /opt/rocm/debians/hcsparse.deb
+}
+
 # Install Python packages depending on the base OS
 if [ -f /etc/lsb-release ]; then
@@ -52,4 +86,12 @@ else
   exit 1
 fi
 
+# NB: We first install the "wrong" version, but then use those dev tools
+# to install the newer version of HIP.
+if [ "$ROCM_VERSION" = "nightly" ]; then + install_hip_nightly +fi + install_hip_thrust +install_hcrng +install_hcsparse diff --git a/setup.py b/setup.py index 00168cd656b..92356404723 100644 --- a/setup.py +++ b/setup.py @@ -104,6 +104,7 @@ import importlib from tools.setup_helpers.env import check_env_flag from tools.setup_helpers.cuda import WITH_CUDA, CUDA_HOME, CUDA_VERSION +from tools.setup_helpers.rocm import WITH_ROCM, ROCM_HOME, ROCM_VERSION from tools.setup_helpers.cudnn import (WITH_CUDNN, CUDNN_LIBRARY, CUDNN_LIB_DIR, CUDNN_INCLUDE_DIR) from tools.setup_helpers.nccl import WITH_NCCL, WITH_SYSTEM_NCCL, NCCL_LIB_DIR, \ @@ -126,7 +127,6 @@ DEBUG = check_env_flag('DEBUG') IS_WINDOWS = (platform.system() == 'Windows') IS_DARWIN = (platform.system() == 'Darwin') IS_LINUX = (platform.system() == 'Linux') -WITH_ROCM = check_env_flag('WITH_ROCM') FULL_CAFFE2 = check_env_flag('FULL_CAFFE2') BUILD_PYTORCH = check_env_flag('BUILD_PYTORCH') @@ -631,8 +631,10 @@ library_dirs.append(lib_path) # we specify exact lib names to avoid conflict with lua-torch installs CAFFE2_LIBS = [os.path.join(lib_path, 'libcaffe2.so')] -if WITH_CUDA or WITH_ROCM: +if WITH_CUDA: CAFFE2_LIBS.extend(['-Wl,--no-as-needed', os.path.join(lib_path, 'libcaffe2_gpu.so'), '-Wl,--as-needed']) +if WITH_ROCM: + CAFFE2_LIBS.extend(['-Wl,--no-as-needed', os.path.join(lib_path, 'libcaffe2_hip.so'), '-Wl,--as-needed']) THD_LIB = os.path.join(lib_path, 'libTHD.a') NCCL_LIB = os.path.join(lib_path, 'libnccl.so.1') C10D_LIB = os.path.join(lib_path, 'libc10d.a') @@ -643,14 +645,18 @@ NANOPB_STATIC_LIB = os.path.join(lib_path, 'libprotobuf-nanopb.a') if IS_DARWIN: CAFFE2_LIBS = [os.path.join(lib_path, 'libcaffe2.dylib')] - if WITH_CUDA or WITH_ROCM: + if WITH_CUDA: CAFFE2_LIBS.append(os.path.join(lib_path, 'libcaffe2_gpu.dylib')) + if WITH_ROCM: + CAFFE2_LIBS.append(os.path.join(lib_path, 'libcaffe2_hip.dylib')) NCCL_LIB = os.path.join(lib_path, 'libnccl.1.dylib') if IS_WINDOWS: CAFFE2_LIBS = [os.path.join(lib_path, 'caffe2.lib')] - if WITH_CUDA or WITH_ROCM: + if WITH_CUDA: CAFFE2_LIBS.append(os.path.join(lib_path, 'caffe2_gpu.lib')) + if WITH_ROCM: + CAFFE2_LIBS.append(os.path.join(lib_path, 'caffe2_hip.lib')) if DEBUG: NANOPB_STATIC_LIB = os.path.join(lib_path, 'protobuf-nanopbd.lib') else: diff --git a/tools/amd_build/build_pytorch_amd.py b/tools/amd_build/build_pytorch_amd.py index 315220ec4c7..a6ca1ae1b06 100644 --- a/tools/amd_build/build_pytorch_amd.py +++ b/tools/amd_build/build_pytorch_amd.py @@ -2,12 +2,12 @@ import shutil import subprocess import os +import sys from shutil import copytree, ignore_patterns from functools import reduce amd_build_dir = os.path.dirname(os.path.realpath(__file__)) proj_dir = os.path.dirname(os.path.dirname(amd_build_dir)) -out_dir = os.path.join(os.path.dirname(proj_dir), "pytorch_amd") include_dirs = [ "aten", "torch" @@ -16,16 +16,13 @@ include_dirs = [ # List of operators currently disabled yaml_file = os.path.join(amd_build_dir, "disabled_features.yaml") -# Create the pytorch_amd directory -shutil.copytree(proj_dir, out_dir) - # Apply patch files. patch_folder = os.path.join(amd_build_dir, "patches") for filename in os.listdir(os.path.join(amd_build_dir, "patches")): - subprocess.Popen(["git", "apply", os.path.join(patch_folder, filename)], cwd=out_dir) + subprocess.Popen(["git", "apply", os.path.join(patch_folder, filename)], cwd=proj_dir) # HIPCC Compiler doesn't provide host defines - Automatically include them. 
-for root, _, files in os.walk(os.path.join(out_dir, "aten/src/ATen")):
+for root, _, files in os.walk(os.path.join(proj_dir, "aten/src/ATen")):
     for filename in files:
         if filename.endswith(".cu") or filename.endswith(".cuh"):
             filepath = os.path.join(root, filename)
@@ -45,7 +42,7 @@ for root, _, files in os.walk(os.path.join(out_dir, "aten/src/ATen")):
 # Make various replacements inside AMD_BUILD/torch directory
 ignore_files = ["csrc/autograd/profiler.h", "csrc/autograd/profiler.cpp",
                 "csrc/cuda/cuda_check.h", "csrc/jit/fusion_compiler.cpp"]
-for root, _directories, files in os.walk(os.path.join(out_dir, "torch")):
+for root, _directories, files in os.walk(os.path.join(proj_dir, "torch")):
     for filename in files:
         if filename.endswith(".cpp") or filename.endswith(".h"):
             source = os.path.join(root, filename)
@@ -53,7 +50,7 @@ for root, _directories, files in os.walk(os.path.join(out_dir, "torch")):
             if reduce(lambda result, exclude: source.endswith(exclude) or result, ignore_files, False):
                 continue
             # Update contents.
-            with open(source, "r+", encoding="utf-8") as f:
+            with open(source, "r+") as f:
                 contents = f.read()
                 contents = contents.replace("WITH_CUDA", "WITH_ROCM")
                 contents = contents.replace("CUDA_VERSION", "0")
@@ -64,9 +61,7 @@ for root, _directories, files in os.walk(os.path.join(out_dir, "torch")):
                 os.fsync(f)
 
 # Execute the Hipify Script.
-subprocess.Popen(
-    ["/opt/rocm/bin/hipify-python.py",
-     "--project-directory", proj_dir,
-     "--output-directory", out_dir,
-     "--include-dirs"] + include_dirs +
-    ["--yaml-settings", yaml_file, "--add-static-casts", "True"])
+args = ["--project-directory", proj_dir,
+        "--output-directory", proj_dir,
+        "--include-dirs"] + include_dirs + ["--yaml-settings", yaml_file, "--add-static-casts", "True"]
+os.execv("/opt/rocm/bin/hipify-python.py", ['python'] + args)
diff --git a/tools/amd_build/disabled_features.yaml b/tools/amd_build/disabled_features.yaml
index 60fd665e08d..ca82ab926c6 100644
--- a/tools/amd_build/disabled_features.yaml
+++ b/tools/amd_build/disabled_features.yaml
@@ -1,5 +1,5 @@
 {
-    "disabled_hip_function_calls":
+    "disable_unsupported_hip_calls":
     [
         {
             "path": "aten/src/THC/generic/THCTensorSort.cu",
@@ -72,11 +72,16 @@
                 "#include ": ""
             }
         },
+        {
+            "path": "aten/src/ATen/native/cuda/Distributions.cu",
+            "s_constants": {
+                "#include <nvfunctional>": ""
+            }
+        },
         {
             "path": "aten/src/THC/THCNumerics.cuh",
             "s_constants": {
                 "#ifdef __CUDA_ARCH__": "#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__)",
-                "labs": "fabs",
                 "#if CUDA_VERSION < 9000": "#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)"
             }
         },
@@ -129,10 +134,16 @@
     "disabled_modules": [
         "aten/src/ATen/native/cuda/CuFFTUtils.h",
         "aten/src/ATen/native/cuda/SpectralOps.cu",
-        "aten/src/ATen/native/cuda/Distributions.cu",
-        "aten/src/THCUNN/RReLU.cu"
+        "aten/src/THCUNN/RReLU.cu",
+        "aten/src/ATen/native/cuda/Distributions.cu"
     ],
     "disabled_functions": [
+        {
+            "path": "aten/src/ATen/cuda/CUDAApplyUtils.cuh",
+            "functions": [
+                "kernelPointwiseApply4"
+            ]
+        },
         {
             "path": "aten/src/ATen/cuda/detail/IndexUtils.cu",
             "non_device_functions": [
@@ -148,7 +159,9 @@
         {
             "path": "aten/src/ATen/native/cuda/Distributions.cu",
            "functions": [
-                "_s_poisson_cuda"
+                "_s_poisson_cuda",
+                "poisson_cuda_kernel",
+                "gamma_cuda_kernel"
             ]
         },
         {
diff --git a/tools/amd_build/patches/a_aten_src_THCUNN_THCHalfAutoNumerics.cuh.patch b/tools/amd_build/patches/a_aten_src_THCUNN_THCHalfAutoNumerics.cuh.patch
deleted file mode 100644
index d1bcb76bb58..00000000000
--- a/tools/amd_build/patches/a_aten_src_THCUNN_THCHalfAutoNumerics.cuh.patch
+++ /dev/null
@@ -1,23 +0,0 @@
-diff --git a/aten/src/THCUNN/THCHalfAutoNumerics.cuh b/aten/src/THCUNN/THCHalfAutoNumerics.cuh
-index 2653fed0b..c4e9089e0 100644
---- a/aten/src/THCUNN/THCHalfAutoNumerics.cuh
-+++ b/aten/src/THCUNN/THCHalfAutoNumerics.cuh
-@@ -19,13 +19,17 @@ inline __host__ __device__ float fmaxType(float x, half y) {
- }
- #endif
- 
-+/* In ROCm we have a conversion from half to __fp16, and then there's a
-+conversion operator from __fp16 to double (w/ the standard conversion
-+double to float), so comment out these two lines to prevent ambiguous calls
-+for fmaxType when half is passed in.
- inline __host__ __device__ float fmaxType(float x, float y) {
-   return fmaxf(x, y);
- }
- 
- inline __host__ __device__ double fmaxType(double x, double y) {
-   return fmax(x, y);
--}
-+}*/
- 
- #ifdef CUDA_HALF_TENSOR
- 
diff --git a/tools/amd_build/patches/a_aten_src_THC_THCTensorMathReduce.cuh.patch b/tools/amd_build/patches/a_aten_src_THC_THCTensorMathReduce.cuh.patch
deleted file mode 100644
index 72e4160ea4a..00000000000
--- a/tools/amd_build/patches/a_aten_src_THC_THCTensorMathReduce.cuh.patch
+++ /dev/null
@@ -1,39 +0,0 @@
-diff --git a/aten/src/THC/THCTensorMathReduce.cuh b/aten/src/THC/THCTensorMathReduce.cuh
-index ca6bf7cbe..a523648f1 100644
---- a/aten/src/THC/THCTensorMathReduce.cuh
-+++ b/aten/src/THC/THCTensorMathReduce.cuh
-@@ -105,7 +105,7 @@ struct SquareFunctor {
- template <typename T>
- struct ReduceMin {
-   inline __device__ T operator()(T a, T b) const {
--    return (THCNumerics<T>::lt(a, b) ||
-+    return ((int)THCNumerics<T>::sub(a, b) < 0 ||
-            THCNumerics<T>::isnan(a)) ? a : b;
-   }
- };
-@@ -113,7 +113,7 @@ struct ReduceMin {
- template <typename T>
- struct ReduceMax {
-   inline __device__ T operator()(T a, T b) const {
--    return (THCNumerics<T>::gt(a, b) ||
-+    return ((int)THCNumerics<T>::sub(a, b) > 0 ||
-            THCNumerics<T>::isnan(a)) ? a : b;
-   }
- };
-@@ -167,7 +167,7 @@ __global__ void THCTensor_kernel_renorm(Real *data, const Real value, const ptrd
- 
-   buffer[tx] = ScalarConvert<int, Real>::to(0);
-   Real norm;
--
-+  #if !defined(__HIP_DEVICE_COMPILE__)
-   if (THCNumerics<Real>::eq(value, ScalarConvert<float, Real>::to(INFINITY))) {
-     // get norm of axis
-     for (ptrdiff_t i=tx; i<size; i+=step) {
-       row[i] = THCNumerics<Real>::mul(row[i], norm);
-     }
-   }
-+  #endif
- }
- 
- template
diff --git a/tools/amd_build/patches/a_aten_src_THC_generic_THCTensorRandom.cu.patch b/tools/amd_build/patches/a_aten_src_THC_generic_THCTensorRandom.cu.patch
index 8fbc7ad7e90..355c1e3a7c3 100644
--- a/tools/amd_build/patches/a_aten_src_THC_generic_THCTensorRandom.cu.patch
+++ b/tools/amd_build/patches/a_aten_src_THC_generic_THCTensorRandom.cu.patch
@@ -1,5 +1,5 @@
 diff --git a/aten/src/THC/generic/THCTensorRandom.cu b/aten/src/THC/generic/THCTensorRandom.cu
-index 906780b4f..c99f156ab 100644
+index 906780b4f..b03e051cb 100644
 --- a/aten/src/THC/generic/THCTensorRandom.cu
 +++ b/aten/src/THC/generic/THCTensorRandom.cu
 @@ -504,11 +504,11 @@ THC_API void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_
@@ -7,7 +7,7 @@ index 906780b4f..c99f156ab 100644
    if (range > 1ULL << 32) {
      generate_random_64<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
 -        gen->state.gen_states, size, data, min_val, range);
-+        gen->state.gen_states, static_cast<int>(size), data, static_cast<typename std::conditional<std::is_same<real, float>::value, float, double>::type>(min_val), static_cast<typename std::conditional<std::is_same<real, float>::value, float, double>::type>(range));
++        gen->state.gen_states, static_cast<int>(size), data, min_val, range);
    } else {
  #endif
      generate_random<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
@@ -17,7 +17,7 @@ index 906780b4f..c99f156ab 100644
  }
  #endif
 @@ -534,19 +534,19 @@ THC_API void THCTensor_(random)(THCState* state, THCTensor *self_)
- 
+ 
  #if defined(THC_REAL_IS_HALF)
      generate_random<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
 -        gen->state.gen_states, size, data, 0UL, (1UL << HLF_MANT_DIG) + 1);
 +        gen->state.gen_states, static_cast<int>(size), data, static_cast<uint64_t>(0UL), static_cast<uint64_t>((1UL << HLF_MANT_DIG) + 1));
 -        gen->state.gen_states, size, data, 0UL, static_cast<uint64_t>(std::numeric_limits<real>::max()) + 1);
 +        gen->state.gen_states, static_cast<int>(size), data, static_cast<uint64_t>(0UL), static_cast<uint64_t>(std::numeric_limits<real>::max()) + 1);
  #endif
  
  THCTensor_(freeCopyTo)(state, self, self_);
diff --git a/tools/setup_helpers/rocm.py b/tools/setup_helpers/rocm.py
new file mode 100644
index 00000000000..b981dd82dc5
--- /dev/null
+++ b/tools/setup_helpers/rocm.py
@@ -0,0 +1,5 @@
+from .env import check_env_flag
+# Check if ROCM is enabled
+WITH_ROCM = check_env_flag('WITH_ROCM')
+ROCM_HOME = "/opt/rocm"
+ROCM_VERSION = ""
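
With tools/setup_helpers/rocm.py in place, a ROCm build is driven entirely by environment variables, mirroring the path that .jenkins/pytorch/build.sh takes above. A minimal sketch of the equivalent local invocation, assuming a stock /opt/rocm install with the pyHIPIFY scripts already copied into /opt/rocm/bin (as the CI script does):

    # Patch and hipify the CUDA sources in place, then build PyTorch against ROCm.
    python tools/amd_build/build_pytorch_amd.py
    HIPCC_VERBOSE=1 VERBOSE=1 WITH_ROCM=1 python setup.py install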