Get ROCm building again on master (#8343)

Summary of changes:

- New Jenkins script for building on ROCm. For now it is a bit hacked together, but we can improve it once CI is running
- New ROCm Docker image for nightly HIP, plus some legacy packages that we need temporarily
- New enabled config py2-clang3.8-rocmnightly-ubuntu16.04-build, based on the existing Caffe2 image (not built yet)
- A big pile of CMake fixes, mostly to turn bits on/off when a ROCm build is involved
- Switch from hiprng to hcrng
- Apply some patches directly to the code, eliminating the patch files
- Use __hdiv instead of hdiv; it's more portable
- THCNumerics<T>::gt doesn't work in HIP, so simulate it with sub (see the sketch after this list)
- Add a few more overloads that HIP needs
- Turn off use of hcc to link (we plan to turn this back on to get tests running)
- Search for hiprand, hiprng, hipblas, hipsparse
- Better Python 2 portability
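
A note on the THCNumerics workaround mentioned above: since gt() was unusable under HIP, the diff replaces the comparison with a subtraction and a sign test. A minimal C++ sketch of the idea, where `Numerics` is a hypothetical stand-in for THCNumerics:

```cpp
// Minimal sketch of the workaround: on HIP, gt() misbehaved for some types,
// so the build tests the sign of sub(a, b) instead. Valid only when the
// difference a - b cannot overflow.
#include <cassert>

template <typename T>
struct Numerics {
  static T sub(T a, T b) { return a - b; }
};

template <typename T>
T reduce_max(T a, T b) {
  // Equivalent to (a > b) ? a : b for values whose difference fits in int.
  return (static_cast<int>(Numerics<T>::sub(a, b)) > 0) ? a : b;
}

int main() {
  assert(reduce_max(3, 7) == 7);
  assert(reduce_max(-2, -5) == -2);
  return 0;
}
```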
Jorghi12 2018-06-12 23:05:21 -04:00 committed by Edward Z. Yang
parent 49d6c5f99f
commit 81b92f7515
22 changed files with 187 additions and 132 deletions

View file

@ -23,6 +23,21 @@ gcc --version
# TODO: Don't run this...
pip install -r requirements.txt || true
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
export HCC_AMDGPU_TARGET=gfx900
# TODO: Install pyHIPIFY in the docker image
rm -rf pyHIPIFY || true
git clone https://github.com/ROCm-Developer-Tools/pyHIPIFY.git
chmod a+x pyHIPIFY/*.py
sudo cp -p pyHIPIFY/*.py /opt/rocm/bin
rm -rf "$(dirname "${BASH_SOURCE[0]}")/../../../pytorch_amd/" || true
python "$(dirname "${BASH_SOURCE[0]}")/../../tools/amd_build/build_pytorch_amd.py"
HIPCC_VERBOSE=1 VERBOSE=1 WITH_ROCM=1 python setup.py install
exit
fi
# TODO: Don't install this here
if ! which conda; then
pip install mkl mkl-devel
fi

View file

@ -36,3 +36,4 @@ pytorch-macos-10.13-cuda9.2-cudnn7-py3-build
pytorch-docker-build-test
short-perf-test-cpu
short-perf-test-gpu
py2-clang3.8-rocmnightly-ubuntu16.04-build

View file

@ -203,7 +203,7 @@ if(NOT MSVC)
set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE)
endif()
IF(USE_CUDA)
IF(USE_CUDA AND NOT USE_ROCM)
IF ($ENV{ATEN_STATIC_CUDA})
# CuFFT has a complicated static story (especially around CUDA < 9) because it has device callback support
# we first have to build a fake lib that links with no device callbacks,
@ -250,6 +250,10 @@ IF(USE_CUDA)
${CUDA_LIBRARIES}
${CUDA_cusparse_LIBRARY}
${CUDA_curand_LIBRARY})
# Set necessary HIPCC/CXX Flags
SET(HIP_HCC_FLAGS "-DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${HIP_HCC_FLAGS}")
SET(CMAKE_CXX_FLAGS "-DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${CMAKE_CXX_FLAGS}")
ENDIF()
if(CUDNN_FOUND)
@ -271,7 +275,7 @@ ENDIF()
IF(USE_ROCM)
### Link in the ROCm libraries BLAS / RNG.
FIND_LIBRARY(HIPBLAS_LIBRARY hipblas HINTS ${HIPBLAS_PATH}/lib)
FIND_LIBRARY(HIPRNG_LIBRARY hiprng HINTS ${HIPRNG_PATH}/lib)
FIND_LIBRARY(HIPRNG_LIBRARY hcrng HINTS ${HIPRNG_PATH}/lib)
list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${HIPBLAS_LIBRARY} ${HIPRNG_LIBRARY})
ENDIF()

View file

@ -103,7 +103,7 @@ static inline __device__ void atomicAdd(half *address, half val) {
do {
assumed = old;
#if CUDA_VERSION < 9000
#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)
half hsum;
hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
hsum = THCNumerics<half>::add(hsum, val);
@ -135,7 +135,7 @@ static inline __device__ void atomicAdd(double *address, double val) {
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
}
#elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000)
#elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000) || defined(__HIP_PLATFORM_HCC__)
// This needs to be defined for the host side pass
static inline __device__ void atomicAdd(double *address, double val) { }
#endif
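
For readers unfamiliar with the pattern: CUDA/HIP have no native 16-bit atomics, so atomicAdd(half*) CASes the enclosing 32-bit word until the update sticks. A host-side C++ analogue of that loop, with uint16_t addition standing in for the half arithmetic:

```cpp
// Splice an updated 16-bit value into its enclosing 32-bit word via CAS,
// retrying until no other thread has raced the update.
#include <atomic>
#include <cstdint>
#include <cstdio>

void atomic_add_u16(std::atomic<uint32_t>* word, bool high_half, uint16_t val) {
  uint32_t assumed = word->load();
  uint32_t next;
  do {
    uint16_t cur = high_half ? uint16_t(assumed >> 16) : uint16_t(assumed & 0xffffu);
    uint16_t sum = uint16_t(cur + val);
    next = high_half ? ((assumed & 0x0000ffffu) | (uint32_t(sum) << 16))
                     : ((assumed & 0xffff0000u) | sum);
    // On failure, compare_exchange_weak reloads `assumed`; just retry.
  } while (!word->compare_exchange_weak(assumed, next));
}

int main() {
  std::atomic<uint32_t> w{0};
  atomic_add_u16(&w, /*high_half=*/false, 5);
  atomic_add_u16(&w, /*high_half=*/true, 7);
  std::printf("0x%08x\n", w.load());  // prints 0x00070005
  return 0;
}
```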

View file

@ -4,22 +4,16 @@
#include "THCGeneral.h"
/* We compile with CudaHalfTensor support if we have this: */
#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16
#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16 || defined(__HIP_PLATFORM_HCC__)
#define CUDA_HALF_TENSOR 1
#endif
/* For HIP, rely on the half instructions as well.*/
#if defined(__HIP_PLATFORM_HCC__)
#define CUDA_HALF_TENSOR 1
#define CUDA_HALF_INSTRUCTIONS 1
#endif
#ifdef CUDA_HALF_TENSOR
#include <cuda_fp16.h>
#include <stdint.h>
#if CUDA_VERSION >= 9000
#if CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__)
#ifndef __cplusplus
typedef __half_raw half;
#endif

View file

@ -37,7 +37,7 @@ struct TensorSigmoidOp<half> {
__device__ __forceinline__ void operator()(half* out, half* in) const {
#ifdef CUDA_HALF_INSTRUCTIONS
half one = ScalarConvert<int, half>::to(1);
*out = hdiv(one, __hadd(one, hexp(__hneg(*in))));
*out = __hdiv(one, __hadd(one, hexp(__hneg(*in))));
#else
float fin = __half2float(*in);
*out = __float2half(1.0f / (1.0f + expf(- fin)));
@ -47,7 +47,7 @@ struct TensorSigmoidOp<half> {
__device__ __forceinline__ void operator()(half* v) const {
#ifdef CUDA_HALF_INSTRUCTIONS
half one = ScalarConvert<int, half>::to(1);
*v = hdiv(one, __hadd(one, hexp(__hneg(*v))));
*v = __hdiv(one, __hadd(one, hexp(__hneg(*v))));
#else
float fv = __half2float(*v);
*v = __float2half(1.0f / (1.0f + expf(- fv)));
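
Both branches compute the logistic sigmoid 1 / (1 + e^-x); the #else branch widens to float because half intrinsics aren't available. A plain C++ sketch of that fallback path, with float standing in for the half round-trip:

```cpp
// Fallback path of TensorSigmoidOp<half>: widen to float, evaluate the
// logistic function, then narrow back to half (float stands in here).
#include <cmath>
#include <cstdio>

float sigmoid_via_float(float x) {
  return 1.0f / (1.0f + std::exp(-x));
}

int main() {
  std::printf("%f %f %f\n", sigmoid_via_float(-2.0f),
              sigmoid_via_float(0.0f), sigmoid_via_float(2.0f));
  // prints roughly 0.119203 0.500000 0.880797
  return 0;
}
```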

View file

@ -80,7 +80,11 @@ struct ReduceMin {
template <typename T>
struct ReduceMax {
inline __device__ T operator()(T a, T b) const {
#if defined(__HIP_PLATFORM_HCC__)
return (static_cast<int>(THCNumerics<T>::sub(a, b)) > 0 || THCNumerics<T>::isnan(a)) ? a : b;
#else
return (THCNumerics<T>::gt(a, b) || THCNumerics<T>::isnan(a)) ? a : b;
#endif
}
};
@ -117,6 +121,7 @@ __global__ void THCTensor_kernel_renorm(T *data,
buffer[tx] = scalar_cast<AccT>(0);
AccT norm;
#if !defined(__HIP_DEVICE_COMPILE__)
if (THCNumerics<AccT>::eq(value, scalar_cast<AccT, float>(INFINITY))) {
// get norm of axis
for (ptrdiff_t i = tx; i < size; i += step) {
@ -163,6 +168,7 @@ __global__ void THCTensor_kernel_renorm(T *data,
row[i] = scalar_cast<T>(THCNumerics<AccT>::mul(val, norm));
}
}
#endif
}
template <typename T>

View file

@ -170,6 +170,7 @@ __global__ void generate_bernoulli_tensor(curandStateMtgp32 *state, int size,
// NOTE: curand_uniform is (0, 1] and we want [a, b)
GENERATE_KERNEL2(generate_uniform, float, float a, float b, float, curand_uniform, reverse_bounds(x) * (b-a) + a)
GENERATE_KERNEL2(generate_uniform, float, double a, double b, float, curand_uniform, reverse_bounds(x) * (b-a) + a)
GENERATE_KERNEL2(generate_uniform, double, double a, double b, double, curand_uniform_double, reverse_bounds(x) * (b-a) + a)
GENERATE_KERNEL2(generate_normal, float, double mean, double stdv, float, curand_normal, (x * stdv) + mean)
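
Per the NOTE, curand_uniform samples (0, 1] but uniform_ wants [a, b); reverse_bounds presumably maps the closed endpoint 1 to 0 (an assumption based on the comment, not shown in this diff), after which the affine step rescales. A hedged C++ sketch:

```cpp
// Sketch of the uniform transform: x in (0, 1] from curand; reverse_bounds
// (assumed: 1 -> 0, else identity) flips the interval to [0, 1), and the
// affine step rescales into [a, b).
#include <cassert>

float reverse_bounds(float x) {
  return (x == 1.0f) ? 0.0f : x;  // assumed definition, inferred from the NOTE
}

float uniform_ab(float x_from_curand, float a, float b) {
  return reverse_bounds(x_from_curand) * (b - a) + a;
}

int main() {
  assert(uniform_ab(1.0f, 2.0f, 5.0f) == 2.0f);  // closed end 1 maps to a
  assert(uniform_ab(0.5f, 2.0f, 5.0f) == 3.5f);  // interior points scale linearly
  return 0;
}
```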

View file

@ -91,7 +91,7 @@ struct ScalarNegate<half> {
return __float2half(-__half2float(v));
#endif
#else
#if CUDA_VERSION < 9000
#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)
half out = v;
#else
__half_raw out = __half_raw(v);
@ -116,7 +116,7 @@ struct ScalarInv<half> {
};
inline bool operator==(half a, half b) {
#if CUDA_VERSION < 9000
#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)
return a.x == b.x;
#else
__half_raw araw, braw;
@ -127,7 +127,7 @@ inline bool operator==(half a, half b) {
}
inline bool operator!=(half a, half b) {
#if CUDA_VERSION < 9000
#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)
return a.x != b.x;
#else
__half_raw araw, braw;
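
Pre-CUDA-9 half is a struct whose raw bits sit in .x, so these operators compare bit patterns, not numeric values; hence the separate __half_raw path for CUDA 9+ and HIP. A stand-in C++ sketch of the bitwise semantics:

```cpp
// Stand-in for the pre-CUDA-9 half struct: .x holds the raw binary16 bits,
// and equality compares those bits directly.
#include <cassert>
#include <cstdint>

struct half_t { uint16_t x; };

inline bool operator==(half_t a, half_t b) { return a.x == b.x; }
inline bool operator!=(half_t a, half_t b) { return a.x != b.x; }

int main() {
  half_t pos_zero{0x0000}, neg_zero{0x8000}, nan_a{0x7e00};
  // Bitwise equality differs from IEEE float semantics:
  assert(pos_zero != neg_zero);  // numerically +0 == -0, but the bits differ
  assert(nan_a == nan_a);        // numerically NaN != NaN, but the bits match
  return 0;
}
```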

View file

@ -23,7 +23,7 @@ struct TensorSigmoidOp<half> {
__device__ __forceinline__ void operator()(half* out, half* in) const {
#ifdef CUDA_HALF_INSTRUCTIONS
half one = ScalarConvert<int, half>::to(1);
*out = hdiv(one, __hadd(one, hexp(__hneg(*in))));
*out = __hdiv(one, __hadd(one, hexp(__hneg(*in))));
#else
float fin = ScalarConvert<half, float>::to(*in);
*out = ScalarConvert<float, half>::to(1.0f / (1.0f + expf(- fin)));
@ -33,7 +33,7 @@ struct TensorSigmoidOp<half> {
__device__ __forceinline__ void operator()(half* v) const {
#ifdef CUDA_HALF_INSTRUCTIONS
half one = ScalarConvert<int, half>::to(1);
*v = hdiv(one, __hadd(one, hexp(__hneg(*v))));
*v = __hdiv(one, __hadd(one, hexp(__hneg(*v))));
#else
float fv = ScalarConvert<half, float>::to(*v);
*v = ScalarConvert<float, half>::to(1.0f / (1.0f + expf(- fv)));

View file

@ -42,6 +42,12 @@ if(BUILD_ATEN)
list(APPEND Caffe2_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS})
list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS})
list(APPEND Caffe2_DEPENDENCY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE})
IF(USE_ROCM)
# Set the HIP Variables
set(Caffe2_HIP_SRCS ${ATen_CUDA_SRCS})
set(Caffe2_HIP_INCLUDES ${Caffe2_HIP_INCLUDES} ${Caffe2_GPU_INCLUDE})
ENDIF(USE_ROCM)
endif()
# ---[ Caffe2 build
@ -205,7 +211,7 @@ caffe2_interface_library(caffe2 caffe2_library)
list(APPEND Caffe2_MAIN_LIBS caffe2_library)
# ---[ CUDA library.
if(USE_CUDA OR (USE_ROCM AND NOT BUILD_CAFFE2))
if(USE_CUDA)
# A hack to deal with cuda library dependencies and modern CMake: the
# CUDA_ADD_LIBRARY includes a target_link_libraries, and as a result,
# one cannot use PUBLIC/PRIVATE/INTERFACE for the target anymore. This
@ -243,31 +249,46 @@ if(USE_CUDA OR (USE_ROCM AND NOT BUILD_CAFFE2))
endif()
# ---[ Caffe2 HIP sources.
if(BUILD_CAFFE2)
if(USE_ROCM)
HIP_ADD_LIBRARY(caffe2_hip ${Caffe2_HIP_SRCS})
set_target_properties(caffe2_hip PROPERTIES COMPILE_FLAGS ${Caffe2_HIP_CXX_FLAGS})
if(USE_ROCM)
HIP_INCLUDE_DIRECTORIES(${Caffe2_HIP_INCLUDES})
target_include_directories(
caffe2_hip PUBLIC ${Caffe2_HIP_INCLUDES})
target_include_directories(
caffe2_hip INTERFACE $<INSTALL_INTERFACE:include>)
IF(BUILD_ATEN)
# Set necessary HIPCC Flags
SET(HIP_HCC_FLAGS "-fPIC -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${HIP_HCC_FLAGS}")
SET(Caffe2_HIP_CXX_FLAGS "-fPIC -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${Caffe2_HIP_CXX_FLAGS}")
ENDIF()
target_link_libraries(caffe2_hip PUBLIC caffe2)
target_link_libraries(caffe2_hip PUBLIC ${Caffe2_HIP_DEPENDENCY_LIBS})
# Since the HIP_ADD_LIBRARY is a MACRO, we need to set HIP_HCC_FLAGS prior to calling it.
# Also, Since HIP_INCLUDE_DIRECTORIES is a MACRO, we must call it before HIP_ADD_LIBRARY.
HIP_ADD_LIBRARY(caffe2_hip ${Caffe2_HIP_SRCS})
set_target_properties(caffe2_hip PROPERTIES LINKER_LANGUAGE HIP)
set_target_properties(caffe2_hip PROPERTIES COMPILE_FLAGS ${Caffe2_HIP_CXX_FLAGS})
caffe2_interface_library(caffe2_hip caffe2_hip_library)
list(APPEND Caffe2_MAIN_LIBS caffe2_hip_library)
install(TARGETS caffe2_hip EXPORT Caffe2Targets DESTINATION lib)
endif()
target_include_directories(
caffe2_hip PRIVATE ${Caffe2_HIP_INCLUDES})
target_include_directories(
caffe2_hip INTERFACE $<INSTALL_INTERFACE:include>)
IF(BUILD_ATEN)
aten_set_target_props(caffe2_hip)
ENDIF()
target_link_libraries(caffe2_hip PUBLIC caffe2)
target_link_libraries(caffe2_hip PUBLIC ${Caffe2_HIP_DEPENDENCY_LIBS})
# https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_faq.md#what-if-hip-generates-error-of-symbol-multiply-defined-only-on-amd-machine
# To avoid having to do the above, keep this commented.
# set_target_properties(caffe2_hip PROPERTIES LINKER_LANGUAGE HIP)
caffe2_interface_library(caffe2_hip caffe2_hip_library)
list(APPEND Caffe2_MAIN_LIBS caffe2_hip_library)
install(TARGETS caffe2_hip EXPORT Caffe2Targets DESTINATION lib)
endif()
# ---[ Check if warnings should be errors.
if ($ENV{WERROR})
target_compile_options(caffe2 PRIVATE -Werror)
if(USE_CUDA OR USE_ROCM)
if(USE_CUDA)
target_compile_options(caffe2_gpu PRIVATE -Werror)
endif()
endif()
@ -315,7 +336,8 @@ if(BUILD_CAFFE2)
# Aten tests should only run when Caffe2 is not built
set(__aten_test_dir "test/aten")
endif()
if(BUILD_ATEN)
# Todo - Set up ATen tests for ROCm in an upcoming PR
if(BUILD_ATEN AND NOT USE_ROCM)
foreach(test_src ${ATen_CPU_TEST_SRCS})
get_filename_component(test_name ${test_src} NAME_WE)
add_executable(${test_name} "${test_src}")
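
The ordering comments above matter because CMake macros (HIP_ADD_LIBRARY, HIP_INCLUDE_DIRECTORIES) expand at the call site and read whatever variables exist at that moment, much like C preprocessor macros. A tiny sketch of the same expand-at-use behavior:

```cpp
// Like a CMake macro, a preprocessor macro expands where it is used, so it
// sees whatever the caller defined *before* that point -- which is why
// HIP_HCC_FLAGS must be set before HIP_ADD_LIBRARY is invoked.
#include <cstdio>

#define ADD_LIBRARY(name) std::printf("building %s with flags: %s\n", name, hcc_flags)

int main() {
  const char* hcc_flags = "-fPIC -D__HIP_PLATFORM_HCC__=1";  // must exist before expansion
  ADD_LIBRARY("caffe2_hip");
  return 0;
}
```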

View file

@ -442,8 +442,8 @@ if(USE_CUDA)
endif()
# ---[ HIP
if(BUILD_CAFFE2)
include(cmake/public/LoadHIP.cmake)
if(BUILD_CAFFE2 OR BUILD_ATEN)
include(${CMAKE_CURRENT_LIST_DIR}/public/LoadHIP.cmake)
if(PYTORCH_FOUND_HIP)
message(INFO "Compiling with HIP for AMD.")
caffe2_update_option(USE_ROCM ON)
@ -452,8 +452,11 @@ if(BUILD_CAFFE2)
set(Caffe2_HIP_INCLUDES
${hip_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${Caffe2_HIP_INCLUDES} ${thrust_INCLUDE_DIRS})
set(Caffe2_HIP_DEPENDENCY_LIBS
${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES})
${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipblas_LIBRARIES})
# Additional libraries required by PyTorch AMD that aren't used by Caffe2 (not in Caffe2's docker image)
if(BUILD_ATEN)
set(Caffe2_HIP_DEPENDENCY_LIBS ${Caffe2_HIP_DEPENDENCY_LIBS} ${hipsparse_LIBRARIES} ${hiprng_LIBRARIES})
endif()
# TODO: There is a bug in rocblas's cmake files that exports the wrong targets name in ${rocblas_LIBRARIES}
list(APPEND Caffe2_HIP_DEPENDENCY_LIBS
roc::rocblas)
@ -475,7 +478,7 @@ if(USE_ROCM AND NOT BUILD_CAFFE2)
EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS)
# Link with HIPCC https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_porting_guide.md#linking-with-hipcc
SET(CMAKE_CXX_LINK_EXECUTABLE ${HIP_HIPCC_EXECUTABLE})
# SET(CMAKE_CXX_LINK_EXECUTABLE ${HIP_HIPCC_EXECUTABLE})
# Show message that we're using ROCm.
MESSAGE(STATUS "ROCM TRUE:")

View file

@ -103,11 +103,15 @@ IF(HIP_FOUND)
set(hiprand_DIR ${HIPRAND_PATH}/lib/cmake/hiprand)
set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas)
set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen)
set(hipblas_DIR ${HIPBLAS_PATH}/lib/cmake/hipblas)
set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse)
find_package(rocrand REQUIRED)
find_package(hiprand REQUIRED)
find_package(rocblas REQUIRED)
find_package(miopen REQUIRED)
#find_package(hipblas REQUIRED) There's a bug with the CMake file in the Hipblas package.
#find_package(hipsparse REQUIRED)
# TODO: hip_hcc has an interface include flag "-hc" which is only
# recognizable by hcc, but not gcc and clang. Right now in our
@ -117,6 +121,10 @@ IF(HIP_FOUND)
# TODO: miopen_LIBRARIES should return fullpath to the library file,
# however currently it's just the lib name
FIND_LIBRARY(PYTORCH_MIOPEN_LIBRARIES ${miopen_LIBRARIES} HINTS ${MIOPEN_PATH}/lib)
FIND_LIBRARY(hiprand_LIBRARIES hiprand HINTS ${HIPRAND_PATH}/lib)
FIND_LIBRARY(hiprng_LIBRARIES hcrng HINTS ${HIPRNG_PATH}/lib)
FIND_LIBRARY(hipblas_LIBRARIES hipblas HINTS ${HIPBLAS_PATH}/lib)
FIND_LIBRARY(hipsparse_LIBRARIES hipsparse HINTS ${HIPSPARSE_PATH}/lib)
set(thrust_INCLUDE_DIRS ${THRUST_PATH} ${THRUST_PATH}/thrust/system/cuda/detail/cub-hip)

View file

@ -34,8 +34,10 @@ if [[ "$image" == *cuda* ]]; then
DOCKERFILE="${OS}-cuda/Dockerfile"
fi
# TODO: the version number here actually doesn't do anything at the
# moment
if [[ "$image" == *rocm* ]]; then
ROCM_VERSION="$(echo "${image}" | perl -n -e'/rocm(\d+\.\d+.\d+)/ && print $1')"
ROCM_VERSION="$(echo "${image}" | perl -n -e'/rocm(\d+\.\d+\.\d+|nightly)/ && print $1')"
DOCKERFILE="${OS}-rocm/Dockerfile"
fi
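
The tightened pattern now accepts either a dotted triple or the literal "nightly". For reference, a C++ equivalent of the perl one-liner:

```cpp
// C++ equivalent of: perl -n -e '/rocm(\d+\.\d+\.\d+|nightly)/ && print $1'
#include <iostream>
#include <regex>
#include <string>

std::string rocm_version(const std::string& image) {
  static const std::regex re(R"(rocm(\d+\.\d+\.\d+|nightly))");
  std::smatch m;
  return std::regex_search(image, m, re) ? m[1].str() : "";
}

int main() {
  std::cout << rocm_version("py2-clang3.8-rocmnightly-ubuntu16.04") << "\n";  // nightly
  std::cout << rocm_version("ubuntu16.04-rocm1.8.2") << "\n";                 // 1.8.2
  return 0;
}
```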

View file

@ -2,8 +2,25 @@
set -ex
# TODO: This script should install a SPECIFIC ROCM_VERSION, but actually
# it ignores all values of ROCM_VERSION which are not nightly. Ugh!
[ -n "$ROCM_VERSION" ]
install_hip_nightly() {
git clone https://github.com/ROCm-Developer-Tools/HIP.git
pushd HIP
export HIP_PLATFORM=hcc
yes | ./install.sh --install
popd
rm -rf HIP
git clone https://github.com/ROCmSoftwarePlatform/hipBLAS.git
pushd hipBLAS
yes | ./install.sh --install
popd
rm -rf hipBLAS
}
install_ubuntu() {
apt-get update
apt-get install -y wget
@ -36,11 +53,28 @@ install_centos() {
install_hip_thrust() {
# Needed for now, will be replaced soon
git clone --recursive https://github.com/ROCmSoftwarePlatform/Thrust.git /data/Thrust
pushd /data/Thrust
curl https://patch-diff.githubusercontent.com/raw/ROCmSoftwarePlatform/Thrust/pull/12.patch -o 12.patch
patch -p1 < 12.patch
rm 12.patch
popd
rm -rf /data/Thrust/thrust/system/cuda/detail/cub-hip
git clone --recursive https://github.com/ROCmSoftwarePlatform/cub-hip.git /data/Thrust/thrust/system/cuda/detail/cub-hip
cd /data/Thrust/thrust/system/cuda/detail/cub-hip && git checkout hip_port_1.7.4_caffe2 && cd -
}
install_hcrng() {
mkdir -p /opt/rocm/debians
curl https://s3.amazonaws.com/ossci-linux/hcrng-master-a8c6a0b-Linux.deb -o /opt/rocm/debians/hcrng.deb
dpkg -i /opt/rocm/debians/hcrng.deb
}
install_hcsparse() {
mkdir -p /opt/rocm/debians
curl https://s3.amazonaws.com/ossci-linux/hcsparse-master-907a505-Linux.deb -o /opt/rocm/debians/hcsparse.deb
dpkg -i /opt/rocm/debians/hcsparse.deb
}
# Install Python packages depending on the base OS
if [ -f /etc/lsb-release ]; then
@ -52,4 +86,12 @@ else
exit 1
fi
# NB: We first install the "wrong" version, but then use those dev tools
# to install the newer version of HIP.
if [ "$ROCM_VERSION" = "nightly" ]; then
install_hip_nightly
fi
install_hip_thrust
install_hcrng
install_hcsparse

View file

@ -104,6 +104,7 @@ import importlib
from tools.setup_helpers.env import check_env_flag
from tools.setup_helpers.cuda import WITH_CUDA, CUDA_HOME, CUDA_VERSION
from tools.setup_helpers.rocm import WITH_ROCM, ROCM_HOME, ROCM_VERSION
from tools.setup_helpers.cudnn import (WITH_CUDNN, CUDNN_LIBRARY,
CUDNN_LIB_DIR, CUDNN_INCLUDE_DIR)
from tools.setup_helpers.nccl import WITH_NCCL, WITH_SYSTEM_NCCL, NCCL_LIB_DIR, \
@ -126,7 +127,6 @@ DEBUG = check_env_flag('DEBUG')
IS_WINDOWS = (platform.system() == 'Windows')
IS_DARWIN = (platform.system() == 'Darwin')
IS_LINUX = (platform.system() == 'Linux')
WITH_ROCM = check_env_flag('WITH_ROCM')
FULL_CAFFE2 = check_env_flag('FULL_CAFFE2')
BUILD_PYTORCH = check_env_flag('BUILD_PYTORCH')
@ -631,8 +631,10 @@ library_dirs.append(lib_path)
# we specify exact lib names to avoid conflict with lua-torch installs
CAFFE2_LIBS = [os.path.join(lib_path, 'libcaffe2.so')]
if WITH_CUDA or WITH_ROCM:
if WITH_CUDA:
CAFFE2_LIBS.extend(['-Wl,--no-as-needed', os.path.join(lib_path, 'libcaffe2_gpu.so'), '-Wl,--as-needed'])
if WITH_ROCM:
CAFFE2_LIBS.extend(['-Wl,--no-as-needed', os.path.join(lib_path, 'libcaffe2_hip.so'), '-Wl,--as-needed'])
THD_LIB = os.path.join(lib_path, 'libTHD.a')
NCCL_LIB = os.path.join(lib_path, 'libnccl.so.1')
C10D_LIB = os.path.join(lib_path, 'libc10d.a')
@ -643,14 +645,18 @@ NANOPB_STATIC_LIB = os.path.join(lib_path, 'libprotobuf-nanopb.a')
if IS_DARWIN:
CAFFE2_LIBS = [os.path.join(lib_path, 'libcaffe2.dylib')]
if WITH_CUDA or WITH_ROCM:
if WITH_CUDA:
CAFFE2_LIBS.append(os.path.join(lib_path, 'libcaffe2_gpu.dylib'))
if WITH_ROCM:
CAFFE2_LIBS.append(os.path.join(lib_path, 'libcaffe2_hip.dylib'))
NCCL_LIB = os.path.join(lib_path, 'libnccl.1.dylib')
if IS_WINDOWS:
CAFFE2_LIBS = [os.path.join(lib_path, 'caffe2.lib')]
if WITH_CUDA or WITH_ROCM:
if WITH_CUDA:
CAFFE2_LIBS.append(os.path.join(lib_path, 'caffe2_gpu.lib'))
if WITH_ROCM:
CAFFE2_LIBS.append(os.path.join(lib_path, 'caffe2_hip.lib'))
if DEBUG:
NANOPB_STATIC_LIB = os.path.join(lib_path, 'protobuf-nanopbd.lib')
else:

View file

@ -2,12 +2,12 @@
import shutil
import subprocess
import os
import sys
from shutil import copytree, ignore_patterns
from functools import reduce
amd_build_dir = os.path.dirname(os.path.realpath(__file__))
proj_dir = os.path.dirname(os.path.dirname(amd_build_dir))
out_dir = os.path.join(os.path.dirname(proj_dir), "pytorch_amd")
include_dirs = [
"aten",
"torch"
@ -16,16 +16,13 @@ include_dirs = [
# List of operators currently disabled
yaml_file = os.path.join(amd_build_dir, "disabled_features.yaml")
# Create the pytorch_amd directory
shutil.copytree(proj_dir, out_dir)
# Apply patch files.
patch_folder = os.path.join(amd_build_dir, "patches")
for filename in os.listdir(os.path.join(amd_build_dir, "patches")):
subprocess.Popen(["git", "apply", os.path.join(patch_folder, filename)], cwd=out_dir)
subprocess.Popen(["git", "apply", os.path.join(patch_folder, filename)], cwd=proj_dir)
# HIPCC Compiler doesn't provide host defines - Automatically include them.
for root, _, files in os.walk(os.path.join(out_dir, "aten/src/ATen")):
for root, _, files in os.walk(os.path.join(proj_dir, "aten/src/ATen")):
for filename in files:
if filename.endswith(".cu") or filename.endswith(".cuh"):
filepath = os.path.join(root, filename)
@ -45,7 +42,7 @@ for root, _, files in os.walk(os.path.join(out_dir, "aten/src/ATen")):
# Make various replacements inside AMD_BUILD/torch directory
ignore_files = ["csrc/autograd/profiler.h", "csrc/autograd/profiler.cpp",
"csrc/cuda/cuda_check.h", "csrc/jit/fusion_compiler.cpp"]
for root, _directories, files in os.walk(os.path.join(out_dir, "torch")):
for root, _directories, files in os.walk(os.path.join(proj_dir, "torch")):
for filename in files:
if filename.endswith(".cpp") or filename.endswith(".h"):
source = os.path.join(root, filename)
@ -53,7 +50,7 @@ for root, _directories, files in os.walk(os.path.join(out_dir, "torch")):
if reduce(lambda result, exclude: source.endswith(exclude) or result, ignore_files, False):
continue
# Update contents.
with open(source, "r+", encoding="utf-8") as f:
with open(source, "r+") as f:
contents = f.read()
contents = contents.replace("WITH_CUDA", "WITH_ROCM")
contents = contents.replace("CUDA_VERSION", "0")
@ -64,9 +61,7 @@ for root, _directories, files in os.walk(os.path.join(out_dir, "torch")):
os.fsync(f)
# Execute the Hipify Script.
subprocess.Popen(
["/opt/rocm/bin/hipify-python.py",
"--project-directory", proj_dir,
"--output-directory", out_dir,
"--include-dirs"] + include_dirs +
["--yaml-settings", yaml_file, "--add-static-casts", "True"])
args = ["--project-directory", proj_dir,
"--output-directory", proj_dir,
"--include-dirs"] + include_dirs + ["--yaml-settings", yaml_file, "--add-static-casts", "True"]
os.execv("/opt/rocm/bin/hipify-python.py", ['python'] + args)

View file

@ -1,5 +1,5 @@
{
"disabled_hip_function_calls":
"disable_unsupported_hip_calls":
[
{
"path": "aten/src/THC/generic/THCTensorSort.cu",
@ -72,11 +72,16 @@
"#include <nvfunctional>": ""
}
},
{
"path": "aten/src/ATen/native/cuda/Distributions.cu",
"s_constants": {
"#include <nvfunctional>": ""
}
},
{
"path": "aten/src/THC/THCNumerics.cuh",
"s_constants": {
"#ifdef __CUDA_ARCH__": "#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__)",
"labs": "fabs",
"#if CUDA_VERSION < 9000": "#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)"
}
},
@ -129,10 +134,16 @@
"disabled_modules": [
"aten/src/ATen/native/cuda/CuFFTUtils.h",
"aten/src/ATen/native/cuda/SpectralOps.cu",
"aten/src/ATen/native/cuda/Distributions.cu",
"aten/src/THCUNN/RReLU.cu"
"aten/src/THCUNN/RReLU.cu",
"aten/src/ATen/native/cuda/Distributions.cu"
],
"disabled_functions": [
{
"path": "aten/src/ATen/cuda/CUDAApplyUtils.cuh",
"functions": [
"kernelPointwiseApply4"
]
},
{
"path": "aten/src/ATen/cuda/detail/IndexUtils.cu",
"non_device_functions": [
@ -148,7 +159,9 @@
{
"path": "aten/src/ATen/native/cuda/Distributions.cu",
"functions": [
"_s_poisson_cuda"
"_s_poisson_cuda",
"poisson_cuda_kernel",
"gamma_cuda_kernel"
]
},
{

View file

@ -1,23 +0,0 @@
diff --git a/aten/src/THCUNN/THCHalfAutoNumerics.cuh b/aten/src/THCUNN/THCHalfAutoNumerics.cuh
index 2653fed0b..c4e9089e0 100644
--- a/aten/src/THCUNN/THCHalfAutoNumerics.cuh
+++ b/aten/src/THCUNN/THCHalfAutoNumerics.cuh
@@ -19,13 +19,17 @@ inline __host__ __device__ float fmaxType(float x, half y) {
}
#endif
+/* In ROCm we have a conversion from half to __fp16, and then there's a
+conversion operator from __fp16 to double (w/ the standard conversion
+double to float), so comment out these two lines to prevent ambiguous calls
+for fmaxType when half is passed in.
inline __host__ __device__ float fmaxType(float x, float y) {
return fmaxf(x, y);
}
inline __host__ __device__ double fmaxType(double x, double y) {
return fmax(x, y);
-}
+}*/
#ifdef CUDA_HALF_TENSOR
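
The rationale in this deleted patch is a classic ambiguous-overload problem: once half converts to __fp16, which converts to both float and double, fmaxType(h, h) matches the (float, float) and (double, double) overloads equally well. A self-contained sketch with a hypothetical Half type standing in:

```cpp
// Hypothetical Half with implicit conversions to both float and double,
// mimicking the half -> __fp16 -> float/double chain described above.
struct Half {
  float v;
  operator float() const { return v; }
  operator double() const { return v; }
};

float  fmaxType(float x, float y)   { return x > y ? x : y; }
double fmaxType(double x, double y) { return x > y ? x : y; }

int main() {
  Half a{1.0f}, b{2.0f};
  // fmaxType(a, b);             // error: ambiguous -- both overloads are viable
  fmaxType(float(a), float(b));  // fine once the conversion is made explicit
  return 0;
}
```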

View file

@ -1,39 +0,0 @@
diff --git a/aten/src/THC/THCTensorMathReduce.cuh b/aten/src/THC/THCTensorMathReduce.cuh
index ca6bf7cbe..a523648f1 100644
--- a/aten/src/THC/THCTensorMathReduce.cuh
+++ b/aten/src/THC/THCTensorMathReduce.cuh
@@ -105,7 +105,7 @@ struct SquareFunctor<ResT, half> {
template <typename T>
struct ReduceMin {
inline __device__ T operator()(T a, T b) const {
- return (THCNumerics<T>::lt(a, b) ||
+ return ((int)THCNumerics<T>::sub(a, b) < 0 ||
THCNumerics<T>::isnan(a)) ? a : b;
}
};
@@ -113,7 +113,7 @@ struct ReduceMin {
template <typename T>
struct ReduceMax {
inline __device__ T operator()(T a, T b) const {
- return (THCNumerics<T>::gt(a, b) ||
+ return ((int)THCNumerics<T>::sub(a, b) > 0 ||
THCNumerics<T>::isnan(a)) ? a : b;
}
};
@@ -167,7 +167,7 @@ __global__ void THCTensor_kernel_renorm(Real *data, const Real value, const ptrd
buffer[tx] = ScalarConvert<int, Real>::to(0);
Real norm;
-
+ #if !defined(__HIP_DEVICE_COMPILE__)
if (THCNumerics<Real>::eq(value, ScalarConvert<float, Real>::to(INFINITY))) {
// get norm of axis
for (ptrdiff_t i=tx; i<size; i+=step)
@@ -225,6 +225,7 @@ __global__ void THCTensor_kernel_renorm(Real *data, const Real value, const ptrd
row[i] = THCNumerics<Real>::mul(row[i], norm);
}
}
+ #endif
}
template <typename T>

View file

@ -1,5 +1,5 @@
diff --git a/aten/src/THC/generic/THCTensorRandom.cu b/aten/src/THC/generic/THCTensorRandom.cu
index 906780b4f..c99f156ab 100644
index 906780b4f..b03e051cb 100644
--- a/aten/src/THC/generic/THCTensorRandom.cu
+++ b/aten/src/THC/generic/THCTensorRandom.cu
@@ -504,11 +504,11 @@ THC_API void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_
@ -7,7 +7,7 @@ index 906780b4f..c99f156ab 100644
if (range > 1ULL << 32) {
generate_random_64<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
- gen->state.gen_states, size, data, min_val, range);
+ gen->state.gen_states, static_cast<int>(size), data, static_cast<std::conditional<std::is_same<real,float>::value, float, double>::type>(min_val), static_cast<std::conditional<std::is_same<real,float>::value, float, double>::type>(range));
+ gen->state.gen_states, static_cast<int>(size), data, min_val, range);
} else {
#endif
generate_random<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
@ -17,7 +17,7 @@ index 906780b4f..c99f156ab 100644
}
#endif
@@ -534,19 +534,19 @@ THC_API void THCTensor_(random)(THCState* state, THCTensor *self_)
#if defined(THC_REAL_IS_HALF)
generate_random<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
- gen->state.gen_states, size, data, 0UL, (1UL << HLF_MANT_DIG) + 1);
@ -39,5 +39,5 @@ index 906780b4f..c99f156ab 100644
- gen->state.gen_states, size, data, 0UL, static_cast<uint32_t>(std::numeric_limits<real>::max()) + 1);
+ gen->state.gen_states, static_cast<int>(size), data, static_cast<int32_t>(0UL), static_cast<uint32_t>(std::numeric_limits<real>::max()) + 1);
#endif
THCTensor_(freeCopyTo)(state, self, self_);

View file

@ -0,0 +1,5 @@
from .env import check_env_flag
# Check if ROCM is enabled
WITH_ROCM = check_env_flag('WITH_ROCM')
ROCM_HOME = "/opt/rocm"
ROCM_VERSION = ""
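
check_env_flag comes from tools/setup_helpers/env.py and presumably treats values like "1", "ON", or "YES" as truthy; that behavior is an assumption here, not shown in this diff. A C++ analogue for illustration:

```cpp
// Analogue of check_env_flag (assumed semantics: uppercase the variable and
// accept "1"/"ON"/"YES"/"TRUE"/"Y" -- an assumption, not shown in this diff).
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <string>

bool check_env_flag(const char* name) {
  const char* raw = std::getenv(name);
  if (!raw) return false;
  std::string v;
  for (const char* p = raw; *p; ++p)
    v += static_cast<char>(std::toupper(static_cast<unsigned char>(*p)));
  return v == "1" || v == "ON" || v == "YES" || v == "TRUE" || v == "Y";
}

int main() {
  std::printf("WITH_ROCM=%d\n", check_env_flag("WITH_ROCM") ? 1 : 0);
  return 0;
}
```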