Get ROCm building again on master (#8343)
Billing of changes:
- New Jenkins script for building on ROCm. For now it is a bit hacked together, but we can improve it once CI is running
- New ROCm docker image for nightly HIP, and also some legacy packages that we need temporarily
- New enabled config py2-clang3.8-rocmnightly-ubuntu16.04-build, based off of the existing Caffe2 image (not built yet)
- A big pile of cmake fixes, mostly to turn bits on/off when a ROCm build is involved
- Switch from hiprng to hcrng
- Apply some patches directly in code, eliminating the patch files
- Use __hdiv instead of hdiv; it's more portable
- THCNumerics<T>::gt doesn't work in HIP, so simulate it with sub
- Add a few more overloads HIP needs
- Turn off use of hcc to link (we plan to turn this back on to get tests running)
- Search for hiprand, hiprng, hipblas, hipsparse
- Better Python 2 portability
This commit is contained in:
parent
49d6c5f99f
commit
81b92f7515
22 changed files with 187 additions and 132 deletions
@@ -23,6 +23,21 @@ gcc --version
 # TODO: Don't run this...
 pip install -r requirements.txt || true
+
+if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
+  export HCC_AMDGPU_TARGET=gfx900
+
+  # TODO: Install pyHIPIFY in the docker image
+  rm -rf pyHIPIFY || true
+  git clone https://github.com/ROCm-Developer-Tools/pyHIPIFY.git
+  chmod a+x pyHIPIFY/*.py
+  sudo cp -p pyHIPIFY/*.py /opt/rocm/bin
+  rm -rf "$(dirname "${BASH_SOURCE[0]}")/../../../pytorch_amd/" || true
+  python "$(dirname "${BASH_SOURCE[0]}")/../../tools/amd_build/build_pytorch_amd.py"
+  HIPCC_VERBOSE=1 VERBOSE=1 WITH_ROCM=1 python setup.py install
+  exit
+fi
 
 # TODO: Don't install this here
 if ! which conda; then
   pip install mkl mkl-devel
 fi
@@ -36,3 +36,4 @@ pytorch-macos-10.13-cuda9.2-cudnn7-py3-build
 pytorch-docker-build-test
 short-perf-test-cpu
 short-perf-test-gpu
+py2-clang3.8-rocmnightly-ubuntu16.04-build
@@ -203,7 +203,7 @@ if(NOT MSVC)
   set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE)
 endif()
 
-IF(USE_CUDA)
+IF(USE_CUDA AND NOT USE_ROCM)
   IF ($ENV{ATEN_STATIC_CUDA})
     # CuFFT has a complicated static story (especially around CUDA < 9) because it has device callback support
     # we first have to build a fake lib that links with no device callbacks,

@@ -250,6 +250,10 @@ IF(USE_CUDA)
     ${CUDA_LIBRARIES}
     ${CUDA_cusparse_LIBRARY}
     ${CUDA_curand_LIBRARY})
+
+  # Set necessary HIPCC/CXX Flags
+  SET(HIP_HCC_FLAGS "-DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${HIP_HCC_FLAGS}")
+  SET(CMAKE_CXX_FLAGS "-DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${CMAKE_CXX_FLAGS}")
 ENDIF()
 
 if(CUDNN_FOUND)

@@ -271,7 +275,7 @@ ENDIF()
 IF(USE_ROCM)
   ### Link in the ROCm libraries BLAS / RNG.
   FIND_LIBRARY(HIPBLAS_LIBRARY hipblas HINTS ${HIPBLAS_PATH}/lib)
-  FIND_LIBRARY(HIPRNG_LIBRARY hiprng HINTS ${HIPRNG_PATH}/lib)
+  FIND_LIBRARY(HIPRNG_LIBRARY hcrng HINTS ${HIPRNG_PATH}/lib)
 
   list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${HIPBLAS_LIBRARY} ${HIPRNG_LIBRARY})
 ENDIF()
@@ -103,7 +103,7 @@ static inline __device__ void atomicAdd(half *address, half val) {
 
   do {
     assumed = old;
-#if CUDA_VERSION < 9000
+#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)
     half hsum;
     hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
     hsum = THCNumerics<half>::add(hsum, val);

@@ -135,7 +135,7 @@ static inline __device__ void atomicAdd(double *address, double val) {
   // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
   } while (assumed != old);
 }
-#elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000)
+#elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000) || defined(__HIP_PLATFORM_HCC__)
 // This needs to be defined for the host side pass
 static inline __device__ void atomicAdd(double *address, double val) { }
 #endif
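For readers unfamiliar with the compare-and-swap loop these hunks guard: older architectures have no native atomicAdd for these types, so it is emulated by reinterpreting the value's bits as an integer and retrying until no other thread raced the update. A minimal sketch of the well-known double-precision form of the idiom (illustrative, not the exact file contents):

    #include <cuda_runtime.h>

    // Emulate atomicAdd(double*) with a 64-bit CAS loop (the standard pre-sm_60 idiom).
    __device__ double atomic_add_double(double* address, double val) {
        unsigned long long* address_as_ull = (unsigned long long*)address;
        unsigned long long old = *address_as_ull, assumed;
        do {
            assumed = old;
            // Reinterpret the bits, add in double, reinterpret back, then CAS.
            old = atomicCAS(address_as_ull, assumed,
                            __double_as_longlong(val + __longlong_as_double(assumed)));
            // Integer comparison avoids a hang when the stored value is NaN (NaN != NaN).
        } while (assumed != old);
        return __longlong_as_double(old);
    }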
@@ -4,22 +4,16 @@
 #include "THCGeneral.h"
 
 /* We compile with CudaHalfTensor support if we have this: */
-#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16
+#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16 || defined(__HIP_PLATFORM_HCC__)
 #define CUDA_HALF_TENSOR 1
 #endif
 
-/* For HIP, rely on the half instructions as well.*/
-#if defined(__HIP_PLATFORM_HCC__)
-#define CUDA_HALF_TENSOR 1
-#define CUDA_HALF_INSTRUCTIONS 1
-#endif
-
 #ifdef CUDA_HALF_TENSOR
 
 #include <cuda_fp16.h>
 #include <stdint.h>
 
-#if CUDA_VERSION >= 9000
+#if CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__)
 #ifndef __cplusplus
 typedef __half_raw half;
 #endif
@@ -37,7 +37,7 @@ struct TensorSigmoidOp<half> {
   __device__ __forceinline__ void operator()(half* out, half* in) const {
 #ifdef CUDA_HALF_INSTRUCTIONS
     half one = ScalarConvert<int, half>::to(1);
-    *out = hdiv(one, __hadd(one, hexp(__hneg(*in))));
+    *out = __hdiv(one, __hadd(one, hexp(__hneg(*in))));
 #else
     float fin = __half2float(*in);
     *out = __float2half(1.0f / (1.0f + expf(- fin)));

@@ -47,7 +47,7 @@ struct TensorSigmoidOp<half> {
   __device__ __forceinline__ void operator()(half* v) const {
 #ifdef CUDA_HALF_INSTRUCTIONS
     half one = ScalarConvert<int, half>::to(1);
-    *v = hdiv(one, __hadd(one, hexp(__hneg(*v))));
+    *v = __hdiv(one, __hadd(one, hexp(__hneg(*v))));
 #else
     float fv = __half2float(*v);
     *v = __float2half(1.0f / (1.0f + expf(- fv)));
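Both paths above compute sigmoid(x) = 1 / (1 + exp(-x)): the first stays in fp16 using the half intrinsics (per the commit message, the double-underscore spelling __hdiv is the portable one; plain hdiv is not), the second round-trips through float. A self-contained sketch of the same two-path structure, assuming a device with native fp16 arithmetic (sm_53+) for the fast path:

    #include <cuda_fp16.h>
    #include <math.h>

    __device__ __half sigmoid_half(__half x) {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
        // Native half path: all arithmetic stays in fp16.
        const __half one = __float2half(1.0f);
        return __hdiv(one, __hadd(one, hexp(__hneg(x))));
    #else
        // Fallback: promote to float, compute, and convert back.
        float fx = __half2float(x);
        return __float2half(1.0f / (1.0f + expf(-fx)));
    #endif
    }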
@@ -80,7 +80,11 @@ struct ReduceMin {
 template <typename T>
 struct ReduceMax {
   inline __device__ T operator()(T a, T b) const {
+#if defined(__HIP_PLATFORM_HCC__)
+    return (static_cast<int>(THCNumerics<T>::sub(a, b)) > 0 || THCNumerics<T>::isnan(a)) ? a : b;
+#else
     return (THCNumerics<T>::gt(a, b) || THCNumerics<T>::isnan(a)) ? a : b;
+#endif
   }
 };

@@ -117,6 +121,7 @@ __global__ void THCTensor_kernel_renorm(T *data,
   buffer[tx] = scalar_cast<AccT>(0);
   AccT norm;
 
+#if !defined(__HIP_DEVICE_COMPILE__)
   if (THCNumerics<AccT>::eq(value, scalar_cast<AccT, float>(INFINITY))) {
     // get norm of axis
     for (ptrdiff_t i = tx; i < size; i += step) {

@@ -163,6 +168,7 @@ __global__ void THCTensor_kernel_renorm(T *data,
       row[i] = scalar_cast<T>(THCNumerics<AccT>::mul(val, norm));
     }
   }
+#endif
 }
 
 template <typename T>
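The ReduceMax hunk is the "simulate gt with sub" workaround from the commit message: THCNumerics<T>::gt doesn't work in HIP, so the HIP branch tests the sign of a subtraction instead. Note this is a stop-gap, not an exact replacement: the int cast truncates toward zero, so fractional differences smaller than 1 compare as "equal". A minimal host-side sketch (hypothetical helper, plain float in place of the THC numerics wrapper) that shows both the idea and the caveat:

    #include <cstdio>

    // Stop-gap comparison in the style of the hunk above:
    // sign of (a - b) instead of a real greater-than.
    template <typename T>
    bool gt_via_sub(T a, T b) { return static_cast<int>(a - b) > 0; }

    int main() {
        printf("%d\n", gt_via_sub(3.0f, 1.0f));  // 1: the difference 2.0 survives the cast
        printf("%d\n", gt_via_sub(1.5f, 1.0f));  // 0: the difference 0.5 truncates to 0, although a > b
        return 0;
    }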
@@ -170,6 +170,7 @@ __global__ void generate_bernoulli_tensor(curandStateMtgp32 *state, int size,
 
 // NOTE: curand_uniform is (0, 1] and we want [a, b)
 GENERATE_KERNEL2(generate_uniform, float, float a, float b, float, curand_uniform, reverse_bounds(x) * (b-a) + a)
+GENERATE_KERNEL2(generate_uniform, float, double a, double b, float, curand_uniform, reverse_bounds(x) * (b-a) + a)
 GENERATE_KERNEL2(generate_uniform, double, double a, double b, double, curand_uniform_double, reverse_bounds(x) * (b-a) + a)
 
 GENERATE_KERNEL2(generate_normal, float, double mean, double stdv, float, curand_normal, (x * stdv) + mean)
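This is one of the "few more overloads HIP needs": callers pass double bounds even when generating float tensors, and without a (double, double) variant that call has to go through an implicit double-to-float conversion. The commit doesn't spell out the exact failure mode under hcc, but a generic C++ sketch (hypothetical function names) shows why an exact-match overload sidesteps conversion-dependent resolution:

    #include <cstdio>

    // With only this overload, a call with double arguments relies on an
    // implicit narrowing conversion during overload resolution.
    void fill_uniform(float a, float b)   { printf("float overload\n"); }
    // Adding an explicit (double, double) overload makes such calls exact matches.
    void fill_uniform(double a, double b) { printf("double overload\n"); }

    int main() {
        fill_uniform(0.0f, 1.0f);  // exact match: float overload
        fill_uniform(0.0, 1.0);    // exact match: double overload, no conversion needed
        return 0;
    }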
@@ -91,7 +91,7 @@ struct ScalarNegate<half> {
     return __float2half(-__half2float(v));
 #endif
 #else
-#if CUDA_VERSION < 9000
+#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)
     half out = v;
 #else
     __half_raw out = __half_raw(v);

@@ -116,7 +116,7 @@ struct ScalarInv<half> {
 };
 
 inline bool operator==(half a, half b) {
-#if CUDA_VERSION < 9000
+#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)
   return a.x == b.x;
 #else
   __half_raw araw, braw;

@@ -127,7 +127,7 @@ inline bool operator==(half a, half b) {
 }
 
 inline bool operator!=(half a, half b) {
-#if CUDA_VERSION < 9000
+#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)
   return a.x != b.x;
 #else
   __half_raw araw, braw;
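Background for these guards: before CUDA 9, half was a plain struct whose bits were directly reachable as .x; CUDA 9 and HIP both made half opaque, so raw bit access has to go through __half_raw. The comparison is purely bitwise, which is why it works in host code without fp16 hardware. A sketch of the post-CUDA-9 form (illustrative helper name, not the file's):

    #include <cuda_fp16.h>

    // Bitwise equality on the underlying 16 bits via __half_raw.
    // Note the semantics differ from numeric comparison: +0.0 != -0.0 here,
    // and two NaNs with identical bit patterns compare equal.
    inline bool half_bits_equal(half a, half b) {
        __half_raw araw = __half_raw(a);
        __half_raw braw = __half_raw(b);
        return araw.x == braw.x;
    }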
@@ -23,7 +23,7 @@ struct TensorSigmoidOp<half> {
   __device__ __forceinline__ void operator()(half* out, half* in) const {
 #ifdef CUDA_HALF_INSTRUCTIONS
     half one = ScalarConvert<int, half>::to(1);
-    *out = hdiv(one, __hadd(one, hexp(__hneg(*in))));
+    *out = __hdiv(one, __hadd(one, hexp(__hneg(*in))));
 #else
     float fin = ScalarConvert<half, float>::to(*in);
     *out = ScalarConvert<float, half>::to(1.0f / (1.0f + expf(- fin)));

@@ -33,7 +33,7 @@ struct TensorSigmoidOp<half> {
   __device__ __forceinline__ void operator()(half* v) const {
 #ifdef CUDA_HALF_INSTRUCTIONS
     half one = ScalarConvert<int, half>::to(1);
-    *v = hdiv(one, __hadd(one, hexp(__hneg(*v))));
+    *v = __hdiv(one, __hadd(one, hexp(__hneg(*v))));
 #else
     float fv = ScalarConvert<half, float>::to(*v);
     *v = ScalarConvert<float, half>::to(1.0f / (1.0f + expf(- fv)));
@@ -42,6 +42,12 @@ if(BUILD_ATEN)
   list(APPEND Caffe2_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS})
   list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS})
   list(APPEND Caffe2_DEPENDENCY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE})
+
+  IF(USE_ROCM)
+    # Set the HIP Variables
+    set(Caffe2_HIP_SRCS ${ATen_CUDA_SRCS})
+    set(Caffe2_HIP_INCLUDES ${Caffe2_HIP_INCLUDES} ${Caffe2_GPU_INCLUDE})
+  ENDIF(USE_ROCM)
 endif()
 
 # ---[ Caffe2 build
@@ -205,7 +211,7 @@ caffe2_interface_library(caffe2 caffe2_library)
 list(APPEND Caffe2_MAIN_LIBS caffe2_library)
 
 # ---[ CUDA library.
-if(USE_CUDA OR (USE_ROCM AND NOT BUILD_CAFFE2))
+if(USE_CUDA)
   # A hack to deal with cuda library dependencies and modern CMake: the
   # CUDA_ADD_LIBRARY includes a target_link_libraries, and as a result,
   # one cannot use PUBLIC/PRIVATE/INTERFACE for the target anymore. This
@@ -243,31 +249,46 @@ if(USE_CUDA OR (USE_ROCM AND NOT BUILD_CAFFE2))
 endif()
 
 # ---[ Caffe2 HIP sources.
-if(BUILD_CAFFE2)
-  if(USE_ROCM)
-    HIP_ADD_LIBRARY(caffe2_hip ${Caffe2_HIP_SRCS})
-    set_target_properties(caffe2_hip PROPERTIES COMPILE_FLAGS ${Caffe2_HIP_CXX_FLAGS})
-    HIP_INCLUDE_DIRECTORIES(${Caffe2_HIP_INCLUDES})
-
-    target_include_directories(
-        caffe2_hip PUBLIC ${Caffe2_HIP_INCLUDES})
-    target_include_directories(
-        caffe2_hip INTERFACE $<INSTALL_INTERFACE:include>)
-
-    target_link_libraries(caffe2_hip PUBLIC caffe2)
-    target_link_libraries(caffe2_hip PUBLIC ${Caffe2_HIP_DEPENDENCY_LIBS})
-
-    set_target_properties(caffe2_hip PROPERTIES LINKER_LANGUAGE HIP)
-
-    caffe2_interface_library(caffe2_hip caffe2_hip_library)
-    list(APPEND Caffe2_MAIN_LIBS caffe2_hip_library)
-    install(TARGETS caffe2_hip EXPORT Caffe2Targets DESTINATION lib)
-  endif()
+if(USE_ROCM)
+  HIP_INCLUDE_DIRECTORIES(${Caffe2_HIP_INCLUDES})
+
+  IF(BUILD_ATEN)
+    # Set necessary HIPCC Flags
+    SET(HIP_HCC_FLAGS "-fPIC -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${HIP_HCC_FLAGS}")
+    SET(Caffe2_HIP_CXX_FLAGS "-fPIC -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${Caffe2_HIP_CXX_FLAGS}")
+  ENDIF()
+
+  # Since HIP_ADD_LIBRARY is a MACRO, we need to set HIP_HCC_FLAGS prior to calling it.
+  # Also, since HIP_INCLUDE_DIRECTORIES is a MACRO, we must call it before HIP_ADD_LIBRARY.
+  HIP_ADD_LIBRARY(caffe2_hip ${Caffe2_HIP_SRCS})
+
+  set_target_properties(caffe2_hip PROPERTIES COMPILE_FLAGS ${Caffe2_HIP_CXX_FLAGS})
+
+  target_include_directories(
+      caffe2_hip PRIVATE ${Caffe2_HIP_INCLUDES})
+  target_include_directories(
+      caffe2_hip INTERFACE $<INSTALL_INTERFACE:include>)
+
+  IF(BUILD_ATEN)
+    aten_set_target_props(caffe2_hip)
+  ENDIF()
+
+  target_link_libraries(caffe2_hip PUBLIC caffe2)
+  target_link_libraries(caffe2_hip PUBLIC ${Caffe2_HIP_DEPENDENCY_LIBS})
+
+  # https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_faq.md#what-if-hip-generates-error-of-symbol-multiply-defined-only-on-amd-machine
+  # To avoid having to do above, keep this commented.
+  set_target_properties(caffe2_hip PROPERTIES LINKER_LANGUAGE HIP)
+
+  caffe2_interface_library(caffe2_hip caffe2_hip_library)
+  list(APPEND Caffe2_MAIN_LIBS caffe2_hip_library)
+  install(TARGETS caffe2_hip EXPORT Caffe2Targets DESTINATION lib)
 endif()
 
 # ---[ Check if warnings should be errors.
 if ($ENV{WERROR})
   target_compile_options(caffe2 PRIVATE -Werror)
-  if(USE_CUDA OR USE_ROCM)
+  if(USE_CUDA)
     target_compile_options(caffe2_gpu PRIVATE -Werror)
   endif()
 endif()
@@ -315,7 +336,8 @@ if(BUILD_CAFFE2)
   # Aten tests should only run when Caffe2 is not built
   set(__aten_test_dir "test/aten")
 endif()
-if(BUILD_ATEN)
+# Todo - Set up ATen tests for ROCm in an upcoming PR
+if(BUILD_ATEN AND NOT USE_ROCM)
   foreach(test_src ${ATen_CPU_TEST_SRCS})
     get_filename_component(test_name ${test_src} NAME_WE)
     add_executable(${test_name} "${test_src}")
@@ -442,8 +442,8 @@ if(USE_CUDA)
 endif()
 
 # ---[ HIP
-if(BUILD_CAFFE2)
-  include(cmake/public/LoadHIP.cmake)
+if(BUILD_CAFFE2 OR BUILD_ATEN)
+  include(${CMAKE_CURRENT_LIST_DIR}/public/LoadHIP.cmake)
   if(PYTORCH_FOUND_HIP)
     message(INFO "Compiling with HIP for AMD.")
     caffe2_update_option(USE_ROCM ON)

@@ -452,8 +452,11 @@ if(BUILD_CAFFE2)
     set(Caffe2_HIP_INCLUDES
       ${hip_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${Caffe2_HIP_INCLUDES} ${thrust_INCLUDE_DIRS})
     set(Caffe2_HIP_DEPENDENCY_LIBS
-      ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES})
+      ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipblas_LIBRARIES})
+    # Additional libraries required by PyTorch AMD that aren't used by Caffe2 (not in Caffe2's docker image)
+    if(BUILD_ATEN)
+      set(Caffe2_HIP_DEPENDENCY_LIBS ${Caffe2_HIP_DEPENDENCY_LIBS} ${hipsparse_LIBRARIES} ${hiprng_LIBRARIES})
+    endif()
     # TODO: There is a bug in rocblas's cmake files that exports the wrong targets name in ${rocblas_LIBRARIES}
     list(APPEND Caffe2_HIP_DEPENDENCY_LIBS
       roc::rocblas)

@@ -475,7 +478,7 @@ if(USE_ROCM AND NOT BUILD_CAFFE2)
   EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS)
 
   # Link with HIPCC https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_porting_guide.md#linking-with-hipcc
-  SET(CMAKE_CXX_LINK_EXECUTABLE ${HIP_HIPCC_EXECUTABLE})
+  # SET(CMAKE_CXX_LINK_EXECUTABLE ${HIP_HIPCC_EXECUTABLE})
 
   # Show message that we're using ROCm.
   MESSAGE(STATUS "ROCM TRUE:")
@@ -103,11 +103,15 @@ IF(HIP_FOUND)
   set(hiprand_DIR ${HIPRAND_PATH}/lib/cmake/hiprand)
   set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas)
   set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen)
+  set(hipblas_DIR ${HIPBLAS_PATH}/lib/cmake/hipblas)
+  set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse)
 
   find_package(rocrand REQUIRED)
   find_package(hiprand REQUIRED)
   find_package(rocblas REQUIRED)
   find_package(miopen REQUIRED)
+  #find_package(hipblas REQUIRED) There's a bug with the CMake file in the Hipblas package.
+  #find_package(hipsparse REQUIRED)
 
   # TODO: hip_hcc has an interface include flag "-hc" which is only
   # recognizable by hcc, but not gcc and clang. Right now in our

@@ -117,6 +121,10 @@ IF(HIP_FOUND)
   # TODO: miopen_LIBRARIES should return fullpath to the library file,
   # however currently it's just the lib name
   FIND_LIBRARY(PYTORCH_MIOPEN_LIBRARIES ${miopen_LIBRARIES} HINTS ${MIOPEN_PATH}/lib)
+  FIND_LIBRARY(hiprand_LIBRARIES hiprand HINTS ${HIPRAND_PATH}/lib)
+  FIND_LIBRARY(hiprng_LIBRARIES hcrng HINTS ${HIPRNG_PATH}/lib)
+  FIND_LIBRARY(hipblas_LIBRARIES hipblas HINTS ${HIPBLAS_PATH}/lib)
+  FIND_LIBRARY(hipsparse_LIBRARIES hipsparse HINTS ${HIPSPARSE_PATH}/lib)
 
   set(thrust_INCLUDE_DIRS ${THRUST_PATH} ${THRUST_PATH}/thrust/system/cuda/detail/cub-hip)
@@ -34,8 +34,10 @@ if [[ "$image" == *cuda* ]]; then
   DOCKERFILE="${OS}-cuda/Dockerfile"
 fi
 
+# TODO: the version number here actually doesn't do anything at the
+# moment
 if [[ "$image" == *rocm* ]]; then
-  ROCM_VERSION="$(echo "${image}" | perl -n -e'/rocm(\d+\.\d+.\d+)/ && print $1')"
+  ROCM_VERSION="$(echo "${image}" | perl -n -e'/rocm(\d+\.\d+\.\d+|nightly)/ && print $1')"
   DOCKERFILE="${OS}-rocm/Dockerfile"
 fi
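The updated pattern accepts either a dotted version or the literal tag "nightly" (and escapes the middle dot the old pattern left bare). A quick way to sanity-check the capture group outside of Perl — a small C++ sketch with hypothetical image names:

    #include <regex>
    #include <string>
    #include <iostream>

    int main() {
        // Same pattern as the Perl one-liner above.
        std::regex re("rocm(\\d+\\.\\d+\\.\\d+|nightly)");
        for (std::string image : {"py2-clang3.8-rocmnightly-ubuntu16.04", "ubuntu16.04-rocm1.7.1"}) {
            std::smatch m;
            if (std::regex_search(image, m, re))
                std::cout << image << " -> " << m[1] << "\n";  // prints "nightly" or "1.7.1"
        }
        return 0;
    }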
@@ -2,8 +2,25 @@
 
 set -ex
 
+# TODO: This script should install a SPECIFIC ROCM_VERSION, but actually
+# it ignores all values of ROCM_VERSION which are not nightly. Ugh!
+[ -n "$ROCM_VERSION" ]
+
+install_hip_nightly() {
+  git clone https://github.com/ROCm-Developer-Tools/HIP.git
+  pushd HIP
+  export HIP_PLATFORM=hcc
+  yes | ./install.sh --install
+  popd
+  rm -rf HIP
+
+  git clone https://github.com/ROCmSoftwarePlatform/hipBLAS.git
+  pushd hipBLAS
+  yes | ./install.sh --install
+  popd
+  rm -rf hipBLAS
+}
+
 install_ubuntu() {
   apt-get update
   apt-get install -y wget
@@ -36,11 +53,28 @@ install_centos() {
 install_hip_thrust() {
   # Needed for now, will be replaced soon
   git clone --recursive https://github.com/ROCmSoftwarePlatform/Thrust.git /data/Thrust
+  pushd /data/Thrust
+  curl https://patch-diff.githubusercontent.com/raw/ROCmSoftwarePlatform/Thrust/pull/12.patch -o 12.patch
+  patch -p1 < 12.patch
+  rm 12.patch
+  popd
   rm -rf /data/Thrust/thrust/system/cuda/detail/cub-hip
   git clone --recursive https://github.com/ROCmSoftwarePlatform/cub-hip.git /data/Thrust/thrust/system/cuda/detail/cub-hip
   cd /data/Thrust/thrust/system/cuda/detail/cub-hip && git checkout hip_port_1.7.4_caffe2 && cd -
 }
 
+install_hcrng() {
+  mkdir -p /opt/rocm/debians
+  curl https://s3.amazonaws.com/ossci-linux/hcrng-master-a8c6a0b-Linux.deb -o /opt/rocm/debians/hcrng.deb
+  dpkg -i /opt/rocm/debians/hcrng.deb
+}
+
+install_hcsparse() {
+  mkdir -p /opt/rocm/debians
+  curl https://s3.amazonaws.com/ossci-linux/hcsparse-master-907a505-Linux.deb -o /opt/rocm/debians/hcsparse.deb
+  dpkg -i /opt/rocm/debians/hcsparse.deb
+}
+
 # Install Python packages depending on the base OS
 if [ -f /etc/lsb-release ]; then
@@ -52,4 +86,12 @@ else
   exit 1
 fi
 
+# NB: We first install the "wrong" version, but then use those dev tools
+# to install the newer version of HIP.
+if [ "$ROCM_VERSION" = "nightly" ]; then
+  install_hip_nightly
+fi
+
+install_hip_thrust
+install_hcrng
+install_hcsparse
setup.py
@@ -104,6 +104,7 @@ import importlib
 
 from tools.setup_helpers.env import check_env_flag
 from tools.setup_helpers.cuda import WITH_CUDA, CUDA_HOME, CUDA_VERSION
+from tools.setup_helpers.rocm import WITH_ROCM, ROCM_HOME, ROCM_VERSION
 from tools.setup_helpers.cudnn import (WITH_CUDNN, CUDNN_LIBRARY,
                                        CUDNN_LIB_DIR, CUDNN_INCLUDE_DIR)
 from tools.setup_helpers.nccl import WITH_NCCL, WITH_SYSTEM_NCCL, NCCL_LIB_DIR, \

@@ -126,7 +127,6 @@ DEBUG = check_env_flag('DEBUG')
 IS_WINDOWS = (platform.system() == 'Windows')
 IS_DARWIN = (platform.system() == 'Darwin')
 IS_LINUX = (platform.system() == 'Linux')
-WITH_ROCM = check_env_flag('WITH_ROCM')
 
 FULL_CAFFE2 = check_env_flag('FULL_CAFFE2')
 BUILD_PYTORCH = check_env_flag('BUILD_PYTORCH')
@@ -631,8 +631,10 @@ library_dirs.append(lib_path)
 
 # we specify exact lib names to avoid conflict with lua-torch installs
 CAFFE2_LIBS = [os.path.join(lib_path, 'libcaffe2.so')]
-if WITH_CUDA or WITH_ROCM:
+if WITH_CUDA:
     CAFFE2_LIBS.extend(['-Wl,--no-as-needed', os.path.join(lib_path, 'libcaffe2_gpu.so'), '-Wl,--as-needed'])
+if WITH_ROCM:
+    CAFFE2_LIBS.extend(['-Wl,--no-as-needed', os.path.join(lib_path, 'libcaffe2_hip.so'), '-Wl,--as-needed'])
 THD_LIB = os.path.join(lib_path, 'libTHD.a')
 NCCL_LIB = os.path.join(lib_path, 'libnccl.so.1')
 C10D_LIB = os.path.join(lib_path, 'libc10d.a')

@@ -643,14 +645,18 @@ NANOPB_STATIC_LIB = os.path.join(lib_path, 'libprotobuf-nanopb.a')
 
 if IS_DARWIN:
     CAFFE2_LIBS = [os.path.join(lib_path, 'libcaffe2.dylib')]
-    if WITH_CUDA or WITH_ROCM:
+    if WITH_CUDA:
         CAFFE2_LIBS.append(os.path.join(lib_path, 'libcaffe2_gpu.dylib'))
+    if WITH_ROCM:
+        CAFFE2_LIBS.append(os.path.join(lib_path, 'libcaffe2_hip.dylib'))
     NCCL_LIB = os.path.join(lib_path, 'libnccl.1.dylib')
 
 if IS_WINDOWS:
     CAFFE2_LIBS = [os.path.join(lib_path, 'caffe2.lib')]
-    if WITH_CUDA or WITH_ROCM:
+    if WITH_CUDA:
         CAFFE2_LIBS.append(os.path.join(lib_path, 'caffe2_gpu.lib'))
+    if WITH_ROCM:
+        CAFFE2_LIBS.append(os.path.join(lib_path, 'caffe2_hip.lib'))
     if DEBUG:
         NANOPB_STATIC_LIB = os.path.join(lib_path, 'protobuf-nanopbd.lib')
     else:
@@ -2,12 +2,12 @@
 import shutil
 import subprocess
 import os
 import sys
 from shutil import copytree, ignore_patterns
 from functools import reduce
 
 amd_build_dir = os.path.dirname(os.path.realpath(__file__))
 proj_dir = os.path.dirname(os.path.dirname(amd_build_dir))
 out_dir = os.path.join(os.path.dirname(proj_dir), "pytorch_amd")
 include_dirs = [
     "aten",
     "torch"

@@ -16,16 +16,13 @@ include_dirs = [
 # List of operators currently disabled
 yaml_file = os.path.join(amd_build_dir, "disabled_features.yaml")
 
-# Create the pytorch_amd directory
-shutil.copytree(proj_dir, out_dir)
-
 # Apply patch files.
 patch_folder = os.path.join(amd_build_dir, "patches")
 for filename in os.listdir(os.path.join(amd_build_dir, "patches")):
-    subprocess.Popen(["git", "apply", os.path.join(patch_folder, filename)], cwd=out_dir)
+    subprocess.Popen(["git", "apply", os.path.join(patch_folder, filename)], cwd=proj_dir)
 
 # HIPCC Compiler doesn't provide host defines - Automatically include them.
-for root, _, files in os.walk(os.path.join(out_dir, "aten/src/ATen")):
+for root, _, files in os.walk(os.path.join(proj_dir, "aten/src/ATen")):
     for filename in files:
         if filename.endswith(".cu") or filename.endswith(".cuh"):
             filepath = os.path.join(root, filename)

@@ -45,7 +42,7 @@ for root, _, files in os.walk(os.path.join(out_dir, "aten/src/ATen")):
 # Make various replacements inside AMD_BUILD/torch directory
 ignore_files = ["csrc/autograd/profiler.h", "csrc/autograd/profiler.cpp",
                 "csrc/cuda/cuda_check.h", "csrc/jit/fusion_compiler.cpp"]
-for root, _directories, files in os.walk(os.path.join(out_dir, "torch")):
+for root, _directories, files in os.walk(os.path.join(proj_dir, "torch")):
     for filename in files:
         if filename.endswith(".cpp") or filename.endswith(".h"):
             source = os.path.join(root, filename)

@@ -53,7 +50,7 @@ for root, _directories, files in os.walk(os.path.join(out_dir, "torch")):
         if reduce(lambda result, exclude: source.endswith(exclude) or result, ignore_files, False):
             continue
         # Update contents.
-        with open(source, "r+", encoding="utf-8") as f:
+        with open(source, "r+") as f:
             contents = f.read()
             contents = contents.replace("WITH_CUDA", "WITH_ROCM")
             contents = contents.replace("CUDA_VERSION", "0")

@@ -64,9 +61,7 @@ for root, _directories, files in os.walk(os.path.join(out_dir, "torch")):
             os.fsync(f)
 
 # Execute the Hipify Script.
-subprocess.Popen(
-    ["/opt/rocm/bin/hipify-python.py",
-     "--project-directory", proj_dir,
-     "--output-directory", out_dir,
-     "--include-dirs"] + include_dirs +
-    ["--yaml-settings", yaml_file, "--add-static-casts", "True"])
+args = ["--project-directory", proj_dir,
+        "--output-directory", proj_dir,
+        "--include-dirs"] + include_dirs + ["--yaml-settings", yaml_file, "--add-static-casts", "True"]
+os.execv("/opt/rocm/bin/hipify-python.py", ['python'] + args)
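One detail worth flagging in the last hunk: subprocess.Popen returns immediately without waiting for the child, so the old script could finish before hipify did, whereas os.execv replaces the current process with hipify and the caller inherits its exit status. A minimal C++ analogue of that exec semantics (POSIX; paths are the ones from the diff, used purely for illustration):

    #include <unistd.h>
    #include <cstdio>

    int main() {
        // execv replaces the current process image; on success nothing below runs,
        // and whoever launched us observes the new program's exit status.
        char* const argv[] = {(char*)"python", (char*)"/opt/rocm/bin/hipify-python.py", nullptr};
        execv("/opt/rocm/bin/hipify-python.py", argv);
        std::perror("execv");  // reached only if the exec failed
        return 1;
    }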
@@ -1,5 +1,5 @@
 {
-    "disabled_hip_function_calls":
+    "disable_unsupported_hip_calls":
     [
         {
             "path": "aten/src/THC/generic/THCTensorSort.cu",

@@ -72,11 +72,16 @@
                 "#include <nvfunctional>": ""
             }
         },
+        {
+            "path": "aten/src/ATen/native/cuda/Distributions.cu",
+            "s_constants": {
+                "#include <nvfunctional>": ""
+            }
+        },
         {
             "path": "aten/src/THC/THCNumerics.cuh",
             "s_constants": {
                 "#ifdef __CUDA_ARCH__": "#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__)",
                 "labs": "fabs",
                 "#if CUDA_VERSION < 9000": "#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)"
             }
         },

@@ -129,10 +134,16 @@
     "disabled_modules": [
         "aten/src/ATen/native/cuda/CuFFTUtils.h",
         "aten/src/ATen/native/cuda/SpectralOps.cu",
-        "aten/src/ATen/native/cuda/Distributions.cu",
-        "aten/src/THCUNN/RReLU.cu"
+        "aten/src/THCUNN/RReLU.cu",
+        "aten/src/ATen/native/cuda/Distributions.cu"
     ],
     "disabled_functions": [
+        {
+            "path": "aten/src/ATen/cuda/CUDAApplyUtils.cuh",
+            "functions": [
+                "kernelPointwiseApply4"
+            ]
+        },
         {
             "path": "aten/src/ATen/cuda/detail/IndexUtils.cu",
             "non_device_functions": [

@@ -148,7 +159,9 @@
         {
             "path": "aten/src/ATen/native/cuda/Distributions.cu",
             "functions": [
-                "_s_poisson_cuda"
+                "_s_poisson_cuda",
+                "poisson_cuda_kernel",
+                "gamma_cuda_kernel"
             ]
         },
         {
@@ -1,23 +0,0 @@
-diff --git a/aten/src/THCUNN/THCHalfAutoNumerics.cuh b/aten/src/THCUNN/THCHalfAutoNumerics.cuh
-index 2653fed0b..c4e9089e0 100644
---- a/aten/src/THCUNN/THCHalfAutoNumerics.cuh
-+++ b/aten/src/THCUNN/THCHalfAutoNumerics.cuh
-@@ -19,13 +19,17 @@ inline __host__ __device__ float fmaxType(float x, half y) {
- }
- #endif
- 
-+/* In ROCm we have a conversion from half to __fp16, and then there's a
-+conversion operator from __fp16 to double (w/ the standard conversion
-+double to float), so comment out these two lines to prevent ambiguous calls
-+for fmaxType when half is passed in.
- inline __host__ __device__ float fmaxType(float x, float y) {
-   return fmaxf(x, y);
- }
- 
- inline __host__ __device__ double fmaxType(double x, double y) {
-   return fmax(x, y);
--}
-+}*/
- 
- #ifdef CUDA_HALF_TENSOR
@@ -1,39 +0,0 @@
-diff --git a/aten/src/THC/THCTensorMathReduce.cuh b/aten/src/THC/THCTensorMathReduce.cuh
-index ca6bf7cbe..a523648f1 100644
---- a/aten/src/THC/THCTensorMathReduce.cuh
-+++ b/aten/src/THC/THCTensorMathReduce.cuh
-@@ -105,7 +105,7 @@ struct SquareFunctor<ResT, half> {
- template <typename T>
- struct ReduceMin {
-   inline __device__ T operator()(T a, T b) const {
--    return (THCNumerics<T>::lt(a, b) ||
-+    return ((int)THCNumerics<T>::sub(a, b) < 0 ||
-             THCNumerics<T>::isnan(a)) ? a : b;
-   }
- };
-@@ -113,7 +113,7 @@ struct ReduceMin {
- template <typename T>
- struct ReduceMax {
-   inline __device__ T operator()(T a, T b) const {
--    return (THCNumerics<T>::gt(a, b) ||
-+    return ((int)THCNumerics<T>::sub(a, b) > 0 ||
-             THCNumerics<T>::isnan(a)) ? a : b;
-   }
- };
-@@ -167,7 +167,7 @@ __global__ void THCTensor_kernel_renorm(Real *data, const Real value, const ptrd
- 
-   buffer[tx] = ScalarConvert<int, Real>::to(0);
-   Real norm;
--
-+  #if !defined(__HIP_DEVICE_COMPILE__)
-   if (THCNumerics<Real>::eq(value, ScalarConvert<float, Real>::to(INFINITY))) {
-     // get norm of axis
-     for (ptrdiff_t i=tx; i<size; i+=step)
-@@ -225,6 +225,7 @@ __global__ void THCTensor_kernel_renorm(Real *data, const Real value, const ptrd
-     row[i] = THCNumerics<Real>::mul(row[i], norm);
-   }
-   }
-+  #endif
- }
- 
- template <typename T>
@@ -1,5 +1,5 @@
 diff --git a/aten/src/THC/generic/THCTensorRandom.cu b/aten/src/THC/generic/THCTensorRandom.cu
-index 906780b4f..c99f156ab 100644
+index 906780b4f..b03e051cb 100644
 --- a/aten/src/THC/generic/THCTensorRandom.cu
 +++ b/aten/src/THC/generic/THCTensorRandom.cu
 @@ -504,11 +504,11 @@ THC_API void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_

@@ -7,7 +7,7 @@ index 906780b4f..c99f156ab 100644
    if (range > 1ULL << 32) {
      generate_random_64<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
  -      gen->state.gen_states, size, data, min_val, range);
-+      gen->state.gen_states, static_cast<int>(size), data, static_cast<std::conditional<std::is_same<real,float>::value, float, double>::type>(min_val), static_cast<std::conditional<std::is_same<real,float>::value, float, double>::type>(range));
++      gen->state.gen_states, static_cast<int>(size), data, min_val, range);
    } else {
  #endif
      generate_random<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(

@@ -17,7 +17,7 @@ index 906780b4f..c99f156ab 100644
    }
  #endif
 @@ -534,19 +534,19 @@ THC_API void THCTensor_(random)(THCState* state, THCTensor *self_)
 
 
  #if defined(THC_REAL_IS_HALF)
      generate_random<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
 -      gen->state.gen_states, size, data, 0UL, (1UL << HLF_MANT_DIG) + 1);

@@ -39,5 +39,5 @@ index 906780b4f..c99f156ab 100644
 -      gen->state.gen_states, size, data, 0UL, static_cast<uint32_t>(std::numeric_limits<real>::max()) + 1);
 +      gen->state.gen_states, static_cast<int>(size), data, static_cast<int32_t>(0UL), static_cast<uint32_t>(std::numeric_limits<real>::max()) + 1);
  #endif
 
 
  THCTensor_(freeCopyTo)(state, self, self_);
tools/setup_helpers/rocm.py (new file)

@@ -0,0 +1,5 @@
+from .env import check_env_flag
+# Check if ROCM is enabled
+WITH_ROCM = check_env_flag('WITH_ROCM')
+ROCM_HOME = "/opt/rocm"
+ROCM_VERSION = ""