Get ROCm building again on master (#8343)
Billing of changes:
- New Jenkins script for building on ROCm. For now it is a bit hacked together, but we can improve it once CI is running
- New ROCm docker image for nightly HIP, and also some legacy packages that we need temporarily
- New enabled config py2-clang3.8-rocmnightly-ubuntu16.04-build, based off of the existing Caffe2 image (not built yet)
- A big pile of cmake fixes, mostly to turn bits on/off when a ROCm build is involved
- Switch from hiprng to hcrng
- Apply some patches directly in code, eliminating the patch files
- Use __hdiv instead of hdiv; it's more portable
- THCNumerics<T>::gt doesn't work in HIP, so simulate it with sub
- Add a few more overloads HIP needs
- Turn off use of hcc to link (we plan to turn this back on to get tests running)
- Search for hiprand, hiprng, hipblas, hipsparse
- Better Python 2 portability
This commit is contained in:
parent
49d6c5f99f
commit
81b92f7515
22 changed files with 187 additions and 132 deletions
@@ -23,6 +23,21 @@ gcc --version
 # TODO: Don't run this...
 pip install -r requirements.txt || true
+
+if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
+  export HCC_AMDGPU_TARGET=gfx900
+
+  # TODO: Install pyHIPIFY in the docker image
+  rm -rf pyHIPIFY || true
+  git clone https://github.com/ROCm-Developer-Tools/pyHIPIFY.git
+  chmod a+x pyHIPIFY/*.py
+  sudo cp -p pyHIPIFY/*.py /opt/rocm/bin
+  rm -rf "$(dirname "${BASH_SOURCE[0]}")/../../../pytorch_amd/" || true
+  python "$(dirname "${BASH_SOURCE[0]}")/../../tools/amd_build/build_pytorch_amd.py"
+  HIPCC_VERBOSE=1 VERBOSE=1 WITH_ROCM=1 python setup.py install
+  exit
+fi
 
 # TODO: Don't install this here
 if ! which conda; then
   pip install mkl mkl-devel
 fi
@@ -36,3 +36,4 @@ pytorch-macos-10.13-cuda9.2-cudnn7-py3-build
 pytorch-docker-build-test
 short-perf-test-cpu
 short-perf-test-gpu
+py2-clang3.8-rocmnightly-ubuntu16.04-build
@@ -203,7 +203,7 @@ if(NOT MSVC)
   set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE)
 endif()
 
-IF(USE_CUDA)
+IF(USE_CUDA AND NOT USE_ROCM)
   IF ($ENV{ATEN_STATIC_CUDA})
     # CuFFT has a complicated static story (especially around CUDA < 9) because it has device callback support
     # we first have to build a fake lib that links with no device callbacks,

@@ -250,6 +250,10 @@ IF(USE_CUDA)
     ${CUDA_LIBRARIES}
     ${CUDA_cusparse_LIBRARY}
     ${CUDA_curand_LIBRARY})
+
+  # Set necessary HIPCC/CXX Flags
+  SET(HIP_HCC_FLAGS "-DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${HIP_HCC_FLAGS}")
+  SET(CMAKE_CXX_FLAGS "-DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${CMAKE_CXX_FLAGS}")
 ENDIF()
 
 if(CUDNN_FOUND)

@@ -271,7 +275,7 @@ ENDIF()
 IF(USE_ROCM)
   ### Link in the ROCm libraries BLAS / RNG.
   FIND_LIBRARY(HIPBLAS_LIBRARY hipblas HINTS ${HIPBLAS_PATH}/lib)
-  FIND_LIBRARY(HIPRNG_LIBRARY hiprng HINTS ${HIPRNG_PATH}/lib)
+  FIND_LIBRARY(HIPRNG_LIBRARY hcrng HINTS ${HIPRNG_PATH}/lib)
 
   list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${HIPBLAS_LIBRARY} ${HIPRNG_LIBRARY})
 ENDIF()
@@ -103,7 +103,7 @@ static inline __device__ void atomicAdd(half *address, half val) {
 
   do {
     assumed = old;
-#if CUDA_VERSION < 9000
+#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)
     half hsum;
     hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
     hsum = THCNumerics<half>::add(hsum, val);

@@ -135,7 +135,7 @@ static inline __device__ void atomicAdd(double *address, double val) {
   // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
   } while (assumed != old);
 }
-#elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000)
+#elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000) || defined(__HIP_PLATFORM_HCC__)
 // This needs to be defined for the host side pass
 static inline __device__ void atomicAdd(double *address, double val) { }
 #endif
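For readers unfamiliar with the compare-and-swap loop these hunks guard: older architectures have no native atomicAdd for these types, so it is emulated by reinterpreting the value's bits as an integer and retrying until no other thread raced the update. A minimal sketch of the well-known double-precision form of the idiom (illustrative, not the exact file contents):

    #include <cuda_runtime.h>

    // Emulate atomicAdd(double*) with a 64-bit CAS loop (the standard pre-sm_60 idiom).
    __device__ double atomic_add_double(double* address, double val) {
        unsigned long long* address_as_ull = (unsigned long long*)address;
        unsigned long long old = *address_as_ull, assumed;
        do {
            assumed = old;
            // Reinterpret the bits, add in double, reinterpret back, then CAS.
            old = atomicCAS(address_as_ull, assumed,
                            __double_as_longlong(val + __longlong_as_double(assumed)));
            // Integer comparison avoids a hang when the stored value is NaN (NaN != NaN).
        } while (assumed != old);
        return __longlong_as_double(old);
    }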
@@ -4,22 +4,16 @@
 #include "THCGeneral.h"
 
 /* We compile with CudaHalfTensor support if we have this: */
-#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16
+#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16 || defined(__HIP_PLATFORM_HCC__)
 #define CUDA_HALF_TENSOR 1
 #endif
 
-/* For HIP, rely on the half instructions as well.*/
-#if defined(__HIP_PLATFORM_HCC__)
-#define CUDA_HALF_TENSOR 1
-#define CUDA_HALF_INSTRUCTIONS 1
-#endif
-
 #ifdef CUDA_HALF_TENSOR
 
 #include <cuda_fp16.h>
 #include <stdint.h>
 
-#if CUDA_VERSION >= 9000
+#if CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__)
 #ifndef __cplusplus
 typedef __half_raw half;
 #endif
@@ -37,7 +37,7 @@ struct TensorSigmoidOp<half> {
   __device__ __forceinline__ void operator()(half* out, half* in) const {
 #ifdef CUDA_HALF_INSTRUCTIONS
     half one = ScalarConvert<int, half>::to(1);
-    *out = hdiv(one, __hadd(one, hexp(__hneg(*in))));
+    *out = __hdiv(one, __hadd(one, hexp(__hneg(*in))));
 #else
     float fin = __half2float(*in);
     *out = __float2half(1.0f / (1.0f + expf(- fin)));

@@ -47,7 +47,7 @@ struct TensorSigmoidOp<half> {
   __device__ __forceinline__ void operator()(half* v) const {
 #ifdef CUDA_HALF_INSTRUCTIONS
     half one = ScalarConvert<int, half>::to(1);
-    *v = hdiv(one, __hadd(one, hexp(__hneg(*v))));
+    *v = __hdiv(one, __hadd(one, hexp(__hneg(*v))));
 #else
     float fv = __half2float(*v);
     *v = __float2half(1.0f / (1.0f + expf(- fv)));
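Both paths above compute sigmoid(x) = 1 / (1 + exp(-x)): the first stays in fp16 using the half intrinsics (per the commit message, the double-underscore spelling __hdiv is the portable one; plain hdiv is not), the second round-trips through float. A self-contained sketch of the same two-path structure, assuming a device with native fp16 arithmetic (sm_53+) for the fast path:

    #include <cuda_fp16.h>
    #include <math.h>

    __device__ __half sigmoid_half(__half x) {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
        // Native half path: all arithmetic stays in fp16.
        const __half one = __float2half(1.0f);
        return __hdiv(one, __hadd(one, hexp(__hneg(x))));
    #else
        // Fallback: promote to float, compute, and convert back.
        float fx = __half2float(x);
        return __float2half(1.0f / (1.0f + expf(-fx)));
    #endif
    }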
@@ -80,7 +80,11 @@ struct ReduceMin {
 template <typename T>
 struct ReduceMax {
   inline __device__ T operator()(T a, T b) const {
+#if defined(__HIP_PLATFORM_HCC__)
+    return (static_cast<int>(THCNumerics<T>::sub(a, b)) > 0 || THCNumerics<T>::isnan(a)) ? a : b;
+#else
     return (THCNumerics<T>::gt(a, b) || THCNumerics<T>::isnan(a)) ? a : b;
+#endif
   }
 };

@@ -117,6 +121,7 @@ __global__ void THCTensor_kernel_renorm(T *data,
   buffer[tx] = scalar_cast<AccT>(0);
   AccT norm;
 
+#if !defined(__HIP_DEVICE_COMPILE__)
   if (THCNumerics<AccT>::eq(value, scalar_cast<AccT, float>(INFINITY))) {
     // get norm of axis
     for (ptrdiff_t i = tx; i < size; i += step) {

@@ -163,6 +168,7 @@ __global__ void THCTensor_kernel_renorm(T *data,
       row[i] = scalar_cast<T>(THCNumerics<AccT>::mul(val, norm));
     }
   }
+#endif
 }
 
 template <typename T>
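The ReduceMax hunk is the "simulate gt with sub" workaround from the commit message: THCNumerics<T>::gt doesn't work in HIP, so the HIP branch tests the sign of a subtraction instead. Note this is a stop-gap, not an exact replacement: the int cast truncates toward zero, so fractional differences smaller than 1 compare as "equal". A minimal host-side sketch (hypothetical helper, plain float in place of the THC numerics wrapper) that shows both the idea and the caveat:

    #include <cstdio>

    // Stop-gap comparison in the style of the hunk above:
    // sign of (a - b) instead of a real greater-than.
    template <typename T>
    bool gt_via_sub(T a, T b) { return static_cast<int>(a - b) > 0; }

    int main() {
        printf("%d\n", gt_via_sub(3.0f, 1.0f));  // 1: the difference 2.0 survives the cast
        printf("%d\n", gt_via_sub(1.5f, 1.0f));  // 0: the difference 0.5 truncates to 0, although a > b
        return 0;
    }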
@@ -170,6 +170,7 @@ __global__ void generate_bernoulli_tensor(curandStateMtgp32 *state, int size,
 
 // NOTE: curand_uniform is (0, 1] and we want [a, b)
 GENERATE_KERNEL2(generate_uniform, float, float a, float b, float, curand_uniform, reverse_bounds(x) * (b-a) + a)
+GENERATE_KERNEL2(generate_uniform, float, double a, double b, float, curand_uniform, reverse_bounds(x) * (b-a) + a)
 GENERATE_KERNEL2(generate_uniform, double, double a, double b, double, curand_uniform_double, reverse_bounds(x) * (b-a) + a)
 
 GENERATE_KERNEL2(generate_normal, float, double mean, double stdv, float, curand_normal, (x * stdv) + mean)
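This is one of the "few more overloads HIP needs": callers pass double bounds even when generating float tensors, and without a (double, double) variant that call has to go through an implicit double-to-float conversion. The commit doesn't spell out the exact failure mode under hcc, but a generic C++ sketch (hypothetical function names) shows why an exact-match overload sidesteps conversion-dependent resolution:

    #include <cstdio>

    // With only this overload, a call with double arguments relies on an
    // implicit narrowing conversion during overload resolution.
    void fill_uniform(float a, float b)   { printf("float overload\n"); }
    // Adding an explicit (double, double) overload makes such calls exact matches.
    void fill_uniform(double a, double b) { printf("double overload\n"); }

    int main() {
        fill_uniform(0.0f, 1.0f);  // exact match: float overload
        fill_uniform(0.0, 1.0);    // exact match: double overload, no conversion needed
        return 0;
    }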
@@ -91,7 +91,7 @@ struct ScalarNegate<half> {
     return __float2half(-__half2float(v));
 #endif
 #else
-#if CUDA_VERSION < 9000
+#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)
     half out = v;
 #else
     __half_raw out = __half_raw(v);

@@ -116,7 +116,7 @@ struct ScalarInv<half> {
 };
 
 inline bool operator==(half a, half b) {
-#if CUDA_VERSION < 9000
+#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)
   return a.x == b.x;
 #else
   __half_raw araw, braw;

@@ -127,7 +127,7 @@ inline bool operator==(half a, half b) {
 }
 
 inline bool operator!=(half a, half b) {
-#if CUDA_VERSION < 9000
+#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)
   return a.x != b.x;
 #else
   __half_raw araw, braw;
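Background for these guards: before CUDA 9, half was a plain struct whose bits were directly reachable as .x; CUDA 9 and HIP both made half opaque, so raw bit access has to go through __half_raw. The comparison is purely bitwise, which is why it works in host code without fp16 hardware. A sketch of the post-CUDA-9 form (illustrative helper name, not the file's):

    #include <cuda_fp16.h>

    // Bitwise equality on the underlying 16 bits via __half_raw.
    // Note the semantics differ from numeric comparison: +0.0 != -0.0 here,
    // and two NaNs with identical bit patterns compare equal.
    inline bool half_bits_equal(half a, half b) {
        __half_raw araw = __half_raw(a);
        __half_raw braw = __half_raw(b);
        return araw.x == braw.x;
    }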
@@ -23,7 +23,7 @@ struct TensorSigmoidOp<half> {
   __device__ __forceinline__ void operator()(half* out, half* in) const {
 #ifdef CUDA_HALF_INSTRUCTIONS
     half one = ScalarConvert<int, half>::to(1);
-    *out = hdiv(one, __hadd(one, hexp(__hneg(*in))));
+    *out = __hdiv(one, __hadd(one, hexp(__hneg(*in))));
 #else
     float fin = ScalarConvert<half, float>::to(*in);
     *out = ScalarConvert<float, half>::to(1.0f / (1.0f + expf(- fin)));

@@ -33,7 +33,7 @@ struct TensorSigmoidOp<half> {
   __device__ __forceinline__ void operator()(half* v) const {
 #ifdef CUDA_HALF_INSTRUCTIONS
     half one = ScalarConvert<int, half>::to(1);
-    *v = hdiv(one, __hadd(one, hexp(__hneg(*v))));
+    *v = __hdiv(one, __hadd(one, hexp(__hneg(*v))));
 #else
     float fv = ScalarConvert<half, float>::to(*v);
     *v = ScalarConvert<float, half>::to(1.0f / (1.0f + expf(- fv)));
@@ -42,6 +42,12 @@ if(BUILD_ATEN)
   list(APPEND Caffe2_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS})
   list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS})
   list(APPEND Caffe2_DEPENDENCY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE})
+
+  IF(USE_ROCM)
+    # Set the HIP Variables
+    set(Caffe2_HIP_SRCS ${ATen_CUDA_SRCS})
+    set(Caffe2_HIP_INCLUDES ${Caffe2_HIP_INCLUDES} ${Caffe2_GPU_INCLUDE})
+  ENDIF(USE_ROCM)
 endif()
 
 # ---[ Caffe2 build
@@ -205,7 +211,7 @@ caffe2_interface_library(caffe2 caffe2_library)
 list(APPEND Caffe2_MAIN_LIBS caffe2_library)
 
 # ---[ CUDA library.
-if(USE_CUDA OR (USE_ROCM AND NOT BUILD_CAFFE2))
+if(USE_CUDA)
   # A hack to deal with cuda library dependencies and modern CMake: the
   # CUDA_ADD_LIBRARY includes a target_link_libraries, and as a result,
   # one cannot use PUBLIC/PRIVATE/INTERFACE for the target anymore. This
@@ -243,31 +249,46 @@ if(USE_CUDA OR (USE_ROCM AND NOT BUILD_CAFFE2))
 endif()
 
 # ---[ Caffe2 HIP sources.
-if(BUILD_CAFFE2)
-  if(USE_ROCM)
-    HIP_ADD_LIBRARY(caffe2_hip ${Caffe2_HIP_SRCS})
-    set_target_properties(caffe2_hip PROPERTIES COMPILE_FLAGS ${Caffe2_HIP_CXX_FLAGS})
-    HIP_INCLUDE_DIRECTORIES(${Caffe2_HIP_INCLUDES})
-
-    target_include_directories(
-        caffe2_hip PUBLIC ${Caffe2_HIP_INCLUDES})
-    target_include_directories(
-        caffe2_hip INTERFACE $<INSTALL_INTERFACE:include>)
-
-    target_link_libraries(caffe2_hip PUBLIC caffe2)
-    target_link_libraries(caffe2_hip PUBLIC ${Caffe2_HIP_DEPENDENCY_LIBS})
-
-    set_target_properties(caffe2_hip PROPERTIES LINKER_LANGUAGE HIP)
-
-    caffe2_interface_library(caffe2_hip caffe2_hip_library)
-    list(APPEND Caffe2_MAIN_LIBS caffe2_hip_library)
-    install(TARGETS caffe2_hip EXPORT Caffe2Targets DESTINATION lib)
-  endif()
+if(USE_ROCM)
+  HIP_INCLUDE_DIRECTORIES(${Caffe2_HIP_INCLUDES})
+
+  IF(BUILD_ATEN)
+    # Set necessary HIPCC Flags
+    SET(HIP_HCC_FLAGS "-fPIC -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${HIP_HCC_FLAGS}")
+    SET(Caffe2_HIP_CXX_FLAGS "-fPIC -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${Caffe2_HIP_CXX_FLAGS}")
+  ENDIF()
+
+  # Since HIP_ADD_LIBRARY is a MACRO, we need to set HIP_HCC_FLAGS prior to calling it.
+  # Also, since HIP_INCLUDE_DIRECTORIES is a MACRO, we must call it before HIP_ADD_LIBRARY.
+  HIP_ADD_LIBRARY(caffe2_hip ${Caffe2_HIP_SRCS})
+
+  set_target_properties(caffe2_hip PROPERTIES COMPILE_FLAGS ${Caffe2_HIP_CXX_FLAGS})
+
+  target_include_directories(
+      caffe2_hip PRIVATE ${Caffe2_HIP_INCLUDES})
+  target_include_directories(
+      caffe2_hip INTERFACE $<INSTALL_INTERFACE:include>)
+
+  IF(BUILD_ATEN)
+    aten_set_target_props(caffe2_hip)
+  ENDIF()
+
+  target_link_libraries(caffe2_hip PUBLIC caffe2)
+  target_link_libraries(caffe2_hip PUBLIC ${Caffe2_HIP_DEPENDENCY_LIBS})
+
+  # https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_faq.md#what-if-hip-generates-error-of-symbol-multiply-defined-only-on-amd-machine
+  # To avoid having to do above, keep this commented.
+  set_target_properties(caffe2_hip PROPERTIES LINKER_LANGUAGE HIP)
+
+  caffe2_interface_library(caffe2_hip caffe2_hip_library)
+  list(APPEND Caffe2_MAIN_LIBS caffe2_hip_library)
+  install(TARGETS caffe2_hip EXPORT Caffe2Targets DESTINATION lib)
 endif()
 
 # ---[ Check if warnings should be errors.
 if ($ENV{WERROR})
   target_compile_options(caffe2 PRIVATE -Werror)
-  if(USE_CUDA OR USE_ROCM)
+  if(USE_CUDA)
     target_compile_options(caffe2_gpu PRIVATE -Werror)
   endif()
 endif()
@@ -315,7 +336,8 @@ if(BUILD_CAFFE2)
   # Aten tests should only run when Caffe2 is not built
   set(__aten_test_dir "test/aten")
 endif()
-if(BUILD_ATEN)
+# Todo - Set up ATen tests for ROCm in an upcoming PR
+if(BUILD_ATEN AND NOT USE_ROCM)
   foreach(test_src ${ATen_CPU_TEST_SRCS})
     get_filename_component(test_name ${test_src} NAME_WE)
     add_executable(${test_name} "${test_src}")
@@ -442,8 +442,8 @@ if(USE_CUDA)
 endif()
 
 # ---[ HIP
-if(BUILD_CAFFE2)
-  include(cmake/public/LoadHIP.cmake)
+if(BUILD_CAFFE2 OR BUILD_ATEN)
+  include(${CMAKE_CURRENT_LIST_DIR}/public/LoadHIP.cmake)
   if(PYTORCH_FOUND_HIP)
     message(INFO "Compiling with HIP for AMD.")
     caffe2_update_option(USE_ROCM ON)

@@ -452,8 +452,11 @@ if(BUILD_CAFFE2)
     set(Caffe2_HIP_INCLUDES
       ${hip_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${Caffe2_HIP_INCLUDES} ${thrust_INCLUDE_DIRS})
     set(Caffe2_HIP_DEPENDENCY_LIBS
-      ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES})
+      ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipblas_LIBRARIES})
+    # Additional libraries required by PyTorch AMD that aren't used by Caffe2 (not in Caffe2's docker image)
+    if(BUILD_ATEN)
+      set(Caffe2_HIP_DEPENDENCY_LIBS ${Caffe2_HIP_DEPENDENCY_LIBS} ${hipsparse_LIBRARIES} ${hiprng_LIBRARIES})
+    endif()
     # TODO: There is a bug in rocblas's cmake files that exports the wrong targets name in ${rocblas_LIBRARIES}
     list(APPEND Caffe2_HIP_DEPENDENCY_LIBS
       roc::rocblas)

@@ -475,7 +478,7 @@ if(USE_ROCM AND NOT BUILD_CAFFE2)
   EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS)
 
   # Link with HIPCC https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_porting_guide.md#linking-with-hipcc
-  SET(CMAKE_CXX_LINK_EXECUTABLE ${HIP_HIPCC_EXECUTABLE})
+  # SET(CMAKE_CXX_LINK_EXECUTABLE ${HIP_HIPCC_EXECUTABLE})
 
   # Show message that we're using ROCm.
   MESSAGE(STATUS "ROCM TRUE:")
@@ -103,11 +103,15 @@ IF(HIP_FOUND)
   set(hiprand_DIR ${HIPRAND_PATH}/lib/cmake/hiprand)
   set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas)
   set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen)
+  set(hipblas_DIR ${HIPBLAS_PATH}/lib/cmake/hipblas)
+  set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse)
 
   find_package(rocrand REQUIRED)
   find_package(hiprand REQUIRED)
   find_package(rocblas REQUIRED)
   find_package(miopen REQUIRED)
+  #find_package(hipblas REQUIRED) There's a bug with the CMake file in the Hipblas package.
+  #find_package(hipsparse REQUIRED)
 
   # TODO: hip_hcc has an interface include flag "-hc" which is only
   # recognizable by hcc, but not gcc and clang. Right now in our

@@ -117,6 +121,10 @@ IF(HIP_FOUND)
   # TODO: miopen_LIBRARIES should return fullpath to the library file,
   # however currently it's just the lib name
   FIND_LIBRARY(PYTORCH_MIOPEN_LIBRARIES ${miopen_LIBRARIES} HINTS ${MIOPEN_PATH}/lib)
+  FIND_LIBRARY(hiprand_LIBRARIES hiprand HINTS ${HIPRAND_PATH}/lib)
+  FIND_LIBRARY(hiprng_LIBRARIES hcrng HINTS ${HIPRNG_PATH}/lib)
+  FIND_LIBRARY(hipblas_LIBRARIES hipblas HINTS ${HIPBLAS_PATH}/lib)
+  FIND_LIBRARY(hipsparse_LIBRARIES hipsparse HINTS ${HIPSPARSE_PATH}/lib)
 
   set(thrust_INCLUDE_DIRS ${THRUST_PATH} ${THRUST_PATH}/thrust/system/cuda/detail/cub-hip)
@@ -34,8 +34,10 @@ if [[ "$image" == *cuda* ]]; then
   DOCKERFILE="${OS}-cuda/Dockerfile"
 fi
 
+# TODO: the version number here actually doesn't do anything at the
+# moment
 if [[ "$image" == *rocm* ]]; then
-  ROCM_VERSION="$(echo "${image}" | perl -n -e'/rocm(\d+\.\d+.\d+)/ && print $1')"
+  ROCM_VERSION="$(echo "${image}" | perl -n -e'/rocm(\d+\.\d+\.\d+|nightly)/ && print $1')"
   DOCKERFILE="${OS}-rocm/Dockerfile"
 fi
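The updated pattern accepts either a dotted version or the literal tag "nightly" (and escapes the middle dot the old pattern left bare). A quick way to sanity-check the capture group outside of Perl — a small C++ sketch with hypothetical image names:

    #include <regex>
    #include <string>
    #include <iostream>

    int main() {
        // Same pattern as the Perl one-liner above.
        std::regex re("rocm(\\d+\\.\\d+\\.\\d+|nightly)");
        for (std::string image : {"py2-clang3.8-rocmnightly-ubuntu16.04", "ubuntu16.04-rocm1.7.1"}) {
            std::smatch m;
            if (std::regex_search(image, m, re))
                std::cout << image << " -> " << m[1] << "\n";  // prints "nightly" or "1.7.1"
        }
        return 0;
    }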
@@ -2,8 +2,25 @@
 
 set -ex
 
+# TODO: This script should install a SPECIFIC ROCM_VERSION, but actually
+# it ignores all values of ROCM_VERSION which are not nightly. Ugh!
+[ -n "$ROCM_VERSION" ]
+
+install_hip_nightly() {
+  git clone https://github.com/ROCm-Developer-Tools/HIP.git
+  pushd HIP
+  export HIP_PLATFORM=hcc
+  yes | ./install.sh --install
+  popd
+  rm -rf HIP
+
+  git clone https://github.com/ROCmSoftwarePlatform/hipBLAS.git
+  pushd hipBLAS
+  yes | ./install.sh --install
+  popd
+  rm -rf hipBLAS
+}
+
 install_ubuntu() {
   apt-get update
   apt-get install -y wget
@@ -36,11 +53,28 @@ install_centos() {
 install_hip_thrust() {
   # Needed for now, will be replaced soon
   git clone --recursive https://github.com/ROCmSoftwarePlatform/Thrust.git /data/Thrust
+  pushd /data/Thrust
+  curl https://patch-diff.githubusercontent.com/raw/ROCmSoftwarePlatform/Thrust/pull/12.patch -o 12.patch
+  patch -p1 < 12.patch
+  rm 12.patch
+  popd
   rm -rf /data/Thrust/thrust/system/cuda/detail/cub-hip
   git clone --recursive https://github.com/ROCmSoftwarePlatform/cub-hip.git /data/Thrust/thrust/system/cuda/detail/cub-hip
   cd /data/Thrust/thrust/system/cuda/detail/cub-hip && git checkout hip_port_1.7.4_caffe2 && cd -
 }
 
+install_hcrng() {
+  mkdir -p /opt/rocm/debians
+  curl https://s3.amazonaws.com/ossci-linux/hcrng-master-a8c6a0b-Linux.deb -o /opt/rocm/debians/hcrng.deb
+  dpkg -i /opt/rocm/debians/hcrng.deb
+}
+
+install_hcsparse() {
+  mkdir -p /opt/rocm/debians
+  curl https://s3.amazonaws.com/ossci-linux/hcsparse-master-907a505-Linux.deb -o /opt/rocm/debians/hcsparse.deb
+  dpkg -i /opt/rocm/debians/hcsparse.deb
+}
+
 # Install Python packages depending on the base OS
 if [ -f /etc/lsb-release ]; then
@@ -52,4 +86,12 @@ else
   exit 1
 fi
 
+# NB: We first install the "wrong" version, but then use those dev tools
+# to install the newer version of HIP.
+if [ "$ROCM_VERSION" = "nightly" ]; then
+  install_hip_nightly
+fi
+
+install_hip_thrust
+install_hcrng
+install_hcsparse
setup.py
@@ -104,6 +104,7 @@ import importlib
 
 from tools.setup_helpers.env import check_env_flag
 from tools.setup_helpers.cuda import WITH_CUDA, CUDA_HOME, CUDA_VERSION
+from tools.setup_helpers.rocm import WITH_ROCM, ROCM_HOME, ROCM_VERSION
 from tools.setup_helpers.cudnn import (WITH_CUDNN, CUDNN_LIBRARY,
                                        CUDNN_LIB_DIR, CUDNN_INCLUDE_DIR)
 from tools.setup_helpers.nccl import WITH_NCCL, WITH_SYSTEM_NCCL, NCCL_LIB_DIR, \

@@ -126,7 +127,6 @@ DEBUG = check_env_flag('DEBUG')
 IS_WINDOWS = (platform.system() == 'Windows')
 IS_DARWIN = (platform.system() == 'Darwin')
 IS_LINUX = (platform.system() == 'Linux')
-WITH_ROCM = check_env_flag('WITH_ROCM')
 
 FULL_CAFFE2 = check_env_flag('FULL_CAFFE2')
 BUILD_PYTORCH = check_env_flag('BUILD_PYTORCH')
@@ -631,8 +631,10 @@ library_dirs.append(lib_path)
 
 # we specify exact lib names to avoid conflict with lua-torch installs
 CAFFE2_LIBS = [os.path.join(lib_path, 'libcaffe2.so')]
-if WITH_CUDA or WITH_ROCM:
+if WITH_CUDA:
     CAFFE2_LIBS.extend(['-Wl,--no-as-needed', os.path.join(lib_path, 'libcaffe2_gpu.so'), '-Wl,--as-needed'])
+if WITH_ROCM:
+    CAFFE2_LIBS.extend(['-Wl,--no-as-needed', os.path.join(lib_path, 'libcaffe2_hip.so'), '-Wl,--as-needed'])
 THD_LIB = os.path.join(lib_path, 'libTHD.a')
 NCCL_LIB = os.path.join(lib_path, 'libnccl.so.1')
 C10D_LIB = os.path.join(lib_path, 'libc10d.a')

@@ -643,14 +645,18 @@ NANOPB_STATIC_LIB = os.path.join(lib_path, 'libprotobuf-nanopb.a')
 
 if IS_DARWIN:
     CAFFE2_LIBS = [os.path.join(lib_path, 'libcaffe2.dylib')]
-    if WITH_CUDA or WITH_ROCM:
+    if WITH_CUDA:
         CAFFE2_LIBS.append(os.path.join(lib_path, 'libcaffe2_gpu.dylib'))
+    if WITH_ROCM:
+        CAFFE2_LIBS.append(os.path.join(lib_path, 'libcaffe2_hip.dylib'))
     NCCL_LIB = os.path.join(lib_path, 'libnccl.1.dylib')
 
 if IS_WINDOWS:
     CAFFE2_LIBS = [os.path.join(lib_path, 'caffe2.lib')]
-    if WITH_CUDA or WITH_ROCM:
+    if WITH_CUDA:
         CAFFE2_LIBS.append(os.path.join(lib_path, 'caffe2_gpu.lib'))
+    if WITH_ROCM:
+        CAFFE2_LIBS.append(os.path.join(lib_path, 'caffe2_hip.lib'))
     if DEBUG:
         NANOPB_STATIC_LIB = os.path.join(lib_path, 'protobuf-nanopbd.lib')
     else:
@@ -2,12 +2,12 @@
 import shutil
 import subprocess
 import os
 import sys
 from shutil import copytree, ignore_patterns
 from functools import reduce
 
 amd_build_dir = os.path.dirname(os.path.realpath(__file__))
 proj_dir = os.path.dirname(os.path.dirname(amd_build_dir))
 out_dir = os.path.join(os.path.dirname(proj_dir), "pytorch_amd")
 include_dirs = [
     "aten",
     "torch"

@@ -16,16 +16,13 @@ include_dirs = [
 # List of operators currently disabled
 yaml_file = os.path.join(amd_build_dir, "disabled_features.yaml")
 
-# Create the pytorch_amd directory
-shutil.copytree(proj_dir, out_dir)
-
 # Apply patch files.
 patch_folder = os.path.join(amd_build_dir, "patches")
 for filename in os.listdir(os.path.join(amd_build_dir, "patches")):
-    subprocess.Popen(["git", "apply", os.path.join(patch_folder, filename)], cwd=out_dir)
+    subprocess.Popen(["git", "apply", os.path.join(patch_folder, filename)], cwd=proj_dir)
 
 # HIPCC Compiler doesn't provide host defines - Automatically include them.
-for root, _, files in os.walk(os.path.join(out_dir, "aten/src/ATen")):
+for root, _, files in os.walk(os.path.join(proj_dir, "aten/src/ATen")):
     for filename in files:
         if filename.endswith(".cu") or filename.endswith(".cuh"):
             filepath = os.path.join(root, filename)

@@ -45,7 +42,7 @@ for root, _, files in os.walk(os.path.join(out_dir, "aten/src/ATen")):
 # Make various replacements inside AMD_BUILD/torch directory
 ignore_files = ["csrc/autograd/profiler.h", "csrc/autograd/profiler.cpp",
                 "csrc/cuda/cuda_check.h", "csrc/jit/fusion_compiler.cpp"]
-for root, _directories, files in os.walk(os.path.join(out_dir, "torch")):
+for root, _directories, files in os.walk(os.path.join(proj_dir, "torch")):
     for filename in files:
         if filename.endswith(".cpp") or filename.endswith(".h"):
             source = os.path.join(root, filename)

@@ -53,7 +50,7 @@ for root, _directories, files in os.walk(os.path.join(out_dir, "torch")):
         if reduce(lambda result, exclude: source.endswith(exclude) or result, ignore_files, False):
             continue
         # Update contents.
-        with open(source, "r+", encoding="utf-8") as f:
+        with open(source, "r+") as f:
             contents = f.read()
             contents = contents.replace("WITH_CUDA", "WITH_ROCM")
             contents = contents.replace("CUDA_VERSION", "0")

@@ -64,9 +61,7 @@ for root, _directories, files in os.walk(os.path.join(out_dir, "torch")):
             os.fsync(f)
 
 # Execute the Hipify Script.
-subprocess.Popen(
-    ["/opt/rocm/bin/hipify-python.py",
-     "--project-directory", proj_dir,
-     "--output-directory", out_dir,
-     "--include-dirs"] + include_dirs +
-    ["--yaml-settings", yaml_file, "--add-static-casts", "True"])
+args = ["--project-directory", proj_dir,
+        "--output-directory", proj_dir,
+        "--include-dirs"] + include_dirs + ["--yaml-settings", yaml_file, "--add-static-casts", "True"]
+os.execv("/opt/rocm/bin/hipify-python.py", ['python'] + args)
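One detail worth flagging in the last hunk: subprocess.Popen returns immediately without waiting for the child, so the old script could finish before hipify did, whereas os.execv replaces the current process with hipify and the caller inherits its exit status. A minimal C++ analogue of that exec semantics (POSIX; paths are the ones from the diff, used purely for illustration):

    #include <unistd.h>
    #include <cstdio>

    int main() {
        // execv replaces the current process image; on success nothing below runs,
        // and whoever launched us observes the new program's exit status.
        char* const argv[] = {(char*)"python", (char*)"/opt/rocm/bin/hipify-python.py", nullptr};
        execv("/opt/rocm/bin/hipify-python.py", argv);
        std::perror("execv");  // reached only if the exec failed
        return 1;
    }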
@@ -1,5 +1,5 @@
 {
-    "disabled_hip_function_calls":
+    "disable_unsupported_hip_calls":
     [
         {
             "path": "aten/src/THC/generic/THCTensorSort.cu",

@@ -72,11 +72,16 @@
                 "#include <nvfunctional>": ""
             }
         },
+        {
+            "path": "aten/src/ATen/native/cuda/Distributions.cu",
+            "s_constants": {
+                "#include <nvfunctional>": ""
+            }
+        },
         {
             "path": "aten/src/THC/THCNumerics.cuh",
             "s_constants": {
                 "#ifdef __CUDA_ARCH__": "#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__)",
                 "labs": "fabs",
                 "#if CUDA_VERSION < 9000": "#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)"
             }
         },

@@ -129,10 +134,16 @@
     "disabled_modules": [
         "aten/src/ATen/native/cuda/CuFFTUtils.h",
         "aten/src/ATen/native/cuda/SpectralOps.cu",
-        "aten/src/ATen/native/cuda/Distributions.cu",
-        "aten/src/THCUNN/RReLU.cu"
+        "aten/src/THCUNN/RReLU.cu",
+        "aten/src/ATen/native/cuda/Distributions.cu"
     ],
     "disabled_functions": [
+        {
+            "path": "aten/src/ATen/cuda/CUDAApplyUtils.cuh",
+            "functions": [
+                "kernelPointwiseApply4"
+            ]
+        },
         {
             "path": "aten/src/ATen/cuda/detail/IndexUtils.cu",
             "non_device_functions": [

@@ -148,7 +159,9 @@
         {
             "path": "aten/src/ATen/native/cuda/Distributions.cu",
             "functions": [
-                "_s_poisson_cuda"
+                "_s_poisson_cuda",
+                "poisson_cuda_kernel",
+                "gamma_cuda_kernel"
             ]
         },
         {
@@ -1,23 +0,0 @@
-diff --git a/aten/src/THCUNN/THCHalfAutoNumerics.cuh b/aten/src/THCUNN/THCHalfAutoNumerics.cuh
-index 2653fed0b..c4e9089e0 100644
---- a/aten/src/THCUNN/THCHalfAutoNumerics.cuh
-+++ b/aten/src/THCUNN/THCHalfAutoNumerics.cuh
-@@ -19,13 +19,17 @@ inline __host__ __device__ float fmaxType(float x, half y) {
- }
- #endif
- 
-+/* In ROCm we have a conversion from half to __fp16, and then there's a
-+conversion operator from __fp16 to double (w/ the standard conversion
-+double to float), so comment out these two lines to prevent ambiguous calls
-+for fmaxType when half is passed in.
- inline __host__ __device__ float fmaxType(float x, float y) {
-   return fmaxf(x, y);
- }
- 
- inline __host__ __device__ double fmaxType(double x, double y) {
-   return fmax(x, y);
--}
-+}*/
- 
- #ifdef CUDA_HALF_TENSOR
@@ -1,39 +0,0 @@
-diff --git a/aten/src/THC/THCTensorMathReduce.cuh b/aten/src/THC/THCTensorMathReduce.cuh
-index ca6bf7cbe..a523648f1 100644
---- a/aten/src/THC/THCTensorMathReduce.cuh
-+++ b/aten/src/THC/THCTensorMathReduce.cuh
-@@ -105,7 +105,7 @@ struct SquareFunctor<ResT, half> {
- template <typename T>
- struct ReduceMin {
-   inline __device__ T operator()(T a, T b) const {
--    return (THCNumerics<T>::lt(a, b) ||
-+    return ((int)THCNumerics<T>::sub(a, b) < 0 ||
-             THCNumerics<T>::isnan(a)) ? a : b;
-   }
- };
-@@ -113,7 +113,7 @@ struct ReduceMin {
- template <typename T>
- struct ReduceMax {
-   inline __device__ T operator()(T a, T b) const {
--    return (THCNumerics<T>::gt(a, b) ||
-+    return ((int)THCNumerics<T>::sub(a, b) > 0 ||
-             THCNumerics<T>::isnan(a)) ? a : b;
-   }
- };
-@@ -167,7 +167,7 @@ __global__ void THCTensor_kernel_renorm(Real *data, const Real value, const ptrd
- 
-   buffer[tx] = ScalarConvert<int, Real>::to(0);
-   Real norm;
--
-+  #if !defined(__HIP_DEVICE_COMPILE__)
-   if (THCNumerics<Real>::eq(value, ScalarConvert<float, Real>::to(INFINITY))) {
-     // get norm of axis
-     for (ptrdiff_t i=tx; i<size; i+=step)
-@@ -225,6 +225,7 @@ __global__ void THCTensor_kernel_renorm(Real *data, const Real value, const ptrd
-     row[i] = THCNumerics<Real>::mul(row[i], norm);
-   }
-   }
-+  #endif
- }
- 
- template <typename T>
@@ -1,5 +1,5 @@
 diff --git a/aten/src/THC/generic/THCTensorRandom.cu b/aten/src/THC/generic/THCTensorRandom.cu
-index 906780b4f..c99f156ab 100644
+index 906780b4f..b03e051cb 100644
 --- a/aten/src/THC/generic/THCTensorRandom.cu
 +++ b/aten/src/THC/generic/THCTensorRandom.cu
 @@ -504,11 +504,11 @@ THC_API void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_

@@ -7,7 +7,7 @@ index 906780b4f..c99f156ab 100644
    if (range > 1ULL << 32) {
      generate_random_64<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
  -      gen->state.gen_states, size, data, min_val, range);
-+      gen->state.gen_states, static_cast<int>(size), data, static_cast<std::conditional<std::is_same<real,float>::value, float, double>::type>(min_val), static_cast<std::conditional<std::is_same<real,float>::value, float, double>::type>(range));
++      gen->state.gen_states, static_cast<int>(size), data, min_val, range);
    } else {
  #endif
      generate_random<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(

@@ -17,7 +17,7 @@ index 906780b4f..c99f156ab 100644
    }
  #endif
 @@ -534,19 +534,19 @@ THC_API void THCTensor_(random)(THCState* state, THCTensor *self_)
 
 
  #if defined(THC_REAL_IS_HALF)
      generate_random<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
 -      gen->state.gen_states, size, data, 0UL, (1UL << HLF_MANT_DIG) + 1);

@@ -39,5 +39,5 @@ index 906780b4f..c99f156ab 100644
 -      gen->state.gen_states, size, data, 0UL, static_cast<uint32_t>(std::numeric_limits<real>::max()) + 1);
 +      gen->state.gen_states, static_cast<int>(size), data, static_cast<int32_t>(0UL), static_cast<uint32_t>(std::numeric_limits<real>::max()) + 1);
  #endif
 
 
  THCTensor_(freeCopyTo)(state, self, self_);
tools/setup_helpers/rocm.py (new file)

@@ -0,0 +1,5 @@
+from .env import check_env_flag
+# Check if ROCM is enabled
+WITH_ROCM = check_env_flag('WITH_ROCM')
+ROCM_HOME = "/opt/rocm"
+ROCM_VERSION = ""