diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh
index c758b60a577..4d0d34db920 100755
--- a/.jenkins/pytorch/build.sh
+++ b/.jenkins/pytorch/build.sh
@@ -23,6 +23,21 @@ gcc --version
 # TODO: Don't run this...
 pip install -r requirements.txt || true
 
+if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
+  export HCC_AMDGPU_TARGET=gfx900
+
+  # TODO: Install pyHIPIFY in the docker image
+  rm -rf pyHIPIFY || true
+  git clone https://github.com/ROCm-Developer-Tools/pyHIPIFY.git
+  chmod a+x pyHIPIFY/*.py
+  sudo cp -p pyHIPIFY/*.py /opt/rocm/bin
+  rm -rf "$(dirname "${BASH_SOURCE[0]}")/../../../pytorch_amd/" || true
+  python "$(dirname "${BASH_SOURCE[0]}")/../../tools/amd_build/build_pytorch_amd.py"
+  HIPCC_VERBOSE=1 VERBOSE=1 WITH_ROCM=1 python setup.py install
+  exit
+fi
+
+# TODO: Don't install this here
 if ! which conda; then
   pip install mkl mkl-devel
 fi
diff --git a/.jenkins/pytorch/enabled-configs.txt b/.jenkins/pytorch/enabled-configs.txt
index 763e95748b7..888e11e27bc 100644
--- a/.jenkins/pytorch/enabled-configs.txt
+++ b/.jenkins/pytorch/enabled-configs.txt
@@ -36,3 +36,4 @@ pytorch-macos-10.13-cuda9.2-cudnn7-py3-build
 pytorch-docker-build-test
 short-perf-test-cpu
 short-perf-test-gpu
+py2-clang3.8-rocmnightly-ubuntu16.04-build
diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
index f5ce8080c60..38de24ebca0 100644
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@@ -203,7 +203,7 @@ if(NOT MSVC)
   set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE)
 endif()
 
-IF(USE_CUDA)
+IF(USE_CUDA AND NOT USE_ROCM)
   IF ($ENV{ATEN_STATIC_CUDA})
     # CuFFT has a complicated static story (especially around CUDA < 9) because it has device callback support
     # we first have to build a fake lib that links with no device callbacks,
@@ -250,6 +250,10 @@ IF(USE_CUDA)
       ${CUDA_LIBRARIES}
       ${CUDA_cusparse_LIBRARY}
       ${CUDA_curand_LIBRARY})
+
+  # Set necessary HIPCC/CXX Flags
+  SET(HIP_HCC_FLAGS "-DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${HIP_HCC_FLAGS}")
+  SET(CMAKE_CXX_FLAGS "-DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${CMAKE_CXX_FLAGS}")
 ENDIF()
 
 if(CUDNN_FOUND)
@@ -271,7 +275,7 @@ ENDIF()
 IF(USE_ROCM)
   ### Link in the ROCm libraries BLAS / RNG.
   FIND_LIBRARY(HIPBLAS_LIBRARY hipblas HINTS ${HIPBLAS_PATH}/lib)
-  FIND_LIBRARY(HIPRNG_LIBRARY hiprng HINTS ${HIPRNG_PATH}/lib)
+  FIND_LIBRARY(HIPRNG_LIBRARY hcrng HINTS ${HIPRNG_PATH}/lib)
 
   list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${HIPBLAS_LIBRARY} ${HIPRNG_LIBRARY})
 ENDIF()
diff --git a/aten/src/THC/THCAtomics.cuh b/aten/src/THC/THCAtomics.cuh
index 9e54c56dc45..e89bf424e30 100644
--- a/aten/src/THC/THCAtomics.cuh
+++ b/aten/src/THC/THCAtomics.cuh
@@ -103,7 +103,7 @@ static inline __device__ void atomicAdd(half *address, half val) {
 
   do {
     assumed = old;
-#if CUDA_VERSION < 9000
+#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)
    half hsum;
    hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
    hsum = THCNumerics<half>::add(hsum, val);
@@ -135,7 +135,7 @@ static inline __device__ void atomicAdd(double *address, double val) {
   // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
   } while (assumed != old);
 }
-#elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000)
+#elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000) || defined(__HIP_PLATFORM_HCC__)
 // This needs to be defined for the host side pass
 static inline __device__ void atomicAdd(double *address, double val) { }
 #endif
diff --git a/aten/src/THC/THCHalf.h b/aten/src/THC/THCHalf.h
index 43ffd76e316..d9b8cba72d2 100644
--- a/aten/src/THC/THCHalf.h
+++ b/aten/src/THC/THCHalf.h
@@ -4,22 +4,16 @@
 #include "THCGeneral.h"
 
 /* We compile with CudaHalfTensor support if we have this: */
-#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16
+#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16 || defined(__HIP_PLATFORM_HCC__)
 #define CUDA_HALF_TENSOR 1
 #endif
 
-/* For HIP, rely on the half instructions as well.*/
-#if defined(__HIP_PLATFORM_HCC__)
-#define CUDA_HALF_TENSOR 1
-#define CUDA_HALF_INSTRUCTIONS 1
-#endif
-
 #ifdef CUDA_HALF_TENSOR
 
 #include <cuda_fp16.h>
 #include <stdint.h>
 
-#if CUDA_VERSION >= 9000
+#if CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__)
 #ifndef __cplusplus
 typedef __half_raw half;
 #endif
diff --git a/aten/src/THC/THCTensorMathPointwise.cuh b/aten/src/THC/THCTensorMathPointwise.cuh
index 67dfaf06a43..26389c3a990 100644
--- a/aten/src/THC/THCTensorMathPointwise.cuh
+++ b/aten/src/THC/THCTensorMathPointwise.cuh
@@ -37,7 +37,7 @@ struct TensorSigmoidOp {
   __device__ __forceinline__ void operator()(half* out, half* in) const {
 #ifdef CUDA_HALF_INSTRUCTIONS
     half one = ScalarConvert<int, half>::to(1);
-    *out = hdiv(one, __hadd(one, hexp(__hneg(*in))));
+    *out = __hdiv(one, __hadd(one, hexp(__hneg(*in))));
 #else
     float fin = __half2float(*in);
     *out = __float2half(1.0f / (1.0f + expf(- fin)));
@@ -47,7 +47,7 @@ struct TensorSigmoidOp {
   __device__ __forceinline__ void operator()(half* v) const {
 #ifdef CUDA_HALF_INSTRUCTIONS
     half one = ScalarConvert<int, half>::to(1);
-    *v = hdiv(one, __hadd(one, hexp(__hneg(*v))));
+    *v = __hdiv(one, __hadd(one, hexp(__hneg(*v))));
 #else
     float fv = __half2float(*v);
     *v = __float2half(1.0f / (1.0f + expf(- fv)));
diff --git a/aten/src/THC/THCTensorMathReduce.cuh b/aten/src/THC/THCTensorMathReduce.cuh
index 1002c4a926c..d3f290de1e3 100644
--- a/aten/src/THC/THCTensorMathReduce.cuh
+++ b/aten/src/THC/THCTensorMathReduce.cuh
@@ -80,7 +80,11 @@ struct ReduceMin {
 template <typename T>
 struct ReduceMax {
   inline __device__ T operator()(T a, T b) const {
+#if defined(__HIP_PLATFORM_HCC__)
+    return (static_cast<int>(THCNumerics<T>::sub(a, b)) > 0 || THCNumerics<T>::isnan(a)) ? a : b;
+#else
     return (THCNumerics<T>::gt(a, b) || THCNumerics<T>::isnan(a)) ? a : b;
+#endif
   }
 };
 
@@ -117,6 +121,7 @@ __global__ void THCTensor_kernel_renorm(T *data,
   buffer[tx] = scalar_cast<AccT>(0);
   AccT norm;
 
+#if !defined(__HIP_DEVICE_COMPILE__)
   if (THCNumerics<AccT>::eq(value, scalar_cast<AccT>(INFINITY))) {
     // get norm of axis
     for (ptrdiff_t i = tx; i < size; i += step) {
@@ -163,6 +168,7 @@
       row[i] = scalar_cast<T>(THCNumerics<AccT>::mul(val, norm));
     }
   }
+#endif
 }
 
 template <typename T>
diff --git a/aten/src/THC/THCTensorRandom.cu b/aten/src/THC/THCTensorRandom.cu
index 432138493dd..d05af2dbf59 100644
--- a/aten/src/THC/THCTensorRandom.cu
+++ b/aten/src/THC/THCTensorRandom.cu
@@ -170,6 +170,7 @@ __global__ void generate_bernoulli_tensor(curandStateMtgp32 *state, int size,
 
 // NOTE: curand_uniform is (0, 1] and we want [a, b)
 GENERATE_KERNEL2(generate_uniform, float, float a, float b, float, curand_uniform, reverse_bounds(x) * (b-a) + a)
+GENERATE_KERNEL2(generate_uniform, float, double a, double b, float, curand_uniform, reverse_bounds(x) * (b-a) + a)
 GENERATE_KERNEL2(generate_uniform, double, double a, double b, double, curand_uniform_double, reverse_bounds(x) * (b-a) + a)
 
 GENERATE_KERNEL2(generate_normal, float, double mean, double stdv, float, curand_normal, (x * stdv) + mean)
diff --git a/aten/src/THC/THCTensorTypeUtils.cuh b/aten/src/THC/THCTensorTypeUtils.cuh
index 3b52a02e763..99266b3b4b9 100644
--- a/aten/src/THC/THCTensorTypeUtils.cuh
+++ b/aten/src/THC/THCTensorTypeUtils.cuh
@@ -91,7 +91,7 @@ struct ScalarNegate {
     return __float2half(-__half2float(v));
 #endif
 #else
-#if CUDA_VERSION < 9000
+#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)
     half out = v;
 #else
     __half_raw out = __half_raw(v);
@@ -116,7 +116,7 @@ struct ScalarInv {
 };
 
 inline bool operator==(half a, half b) {
-#if CUDA_VERSION < 9000
+#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)
   return a.x == b.x;
 #else
   __half_raw araw, braw;
@@ -127,7 +127,7 @@ inline bool operator==(half a, half b) {
 }
 
 inline bool operator!=(half a, half b) {
-#if CUDA_VERSION < 9000
+#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)
   return a.x != b.x;
 #else
   __half_raw araw, braw;
diff --git a/aten/src/THCUNN/FusedRNNKernel.cu b/aten/src/THCUNN/FusedRNNKernel.cu
index 58e22f8cfb0..d8b594ab046 100644
--- a/aten/src/THCUNN/FusedRNNKernel.cu
+++ b/aten/src/THCUNN/FusedRNNKernel.cu
@@ -23,7 +23,7 @@ struct TensorSigmoidOp {
   __device__ __forceinline__ void operator()(half* out, half* in) const {
 #ifdef CUDA_HALF_INSTRUCTIONS
     half one = ScalarConvert<int, half>::to(1);
-    *out = hdiv(one, __hadd(one, hexp(__hneg(*in))));
+    *out = __hdiv(one, __hadd(one, hexp(__hneg(*in))));
 #else
     float fin = ScalarConvert<half, float>::to(*in);
     *out = ScalarConvert<float, half>::to(1.0f / (1.0f + expf(- fin)));
@@ -33,7 +33,7 @@ struct TensorSigmoidOp {
   __device__ __forceinline__ void operator()(half* v) const {
 #ifdef CUDA_HALF_INSTRUCTIONS
     half one = ScalarConvert<int, half>::to(1);
-    *v = hdiv(one, __hadd(one, hexp(__hneg(*v))));
+    *v = __hdiv(one, __hadd(one, hexp(__hneg(*v))));
 #else
     float fv = ScalarConvert<half, float>::to(*v);
     *v = ScalarConvert<float, half>::to(1.0f / (1.0f + expf(- fv)));
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index cc21ed93a19..722e6bf4311 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -42,6 +42,12 @@ if(BUILD_ATEN)
   list(APPEND Caffe2_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS})
   list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS})
   list(APPEND Caffe2_DEPENDENCY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE})
+
+  IF(USE_ROCM)
+    # Set the HIP Variables
+    set(Caffe2_HIP_SRCS ${ATen_CUDA_SRCS})
+    set(Caffe2_HIP_INCLUDES ${Caffe2_HIP_INCLUDES} ${Caffe2_GPU_INCLUDE})
+  ENDIF(USE_ROCM)
 endif()
 
 # ---[ Caffe2 build
@@ -205,7 +211,7 @@ caffe2_interface_library(caffe2 caffe2_library)
 list(APPEND Caffe2_MAIN_LIBS caffe2_library)
 
 # ---[ CUDA library.
-if(USE_CUDA OR (USE_ROCM AND NOT BUILD_CAFFE2))
+if(USE_CUDA)
   # A hack to deal with cuda library dependencies and modern CMake: the
   # CUDA_ADD_LIBRARY includes a target_link_libraries, and as a result,
   # one cannot use PUBLIC/PRIVATE/INTERFACE for the target anymore. This
@@ -243,31 +249,46 @@ if(USE_CUDA OR (USE_ROCM AND NOT BUILD_CAFFE2))
 endif()
 
 # ---[ Caffe2 HIP sources.
-if(BUILD_CAFFE2)
-  if(USE_ROCM)
-    HIP_ADD_LIBRARY(caffe2_hip ${Caffe2_HIP_SRCS})
-    set_target_properties(caffe2_hip PROPERTIES COMPILE_FLAGS ${Caffe2_HIP_CXX_FLAGS})
+if(USE_ROCM)
+  HIP_INCLUDE_DIRECTORIES(${Caffe2_HIP_INCLUDES})
 
-    target_include_directories(
-        caffe2_hip PUBLIC ${Caffe2_HIP_INCLUDES})
-    target_include_directories(
-        caffe2_hip INTERFACE $<INSTALL_INTERFACE:include>)
+  IF(BUILD_ATEN)
+    # Set necessary HIPCC Flags
+    SET(HIP_HCC_FLAGS "-fPIC -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${HIP_HCC_FLAGS}")
+    SET(Caffe2_HIP_CXX_FLAGS "-fPIC -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${Caffe2_HIP_CXX_FLAGS}")
+  ENDIF()
 
-    target_link_libraries(caffe2_hip PUBLIC caffe2)
-    target_link_libraries(caffe2_hip PUBLIC ${Caffe2_HIP_DEPENDENCY_LIBS})
+  # Since HIP_ADD_LIBRARY is a MACRO, we need to set HIP_HCC_FLAGS prior to calling it.
+  # Also, since HIP_INCLUDE_DIRECTORIES is a MACRO, we must call it before HIP_ADD_LIBRARY.
+  HIP_ADD_LIBRARY(caffe2_hip ${Caffe2_HIP_SRCS})
 
-    set_target_properties(caffe2_hip PROPERTIES LINKER_LANGUAGE HIP)
+  set_target_properties(caffe2_hip PROPERTIES COMPILE_FLAGS ${Caffe2_HIP_CXX_FLAGS})
 
-    caffe2_interface_library(caffe2_hip caffe2_hip_library)
-    list(APPEND Caffe2_MAIN_LIBS caffe2_hip_library)
-    install(TARGETS caffe2_hip EXPORT Caffe2Targets DESTINATION lib)
-  endif()
+  target_include_directories(
+      caffe2_hip PRIVATE ${Caffe2_HIP_INCLUDES})
+  target_include_directories(
+      caffe2_hip INTERFACE $<INSTALL_INTERFACE:include>)
+
+  IF(BUILD_ATEN)
+    aten_set_target_props(caffe2_hip)
+  ENDIF()
+
+  target_link_libraries(caffe2_hip PUBLIC caffe2)
+  target_link_libraries(caffe2_hip PUBLIC ${Caffe2_HIP_DEPENDENCY_LIBS})
+
+  # https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_faq.md#what-if-hip-generates-error-of-symbol-multiply-defined-only-on-amd-machine
+  # To avoid having to do above, keep this commented.
+  set_target_properties(caffe2_hip PROPERTIES LINKER_LANGUAGE HIP)
+
+  caffe2_interface_library(caffe2_hip caffe2_hip_library)
+  list(APPEND Caffe2_MAIN_LIBS caffe2_hip_library)
+  install(TARGETS caffe2_hip EXPORT Caffe2Targets DESTINATION lib)
 endif()
 
 # ---[ Check if warnings should be errors.
 if ($ENV{WERROR})
   target_compile_options(caffe2 PRIVATE -Werror)
-  if(USE_CUDA OR USE_ROCM)
+  if(USE_CUDA)
     target_compile_options(caffe2_gpu PRIVATE -Werror)
   endif()
 endif()
@@ -315,7 +336,8 @@ if(BUILD_CAFFE2)
   # Aten tests should only run when Caffe2 is not built
   set(__aten_test_dir "test/aten")
 endif()
-if(BUILD_ATEN)
+# Todo - Set up ATen tests for ROCm in an upcoming PR
+if(BUILD_ATEN AND NOT USE_ROCM)
   foreach(test_src ${ATen_CPU_TEST_SRCS})
     get_filename_component(test_name ${test_src} NAME_WE)
     add_executable(${test_name} "${test_src}")
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index f722841db07..3846ff30904 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -442,8 +442,8 @@ if(USE_CUDA)
 endif()
 
 # ---[ HIP
-if(BUILD_CAFFE2)
-  include(cmake/public/LoadHIP.cmake)
+if(BUILD_CAFFE2 OR BUILD_ATEN)
+  include(${CMAKE_CURRENT_LIST_DIR}/public/LoadHIP.cmake)
   if(PYTORCH_FOUND_HIP)
     message(INFO "Compiling with HIP for AMD.")
     caffe2_update_option(USE_ROCM ON)
@@ -452,8 +452,11 @@ if(BUILD_CAFFE2)
     set(Caffe2_HIP_INCLUDES
       ${hip_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${Caffe2_HIP_INCLUDES} ${thrust_INCLUDE_DIRS})
     set(Caffe2_HIP_DEPENDENCY_LIBS
-      ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES})
-
+      ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipblas_LIBRARIES})
+    # Additional libraries required by PyTorch AMD that aren't used by Caffe2 (not in Caffe2's docker image)
+    if(BUILD_ATEN)
+      set(Caffe2_HIP_DEPENDENCY_LIBS ${Caffe2_HIP_DEPENDENCY_LIBS} ${hipsparse_LIBRARIES} ${hiprng_LIBRARIES})
+    endif()
     # TODO: There is a bug in rocblas's cmake files that exports the wrong targets name in ${rocblas_LIBRARIES}
     list(APPEND Caffe2_HIP_DEPENDENCY_LIBS
       roc::rocblas)
@@ -475,7 +478,7 @@ if(USE_ROCM AND NOT BUILD_CAFFE2)
   EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS)
 
   # Link with HIPCC https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_porting_guide.md#linking-with-hipcc
-  SET(CMAKE_CXX_LINK_EXECUTABLE ${HIP_HIPCC_EXECUTABLE})
+  # SET(CMAKE_CXX_LINK_EXECUTABLE ${HIP_HIPCC_EXECUTABLE})
 
   # Show message that we're using ROCm.
   MESSAGE(STATUS "ROCM TRUE:")
diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
index 7d813cc22de..c6b98143ddc 100644
--- a/cmake/public/LoadHIP.cmake
+++ b/cmake/public/LoadHIP.cmake
@@ -103,11 +103,15 @@ IF(HIP_FOUND)
   set(hiprand_DIR ${HIPRAND_PATH}/lib/cmake/hiprand)
   set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas)
   set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen)
+  set(hipblas_DIR ${HIPBLAS_PATH}/lib/cmake/hipblas)
+  set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse)
 
   find_package(rocrand REQUIRED)
   find_package(hiprand REQUIRED)
   find_package(rocblas REQUIRED)
   find_package(miopen REQUIRED)
+  #find_package(hipblas REQUIRED) There's a bug with the CMake file in the Hipblas package.
+  #find_package(hipsparse REQUIRED)
 
   # TODO: hip_hcc has an interface include flag "-hc" which is only
   # recognizable by hcc, but not gcc and clang.  Right now in our
@@ -117,6 +121,10 @@ IF(HIP_FOUND)
   # TODO: miopen_LIBRARIES should return fullpath to the library file,
   # however currently it's just the lib name
   FIND_LIBRARY(PYTORCH_MIOPEN_LIBRARIES ${miopen_LIBRARIES} HINTS ${MIOPEN_PATH}/lib)
+  FIND_LIBRARY(hiprand_LIBRARIES hiprand HINTS ${HIPRAND_PATH}/lib)
+  FIND_LIBRARY(hiprng_LIBRARIES hcrng HINTS ${HIPRNG_PATH}/lib)
+  FIND_LIBRARY(hipblas_LIBRARIES hipblas HINTS ${HIPBLAS_PATH}/lib)
+  FIND_LIBRARY(hipsparse_LIBRARIES hipsparse HINTS ${HIPSPARSE_PATH}/lib)
 
   set(thrust_INCLUDE_DIRS ${THRUST_PATH} ${THRUST_PATH}/thrust/system/cuda/detail/cub-hip)
diff --git a/docker/caffe2/jenkins/build.sh b/docker/caffe2/jenkins/build.sh
index 7531136c1b9..357a79549da 100755
--- a/docker/caffe2/jenkins/build.sh
+++ b/docker/caffe2/jenkins/build.sh
@@ -34,8 +34,10 @@ if [[ "$image" == *cuda* ]]; then
   DOCKERFILE="${OS}-cuda/Dockerfile"
 fi
 
+# TODO: the version number here actually doesn't do anything at the
+# moment
 if [[ "$image" == *rocm* ]]; then
-  ROCM_VERSION="$(echo "${image}" | perl -n -e'/rocm(\d+\.\d+.\d+)/ && print $1')"
+  ROCM_VERSION="$(echo "${image}" | perl -n -e'/rocm(\d+\.\d+\.\d+|nightly)/ && print $1')"
   DOCKERFILE="${OS}-rocm/Dockerfile"
 fi
diff --git a/docker/caffe2/jenkins/common/install_rocm.sh b/docker/caffe2/jenkins/common/install_rocm.sh
index 5376e2701a7..0dbc382a980 100644
--- a/docker/caffe2/jenkins/common/install_rocm.sh
+++ b/docker/caffe2/jenkins/common/install_rocm.sh
@@ -2,8 +2,25 @@
 
 set -ex
 
+# TODO: This script should install a SPECIFIC ROCM_VERSION, but actually
+# it ignores all values of ROCM_VERSION which are not nightly. Ugh!
 [ -n "$ROCM_VERSION" ]
 
+install_hip_nightly() {
+  git clone https://github.com/ROCm-Developer-Tools/HIP.git
+  pushd HIP
+  export HIP_PLATFORM=hcc
+  yes | ./install.sh --install
+  popd
+  rm -rf HIP
+
+  git clone https://github.com/ROCmSoftwarePlatform/hipBLAS.git
+  pushd hipBLAS
+  yes | ./install.sh --install
+  popd
+  rm -rf hipBLAS
+}
+
 install_ubuntu() {
   apt-get update
   apt-get install -y wget
@@ -36,11 +53,28 @@ install_centos() {
 
 install_hip_thrust() {
   # Needed for now, will be replaced soon
   git clone --recursive https://github.com/ROCmSoftwarePlatform/Thrust.git /data/Thrust
+  pushd /data/Thrust
+  curl https://patch-diff.githubusercontent.com/raw/ROCmSoftwarePlatform/Thrust/pull/12.patch -o 12.patch
+  patch -p1 < 12.patch
+  rm 12.patch
+  popd
   rm -rf /data/Thrust/thrust/system/cuda/detail/cub-hip
   git clone --recursive https://github.com/ROCmSoftwarePlatform/cub-hip.git /data/Thrust/thrust/system/cuda/detail/cub-hip
   cd /data/Thrust/thrust/system/cuda/detail/cub-hip && git checkout hip_port_1.7.4_caffe2 && cd -
 }
 
+install_hcrng() {
+  mkdir -p /opt/rocm/debians
+  curl https://s3.amazonaws.com/ossci-linux/hcrng-master-a8c6a0b-Linux.deb -o /opt/rocm/debians/hcrng.deb
+  dpkg -i /opt/rocm/debians/hcrng.deb
+}
+
+install_hcsparse() {
+  mkdir -p /opt/rocm/debians
+  curl https://s3.amazonaws.com/ossci-linux/hcsparse-master-907a505-Linux.deb -o /opt/rocm/debians/hcsparse.deb
+  dpkg -i /opt/rocm/debians/hcsparse.deb
+}
+
 # Install Python packages depending on the base OS
 if [ -f /etc/lsb-release ]; then
@@ -52,4 +86,12 @@ else
   exit 1
 fi
 
+# NB: We first install the "wrong" version, but then use those dev tools
+# to install the newer version of HIP.
+if [ "$ROCM_VERSION" = "nightly" ]; then + install_hip_nightly +fi + install_hip_thrust +install_hcrng +install_hcsparse diff --git a/setup.py b/setup.py index 00168cd656b..92356404723 100644 --- a/setup.py +++ b/setup.py @@ -104,6 +104,7 @@ import importlib from tools.setup_helpers.env import check_env_flag from tools.setup_helpers.cuda import WITH_CUDA, CUDA_HOME, CUDA_VERSION +from tools.setup_helpers.rocm import WITH_ROCM, ROCM_HOME, ROCM_VERSION from tools.setup_helpers.cudnn import (WITH_CUDNN, CUDNN_LIBRARY, CUDNN_LIB_DIR, CUDNN_INCLUDE_DIR) from tools.setup_helpers.nccl import WITH_NCCL, WITH_SYSTEM_NCCL, NCCL_LIB_DIR, \ @@ -126,7 +127,6 @@ DEBUG = check_env_flag('DEBUG') IS_WINDOWS = (platform.system() == 'Windows') IS_DARWIN = (platform.system() == 'Darwin') IS_LINUX = (platform.system() == 'Linux') -WITH_ROCM = check_env_flag('WITH_ROCM') FULL_CAFFE2 = check_env_flag('FULL_CAFFE2') BUILD_PYTORCH = check_env_flag('BUILD_PYTORCH') @@ -631,8 +631,10 @@ library_dirs.append(lib_path) # we specify exact lib names to avoid conflict with lua-torch installs CAFFE2_LIBS = [os.path.join(lib_path, 'libcaffe2.so')] -if WITH_CUDA or WITH_ROCM: +if WITH_CUDA: CAFFE2_LIBS.extend(['-Wl,--no-as-needed', os.path.join(lib_path, 'libcaffe2_gpu.so'), '-Wl,--as-needed']) +if WITH_ROCM: + CAFFE2_LIBS.extend(['-Wl,--no-as-needed', os.path.join(lib_path, 'libcaffe2_hip.so'), '-Wl,--as-needed']) THD_LIB = os.path.join(lib_path, 'libTHD.a') NCCL_LIB = os.path.join(lib_path, 'libnccl.so.1') C10D_LIB = os.path.join(lib_path, 'libc10d.a') @@ -643,14 +645,18 @@ NANOPB_STATIC_LIB = os.path.join(lib_path, 'libprotobuf-nanopb.a') if IS_DARWIN: CAFFE2_LIBS = [os.path.join(lib_path, 'libcaffe2.dylib')] - if WITH_CUDA or WITH_ROCM: + if WITH_CUDA: CAFFE2_LIBS.append(os.path.join(lib_path, 'libcaffe2_gpu.dylib')) + if WITH_ROCM: + CAFFE2_LIBS.append(os.path.join(lib_path, 'libcaffe2_hip.dylib')) NCCL_LIB = os.path.join(lib_path, 'libnccl.1.dylib') if IS_WINDOWS: CAFFE2_LIBS = [os.path.join(lib_path, 'caffe2.lib')] - if WITH_CUDA or WITH_ROCM: + if WITH_CUDA: CAFFE2_LIBS.append(os.path.join(lib_path, 'caffe2_gpu.lib')) + if WITH_ROCM: + CAFFE2_LIBS.append(os.path.join(lib_path, 'caffe2_hip.lib')) if DEBUG: NANOPB_STATIC_LIB = os.path.join(lib_path, 'protobuf-nanopbd.lib') else: diff --git a/tools/amd_build/build_pytorch_amd.py b/tools/amd_build/build_pytorch_amd.py index 315220ec4c7..a6ca1ae1b06 100644 --- a/tools/amd_build/build_pytorch_amd.py +++ b/tools/amd_build/build_pytorch_amd.py @@ -2,12 +2,12 @@ import shutil import subprocess import os +import sys from shutil import copytree, ignore_patterns from functools import reduce amd_build_dir = os.path.dirname(os.path.realpath(__file__)) proj_dir = os.path.dirname(os.path.dirname(amd_build_dir)) -out_dir = os.path.join(os.path.dirname(proj_dir), "pytorch_amd") include_dirs = [ "aten", "torch" @@ -16,16 +16,13 @@ include_dirs = [ # List of operators currently disabled yaml_file = os.path.join(amd_build_dir, "disabled_features.yaml") -# Create the pytorch_amd directory -shutil.copytree(proj_dir, out_dir) - # Apply patch files. patch_folder = os.path.join(amd_build_dir, "patches") for filename in os.listdir(os.path.join(amd_build_dir, "patches")): - subprocess.Popen(["git", "apply", os.path.join(patch_folder, filename)], cwd=out_dir) + subprocess.Popen(["git", "apply", os.path.join(patch_folder, filename)], cwd=proj_dir) # HIPCC Compiler doesn't provide host defines - Automatically include them. 
-for root, _, files in os.walk(os.path.join(out_dir, "aten/src/ATen")):
+for root, _, files in os.walk(os.path.join(proj_dir, "aten/src/ATen")):
     for filename in files:
         if filename.endswith(".cu") or filename.endswith(".cuh"):
             filepath = os.path.join(root, filename)
@@ -45,7 +42,7 @@ for root, _, files in os.walk(os.path.join(out_dir, "aten/src/ATen")):
 # Make various replacements inside AMD_BUILD/torch directory
 ignore_files = ["csrc/autograd/profiler.h", "csrc/autograd/profiler.cpp",
                 "csrc/cuda/cuda_check.h", "csrc/jit/fusion_compiler.cpp"]
-for root, _directories, files in os.walk(os.path.join(out_dir, "torch")):
+for root, _directories, files in os.walk(os.path.join(proj_dir, "torch")):
     for filename in files:
         if filename.endswith(".cpp") or filename.endswith(".h"):
             source = os.path.join(root, filename)
@@ -53,7 +50,7 @@ for root, _directories, files in os.walk(os.path.join(out_dir, "torch")):
             if reduce(lambda result, exclude: source.endswith(exclude) or result, ignore_files, False):
                 continue
             # Update contents.
-            with open(source, "r+", encoding="utf-8") as f:
+            with open(source, "r+") as f:
                 contents = f.read()
                 contents = contents.replace("WITH_CUDA", "WITH_ROCM")
                 contents = contents.replace("CUDA_VERSION", "0")
@@ -64,9 +61,7 @@ for root, _directories, files in os.walk(os.path.join(out_dir, "torch")):
                 os.fsync(f)
 
 # Execute the Hipify Script.
-subprocess.Popen(
-    ["/opt/rocm/bin/hipify-python.py",
-     "--project-directory", proj_dir,
-     "--output-directory", out_dir,
-     "--include-dirs"] + include_dirs +
-    ["--yaml-settings", yaml_file, "--add-static-casts", "True"])
+args = ["--project-directory", proj_dir,
+        "--output-directory", proj_dir,
+        "--include-dirs"] + include_dirs + ["--yaml-settings", yaml_file, "--add-static-casts", "True"]
+os.execv("/opt/rocm/bin/hipify-python.py", ['python'] + args)
diff --git a/tools/amd_build/disabled_features.yaml b/tools/amd_build/disabled_features.yaml
index 60fd665e08d..ca82ab926c6 100644
--- a/tools/amd_build/disabled_features.yaml
+++ b/tools/amd_build/disabled_features.yaml
@@ -1,5 +1,5 @@
 {
-    "disabled_hip_function_calls":
+    "disable_unsupported_hip_calls":
     [
         {
             "path": "aten/src/THC/generic/THCTensorSort.cu",
@@ -72,11 +72,16 @@
                 "#include ": ""
             }
         },
+        {
+            "path": "aten/src/ATen/native/cuda/Distributions.cu",
+            "s_constants": {
+                "#include <nvfunctional>": ""
+            }
+        },
         {
             "path": "aten/src/THC/THCNumerics.cuh",
             "s_constants": {
                 "#ifdef __CUDA_ARCH__": "#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__)",
-                "labs": "fabs",
                 "#if CUDA_VERSION < 9000": "#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__)"
             }
         },
@@ -129,10 +134,16 @@
     "disabled_modules": [
         "aten/src/ATen/native/cuda/CuFFTUtils.h",
         "aten/src/ATen/native/cuda/SpectralOps.cu",
-        "aten/src/ATen/native/cuda/Distributions.cu",
-        "aten/src/THCUNN/RReLU.cu"
+        "aten/src/THCUNN/RReLU.cu",
+        "aten/src/ATen/native/cuda/Distributions.cu"
     ],
     "disabled_functions": [
+        {
+            "path": "aten/src/ATen/cuda/CUDAApplyUtils.cuh",
+            "functions": [
+                "kernelPointwiseApply4"
+            ]
+        },
         {
             "path": "aten/src/ATen/cuda/detail/IndexUtils.cu",
             "non_device_functions": [
@@ -148,7 +159,9 @@
         {
             "path": "aten/src/ATen/native/cuda/Distributions.cu",
            "functions": [
-                "_s_poisson_cuda"
+                "_s_poisson_cuda",
+                "poisson_cuda_kernel",
+                "gamma_cuda_kernel"
             ]
         },
         {
diff --git a/tools/amd_build/patches/a_aten_src_THCUNN_THCHalfAutoNumerics.cuh.patch b/tools/amd_build/patches/a_aten_src_THCUNN_THCHalfAutoNumerics.cuh.patch
deleted file mode 100644
index d1bcb76bb58..00000000000
--- a/tools/amd_build/patches/a_aten_src_THCUNN_THCHalfAutoNumerics.cuh.patch
+++ /dev/null
@@ -1,23 +0,0 @@
-diff --git a/aten/src/THCUNN/THCHalfAutoNumerics.cuh b/aten/src/THCUNN/THCHalfAutoNumerics.cuh
-index 2653fed0b..c4e9089e0 100644
---- a/aten/src/THCUNN/THCHalfAutoNumerics.cuh
-+++ b/aten/src/THCUNN/THCHalfAutoNumerics.cuh
-@@ -19,13 +19,17 @@ inline __host__ __device__ float fmaxType(float x, half y) {
- }
- #endif
- 
-+/* In ROCm we have a conversion from half to __fp16, and then there's a
-+conversion operator from __fp16 to double (w/ the standard conversion
-+double to float), so comment out these two lines to prevent ambiguous calls
-+for fmaxType when half is passed in.
- inline __host__ __device__ float fmaxType(float x, float y) {
-   return fmaxf(x, y);
- }
- 
- inline __host__ __device__ double fmaxType(double x, double y) {
-   return fmax(x, y);
--}
-+}*/
- 
- #ifdef CUDA_HALF_TENSOR
- 
diff --git a/tools/amd_build/patches/a_aten_src_THC_THCTensorMathReduce.cuh.patch b/tools/amd_build/patches/a_aten_src_THC_THCTensorMathReduce.cuh.patch
deleted file mode 100644
index 72e4160ea4a..00000000000
--- a/tools/amd_build/patches/a_aten_src_THC_THCTensorMathReduce.cuh.patch
+++ /dev/null
@@ -1,39 +0,0 @@
-diff --git a/aten/src/THC/THCTensorMathReduce.cuh b/aten/src/THC/THCTensorMathReduce.cuh
-index ca6bf7cbe..a523648f1 100644
---- a/aten/src/THC/THCTensorMathReduce.cuh
-+++ b/aten/src/THC/THCTensorMathReduce.cuh
-@@ -105,7 +105,7 @@ struct SquareFunctor {
- template <typename T>
- struct ReduceMin {
-   inline __device__ T operator()(T a, T b) const {
--    return (THCNumerics<T>::lt(a, b) ||
-+    return ((int)THCNumerics<T>::sub(a, b) < 0 ||
-            THCNumerics<T>::isnan(a)) ? a : b;
-   }
- };
-@@ -113,7 +113,7 @@ struct ReduceMin {
- template <typename T>
- struct ReduceMax {
-   inline __device__ T operator()(T a, T b) const {
--    return (THCNumerics<T>::gt(a, b) ||
-+    return ((int)THCNumerics<T>::sub(a, b) > 0 ||
-            THCNumerics<T>::isnan(a)) ? a : b;
-   }
- };
-@@ -167,7 +167,7 @@ __global__ void THCTensor_kernel_renorm(Real *data, const Real value, const ptrd
- 
-   buffer[tx] = ScalarConvert<int, Real>::to(0);
-   Real norm;
--
-+  #if !defined(__HIP_DEVICE_COMPILE__)
-   if (THCNumerics<Real>::eq(value, ScalarConvert<float, Real>::to(INFINITY))) {
-     // get norm of axis
-     for (ptrdiff_t i=tx; i<size; i+=step) {
-       row[i] = THCNumerics<Real>::mul(row[i], norm);
-     }
-   }
-+  #endif
- }
- 
- template
diff --git a/tools/amd_build/patches/a_aten_src_THC_generic_THCTensorRandom.cu.patch b/tools/amd_build/patches/a_aten_src_THC_generic_THCTensorRandom.cu.patch
index 8fbc7ad7e90..355c1e3a7c3 100644
--- a/tools/amd_build/patches/a_aten_src_THC_generic_THCTensorRandom.cu.patch
+++ b/tools/amd_build/patches/a_aten_src_THC_generic_THCTensorRandom.cu.patch
@@ -1,5 +1,5 @@
 diff --git a/aten/src/THC/generic/THCTensorRandom.cu b/aten/src/THC/generic/THCTensorRandom.cu
-index 906780b4f..c99f156ab 100644
+index 906780b4f..b03e051cb 100644
 --- a/aten/src/THC/generic/THCTensorRandom.cu
 +++ b/aten/src/THC/generic/THCTensorRandom.cu
 @@ -504,11 +504,11 @@ THC_API void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_
@@ -7,7 +7,7 @@ index 906780b4f..c99f156ab 100644
    if (range > 1ULL << 32) {
      generate_random_64<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
 -        gen->state.gen_states, size, data, min_val, range);
-+        gen->state.gen_states, static_cast<int>(size), data, static_cast<typename std::conditional<std::is_same<real, float>::value, float, double>::type>(min_val), static_cast<typename std::conditional<std::is_same<real, float>::value, float, double>::type>(range));
++        gen->state.gen_states, static_cast<int>(size), data, min_val, range);
    } else {
  #endif
      generate_random<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
@@ -17,7 +17,7 @@ index 906780b4f..c99f156ab 100644
  }
  #endif
 @@ -534,19 +534,19 @@ THC_API void THCTensor_(random)(THCState* state, THCTensor *self_)
- 
+ 
  #if defined(THC_REAL_IS_HALF)
      generate_random<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
 -        gen->state.gen_states, size, data, 0UL, (1UL << HLF_MANT_DIG) + 1);
 +        gen->state.gen_states, static_cast<int>(size), data, static_cast<uint64_t>(0UL), static_cast<uint64_t>((1UL << HLF_MANT_DIG) + 1));
 -        gen->state.gen_states, size, data, 0UL, static_cast<uint64_t>(std::numeric_limits<real>::max()) + 1);
 +        gen->state.gen_states, static_cast<int>(size), data, static_cast<uint64_t>(0UL), static_cast<uint64_t>(std::numeric_limits<real>::max()) + 1);
  #endif
  
  THCTensor_(freeCopyTo)(state, self, self_);
diff --git a/tools/setup_helpers/rocm.py b/tools/setup_helpers/rocm.py
new file mode 100644
index 00000000000..b981dd82dc5
--- /dev/null
+++ b/tools/setup_helpers/rocm.py
@@ -0,0 +1,5 @@
+from .env import check_env_flag
+# Check if ROCM is enabled
+WITH_ROCM = check_env_flag('WITH_ROCM')
+ROCM_HOME = "/opt/rocm"
+ROCM_VERSION = ""
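
With tools/setup_helpers/rocm.py in place, a ROCm build is driven entirely by environment variables, mirroring the path that .jenkins/pytorch/build.sh takes above. A minimal sketch of the equivalent local invocation, assuming a stock /opt/rocm install with the pyHIPIFY scripts already copied into /opt/rocm/bin (as the CI script does):

    # Patch and hipify the CUDA sources in place, then build PyTorch against ROCm.
    python tools/amd_build/build_pytorch_amd.py
    HIPCC_VERBOSE=1 VERBOSE=1 WITH_ROCM=1 python setup.py install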