diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh
index 6dc249f8937..58dc91b9c9e 100755
--- a/.circleci/docker/build.sh
+++ b/.circleci/docker/build.sh
@@ -76,14 +76,6 @@ case "$image" in
     DB=yes
     VISION=yes
     ;;
-  pytorch-linux-xenial-cuda9-cudnn7-py3)
-    CUDA_VERSION=9.0
-    CUDNN_VERSION=7
-    ANACONDA_PYTHON_VERSION=3.6
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    ;;
   pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4)
     CUDA_VERSION=9.2
     CUDNN_VERSION=7
diff --git a/.circleci/docker/common/install_conda.sh b/.circleci/docker/common/install_conda.sh
index fa90919279e..80c85af5419 100755
--- a/.circleci/docker/common/install_conda.sh
+++ b/.circleci/docker/common/install_conda.sh
@@ -65,11 +65,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
   # DO NOT install cmake here as it would install a version newer than 3.5, but
   # we want to pin to version 3.5.
   conda_install numpy pyyaml mkl mkl-include setuptools cffi typing future six
-  if [[ "$CUDA_VERSION" == 9.0* ]]; then
-    conda_install magma-cuda90 -c pytorch
-  elif [[ "$CUDA_VERSION" == 9.1* ]]; then
-    conda_install magma-cuda91 -c pytorch
-  elif [[ "$CUDA_VERSION" == 9.2* ]]; then
+  if [[ "$CUDA_VERSION" == 9.2* ]]; then
     conda_install magma-cuda92 -c pytorch
   elif [[ "$CUDA_VERSION" == 10.0* ]]; then
     conda_install magma-cuda100 -c pytorch
diff --git a/aten/src/ATen/native/cuda/LinearAlgebra.cu b/aten/src/ATen/native/cuda/LinearAlgebra.cu
index 11355531f7b..24deb22b460 100644
--- a/aten/src/ATen/native/cuda/LinearAlgebra.cu
+++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu
@@ -72,26 +72,6 @@ Tensor prepare_matrix_for_cublas(Tensor& tensor, bool& transpose_tensor) {
   return tensor_;
 }
 
-// Check https://github.com/pytorch/pytorch/issues/22078
-// for information about the bug. We don't know the exact conditions that trigger it,
-// but using Sgemm or Hgemm on Maxwell or Pascal seems to be a
-// necessary condition.
-static void checkCuda90Bug(int i_m, int i_n, int i_k)
-{
-#if CUDA_VERSION < 9200 && CUDA_VERSION >= 9000
-  static std::once_flag alreadyWarned;
-  const int LIMIT = 1 << 21;
-  if (i_m > LIMIT || i_n > LIMIT || i_k > LIMIT) {
-    cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
-    if (prop->major == 5 || prop->major == 6) {
-      std::call_once(alreadyWarned, []() {
-        TORCH_WARN("Matrix multiplication for dimensions larger than 2^21 has known bugs on your combination of CUDA version and device type. Please consider upgrading to CUDA 9.2 or later.");
-      });
-    }
-  }
-#endif
-}
-
 Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) {
   TORCH_CHECK(
     (mat1.dim() == 2) && (mat2.dim() == 2) &&
@@ -143,9 +123,6 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
   at::ScalarType scalar_type = self.scalar_type();
 
   AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "addmm_cuda", [&] {
-    if (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::Float) {
-      checkCuda90Bug(static_cast<int>(m), static_cast<int>(n), static_cast<int>(k));
-    }
     scalar_t alpha_val = alpha.to<scalar_t>();
     scalar_t beta_val = beta.to<scalar_t>();
     scalar_t* mat1_ptr = mat1_.data_ptr<scalar_t>();
diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp
index 87e08b7cf5e..3ae595ebb7b 100644
--- a/aten/src/ATen/native/cudnn/RNN.cpp
+++ b/aten/src/ATen/native/cudnn/RNN.cpp
@@ -578,9 +578,6 @@ namespace {
   }
 
   cudnnRNNAlgo_t get_algo(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors, const Tensor input){
-#if CUDNN_VERSION < 7200 || CUDA_VERSION < 9010
-      return CUDNN_RNN_ALGO_STANDARD;
-#else
       cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
       const int64_t bsize = tensors.mini_batch;
       //excluding Turing from using persistent rnn.
@@ -599,17 +596,12 @@ namespace {
           }
       }
       return CUDNN_RNN_ALGO_STANDARD;
-#endif
   }
 
   cudnnDataType_t promote_rnn_math_type(cudnnDataType_t dtype) {
-#if CUDNN_VERSION != 7103
-// CUDNN 7.1.3 enforces RNN descriptor type to be identical to input/weight. This check throws an error for type
-// promotion. The check has since been removed.
     if (dtype == CUDNN_DATA_HALF) {
       return CUDNN_DATA_FLOAT;
     }
-#endif
     return dtype;
   }
 
diff --git a/caffe2/core/common_gpu.cc b/caffe2/core/common_gpu.cc
index 2999faf5ebc..b0f4cec7223 100644
--- a/caffe2/core/common_gpu.cc
+++ b/caffe2/core/common_gpu.cc
@@ -215,15 +215,10 @@ bool GetCudaPeerAccessPattern(vector<vector<bool> >* pattern) {
 }
 
 bool TensorCoreAvailable() {
-  // requires CUDA 9.0 and above
-#if CUDA_VERSION < 9000
-  return false;
-#else
   int device = CaffeCudaGetDevice();
   auto& prop = GetDeviceProperty(device);
 
   return prop.major >= 7;
-#endif
 }
 
 const char* cublasGetErrorString(cublasStatus_t error) {
@@ -238,23 +233,18 @@ const char* cublasGetErrorString(cublasStatus_t error) {
     return "CUBLAS_STATUS_INVALID_VALUE";
   case CUBLAS_STATUS_ARCH_MISMATCH:
     return "CUBLAS_STATUS_ARCH_MISMATCH";
+  case CUBLAS_STATUS_INTERNAL_ERROR:
+    return "CUBLAS_STATUS_INTERNAL_ERROR";
 #ifndef __HIP_PLATFORM_HCC__
   case CUBLAS_STATUS_MAPPING_ERROR:
     return "CUBLAS_STATUS_MAPPING_ERROR";
   case CUBLAS_STATUS_EXECUTION_FAILED:
     return "CUBLAS_STATUS_EXECUTION_FAILED";
-#endif
-  case CUBLAS_STATUS_INTERNAL_ERROR:
-    return "CUBLAS_STATUS_INTERNAL_ERROR";
-#if CUDA_VERSION >= 6000
   case CUBLAS_STATUS_NOT_SUPPORTED:
     return "CUBLAS_STATUS_NOT_SUPPORTED";
-#if CUDA_VERSION >= 6050
   case CUBLAS_STATUS_LICENSE_ERROR:
     return "CUBLAS_STATUS_LICENSE_ERROR";
-#endif  // CUDA_VERSION >= 6050
-#endif  // CUDA_VERSION >= 6000
-#ifdef __HIP_PLATFORM_HCC__
+#else
   case rocblas_status_invalid_size:
     return "rocblas_status_invalid_size";
   case rocblas_status_perf_degraded:
diff --git a/caffe2/core/common_gpu.h b/caffe2/core/common_gpu.h
index 4b9b09dde4a..5a070f96081 100644
--- a/caffe2/core/common_gpu.h
+++ b/caffe2/core/common_gpu.h
@@ -5,18 +5,14 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 
-// Disable strict aliasing errors for CUDA 9.
-// The cuda_fp16.h header in CUDA 9 RC triggers this diagnostic.
-// It is included by cusparse.h as well, so guarding the
-// inclusion of that header here is not enough.
-#if CUDA_VERSION >= 9000
+#ifndef __HIP_PLATFORM_HCC__
 #ifdef __GNUC__
 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
 #pragma GCC diagnostic push
 #endif
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #endif // __GNUC__
-#endif // CUDA_VERSION >= 9000
+#endif // __HIP_PLATFORM_HCC__
 
 #include <cublas_v2.h>
 #include <curand.h>
@@ -42,9 +38,7 @@
 // CAFFE_HAS_CUDA_FP16 manually.
 
 #ifndef CAFFE_HAS_CUDA_FP16
-#if CUDA_VERSION >= 7050 || defined(__HIP_PLATFORM_HCC__)
 #define CAFFE_HAS_CUDA_FP16
-#endif // CUDA_VERSION >= 7050
 #endif // CAFFE_HAS_CUDA_FP16
 
 #ifdef CAFFE_HAS_CUDA_FP16
@@ -59,13 +53,13 @@ constexpr int kFp16CUDADevicePropMajor = 3;
 #endif
 
 // Re-enable strict aliasing diagnostic if it was disabled.
-#if CUDA_VERSION >= 9000
+#ifndef __HIP_PLATFORM_HCC__
 #ifdef __GNUC__
 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
 #pragma GCC diagnostic pop
 #endif
 #endif // __GNUC__
-#endif // CUDA_VERSION >= 9000
+#endif // __HIP_PLATFORM_HCC__
 
 /**
  * The maximum number of peers that each gpu can have when doing p2p setup.
@@ -78,12 +72,12 @@ constexpr int kFp16CUDADevicePropMajor = 3;
 
 namespace caffe2 {
 
-#if CUDA_VERSION >= 9000
+#ifndef __HIP_PLATFORM_HCC__
 /**
  * Empty class to identify TensorCore-based math
  */
 class TensorCoreEngine {};
-#endif
+#endif // __HIP_PLATFORM_HCC__
 
 #if CUDA_VERSION >= 10000
 #define CAFFE2_CUDA_PTRATTR_MEMTYPE type
diff --git a/caffe2/operators/batch_matmul_op.cu b/caffe2/operators/batch_matmul_op.cu
index b0ce9a31a16..7bcaac97a15 100644
--- a/caffe2/operators/batch_matmul_op.cu
+++ b/caffe2/operators/batch_matmul_op.cu
@@ -11,7 +11,8 @@ bool BatchMatMulOp<CUDAContext, DefaultEngine>::RunOnDevice() {
 
 REGISTER_CUDA_OPERATOR(BatchMatMul, BatchMatMulOp<CUDAContext>);
 
-#if CUDA_VERSION >= 9000
+
+#ifndef __HIP_PLATFORM_HCC__
 
 template <>
 bool BatchMatMulOp<CUDAContext, TensorCoreEngine>::RunOnDevice() {
diff --git a/caffe2/operators/fully_connected_op_gpu.cc b/caffe2/operators/fully_connected_op_gpu.cc
index 4762692ee28..096e302737a 100644
--- a/caffe2/operators/fully_connected_op_gpu.cc
+++ b/caffe2/operators/fully_connected_op_gpu.cc
@@ -138,7 +138,8 @@ bool FullyConnectedGradientOp<
   return RunFullyConnectedGradientOpOnCUDADevice(float16_compute_, this);
 }
 
-#if CUDA_VERSION >= 9000
+
+#ifndef __HIP_PLATFORM_HCC__
 
 // Require these to be defined otherwise TensorCore FC ops will end
 // up calling the default FC implementation which doesn't have
@@ -190,7 +191,8 @@ REGISTER_CUDA_OPERATOR(
         DefaultEngine,
         false /* don't transpose weight */>);
 
-#if CUDA_VERSION >= 9000
+#ifndef __HIP_PLATFORM_HCC__
+
 REGISTER_CUDA_OPERATOR_WITH_ENGINE(
     FC,
     TENSORCORE,
@@ -214,6 +216,7 @@ REGISTER_CUDA_OPERATOR_WITH_ENGINE(
         CUDAContext,
         TensorCoreEngine,
         false /* don't transpose weight */>);
+
 #endif
 
 } // namespace caffe2
diff --git a/caffe2/operators/top_k_heap_selection.cuh b/caffe2/operators/top_k_heap_selection.cuh
index 921266c0c9e..e6bb1226d58 100644
--- a/caffe2/operators/top_k_heap_selection.cuh
+++ b/caffe2/operators/top_k_heap_selection.cuh
@@ -127,11 +127,7 @@ warpHeap(K k, V v, K& keyHeapHead, K* keyHeap, V* valueHeap) {
   int index = __popcll(getLaneMaskLt() & vote);
   int total = __popcll(vote);
 #else
-#if CUDA_VERSION >= 9000
   unsigned int vote = __ballot_sync(__activemask(), wantInsert);
-#else
-  unsigned int vote = __ballot(wantInsert);
-#endif
 
   if (!vote) {
     // Everything the warp has is smaller than our heap
diff --git a/caffe2/operators/top_k_radix_selection.cuh b/caffe2/operators/top_k_radix_selection.cuh
index adc9ff141c9..948d8577bee 100644
--- a/caffe2/operators/top_k_radix_selection.cuh
+++ b/caffe2/operators/top_k_radix_selection.cuh
@@ -171,11 +171,7 @@ __device__ void countRadixUsingMask(CountType counts[RadixSize],
 #if defined(__HIP_PLATFORM_HCC__)
       counts[j] += __popcll(__ballot(vote));
 #else
-#if CUDA_VERSION >= 9000
       counts[j] += __popc(__ballot_sync(__activemask(), vote));
-#else
-      counts[j] += __popc(__ballot(vote));
-#endif
 #endif  // __HIP_PLATFORM_HCC__
     }
   }
diff --git a/caffe2/utils/GpuScanUtils.cuh b/caffe2/utils/GpuScanUtils.cuh
index 24ae38c0765..af257765197 100644
--- a/caffe2/utils/GpuScanUtils.cuh
+++ b/caffe2/utils/GpuScanUtils.cuh
@@ -68,12 +68,7 @@ __device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFuncti
   T index = __popcll(getLaneMaskLe() & vote);
   T carry = __popcll(vote);
 #else
-#if CUDA_VERSION >= 9000
   T vote = __ballot_sync(__activemask(), in);
-#else
-  T vote = __ballot(in);
-#endif  // CUDA_VERSION
-
   T index = __popc(getLaneMaskLe() & vote);
   T carry = __popc(vote);
 #endif  // __HIP_PLATFORM_HCC__
diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu
index 9d148046a19..bf67054349c 100644
--- a/caffe2/utils/math_gpu.cu
+++ b/caffe2/utils/math_gpu.cu
@@ -865,24 +865,6 @@ CAFFE2_CUDA_EXPORT void GemmBatched<at::Half, CUDAContext>(
   const cublasOperation_t cu_trans_B =
       (trans_B == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
   if (math_type == TensorProto_DataType_FLOAT) {
-#if CUDA_VERSION < 9010
-    // loop over matrices in the batch
-    for (int i = 0; i < batch_size; ++i) {
-      Gemm<at::Half, CUDAContext>(
-          trans_A,
-          trans_B,
-          M,
-          N,
-          K,
-          alpha,
-          A[i],
-          B[i],
-          beta,
-          C[i],
-          context,
-          math_type);
-    }
-#else
     thrust::device_vector<const void*> A_device(A, A + batch_size);
     thrust::device_vector<const void*> B_device(B, B + batch_size);
     thrust::device_vector<void*> C_device(C, C + batch_size);
@@ -909,7 +891,6 @@ CAFFE2_CUDA_EXPORT void GemmBatched<at::Half, CUDAContext>(
         batch_size,
         CUDA_R_32F,
         CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-#endif
   } else if (math_type == TensorProto_DataType_FLOAT16) {
     // Convert alpha, beta from float -> __half
     const __half alpha_fp16 = at::Half(alpha);
@@ -989,16 +970,6 @@ CAFFE2_CUDA_EXPORT void GemmStridedBatched<at::Half, CUDAContext>(
   const cublasOperation_t cu_trans_B =
       (trans_B == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
   if (math_type == TensorProto_DataType_FLOAT) {
-#if CUDA_VERSION < 9010 && !defined(__HIP_PLATFORM_HCC__)
-    // loop over matrices in the batch
-    for (int i = 0; i < batch_size; ++i) {
-      Gemm<at::Half, CUDAContext>(
-          trans_A, trans_B, M, N, K, alpha, A, B, beta, C, context, math_type);
-      A += A_stride;
-      B += B_stride;
-      C += C_stride;
-    }
-#else
     CUBLAS_ENFORCE(cublasSetPointerMode(
         context->cublas_handle(), CUBLAS_POINTER_MODE_HOST));
 #ifdef __HIP_PLATFORM_HCC__
@@ -1060,7 +1031,6 @@ CAFFE2_CUDA_EXPORT void GemmStridedBatched<at::Half, CUDAContext>(
         CUDA_R_32F,
         CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 #endif // __HIP_PLATFORM_HCC__
-#endif
   } else if (math_type == TensorProto_DataType_FLOAT16) {
     // Convert alpha, beta from float -> __half
     const __half alpha_fp16 = at::Half(alpha);
@@ -1223,7 +1193,7 @@ CAFFE2_CUDA_EXPORT void Gemv<at::Half, CUDAContext>(
   }
 }
 
-#if CUDA_VERSION >= 9000
+#ifndef __HIP_PLATFORM_HCC__
 
 // No change, but required. Defer to default CUDA engine
 template <>
@@ -1473,7 +1443,7 @@ CAFFE2_CUDA_EXPORT void Gemv<at::Half, CUDAContext, TensorCoreEngine>(
       trans_A, M, N, alpha, A, x, beta, y, context, math_type);
 }
 
-#endif // CUDA_VERSION >= 9000
+#endif
 
 template <>
 CAFFE2_CUDA_EXPORT void GemmEx<float, CUDAContext>(
@@ -1692,7 +1662,7 @@ CAFFE2_CUDA_EXPORT void Dot<at::Half, CUDAContext>(
     CUDAContext* context) {
 #if defined __HIP_PLATFORM_HCC__ && HIP_VERSION < 210
   CAFFE_THROW("HIP currently does not support FP16 completely yet.");
-#elif defined __HIP_PLATFORM_HCC__ && HIP_VERSION >= 210  
+#elif defined __HIP_PLATFORM_HCC__ && HIP_VERSION >= 210
   CUBLAS_ENFORCE(cublasSetPointerMode(
       context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE));
   CUBLAS_ENFORCE(rocblas_hdot(