RIP CUDA <9.2: circleci, aten, and caffe2 (#36846)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/36846 Test Plan: Imported from OSS Differential Revision: D21620850 Pulled By: ngimel fbshipit-source-id: 7ad1676a12f86250f301095ffc6f365a3b370f34
2026-05-14 20:57:59 +00:00 · 2020-05-18 13:36:16 -07:00 · 2020-05-18 13:36:16 -07:00 · 5e2d8745c8
commit 5e2d8745c8
parent b29e7f9b9d
12 changed files with 20 additions and 118 deletions
--- a/.circleci/docker/build.sh
+++ b/.circleci/docker/build.sh
@ -76,14 +76,6 @@ case "$image" in
    DB=yes
    VISION=yes
    ;;
-  pytorch-linux-xenial-cuda9-cudnn7-py3)
-    CUDA_VERSION=9.0
-    CUDNN_VERSION=7
-    ANACONDA_PYTHON_VERSION=3.6
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    ;;
  pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4)
    CUDA_VERSION=9.2
    CUDNN_VERSION=7
--- a/.circleci/docker/common/install_conda.sh
+++ b/.circleci/docker/common/install_conda.sh
@ -65,11 +65,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
  # DO NOT install cmake here as it would install a version newer than 3.5, but
  # we want to pin to version 3.5.
  conda_install numpy pyyaml mkl mkl-include setuptools cffi typing future six
-  if [[ "$CUDA_VERSION" == 9.0* ]]; then
-    conda_install magma-cuda90 -c pytorch
-  elif [[ "$CUDA_VERSION" == 9.1* ]]; then
-    conda_install magma-cuda91 -c pytorch
-  elif [[ "$CUDA_VERSION" == 9.2* ]]; then
+  if [[ "$CUDA_VERSION" == 9.2* ]]; then
    conda_install magma-cuda92 -c pytorch
  elif [[ "$CUDA_VERSION" == 10.0* ]]; then
    conda_install magma-cuda100 -c pytorch
--- a/aten/src/ATen/native/cuda/LinearAlgebra.cu
+++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu
@ -72,26 +72,6 @@ Tensor prepare_matrix_for_cublas(Tensor& tensor, bool& transpose_tensor) {
  return tensor_;
 }

-// Check https://github.com/pytorch/pytorch/issues/22078
-// for information about the bug. We don't know the exact conditions that trigger it,
-// but using Sgemm or Hgemm on Maxwell or Pascal seems to be a
-// necessary condition.
-static void checkCuda90Bug(int i_m, int i_n, int i_k)
-{
-#if CUDA_VERSION < 9200 && CUDA_VERSION >= 9000
-  static std::once_flag alreadyWarned;
-  const int LIMIT = 1 << 21;
-  if (i_m > LIMIT || i_n > LIMIT || i_k > LIMIT) {
-    cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
-    if (prop->major == 5 || prop->major == 6) {
-      std::call_once(alreadyWarned, []() {
-        TORCH_WARN("Matrix multiplication for dimensions larger than 2^21 has known bugs on your combination of CUDA version and device type. Please consider upgrading to CUDA 9.2 or later.");
-      });
-    }
-  }
-#endif
-}
-
 Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) {
  TORCH_CHECK(
    (mat1.dim() == 2) && (mat2.dim() == 2) &&
@ -143,9 +123,6 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
  at::ScalarType scalar_type = self.scalar_type();

  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "addmm_cuda", [&] {
-    if (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::Float) {
-      checkCuda90Bug(static_cast<int>(m), static_cast<int>(n), static_cast<int>(k));
-    }
    scalar_t alpha_val = alpha.to<scalar_t>();
    scalar_t beta_val = beta.to<scalar_t>();
    scalar_t* mat1_ptr = mat1_.data_ptr<scalar_t>();
--- a/aten/src/ATen/native/cudnn/RNN.cpp
+++ b/aten/src/ATen/native/cudnn/RNN.cpp
@ -578,9 +578,6 @@ namespace {
  }

  cudnnRNNAlgo_t get_algo(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors, const Tensor input){
-#if CUDNN_VERSION < 7200 || CUDA_VERSION < 9010
-      return CUDNN_RNN_ALGO_STANDARD;
-#else
      cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
      const int64_t bsize = tensors.mini_batch;
      //excluding Turing from using persistent rnn.
@ -599,17 +596,12 @@ namespace {
          }
      }
      return CUDNN_RNN_ALGO_STANDARD;
-#endif
  }

  cudnnDataType_t promote_rnn_math_type(cudnnDataType_t dtype) {
-#if CUDNN_VERSION != 7103
-// CUDNN 7.1.3 enforces RNN descriptor type to be identical to input/weight. This check throws an error for type
-// promotion. The check has since been removed.
    if (dtype == CUDNN_DATA_HALF) {
      return CUDNN_DATA_FLOAT;
    }
-#endif
    return dtype;
  }

--- a/caffe2/core/common_gpu.cc
+++ b/caffe2/core/common_gpu.cc
@ -215,15 +215,10 @@ bool GetCudaPeerAccessPattern(vector<vector<bool> >* pattern) {
 }

 bool TensorCoreAvailable() {
-  // requires CUDA 9.0 and above
-#if CUDA_VERSION < 9000
-  return false;
-#else
  int device = CaffeCudaGetDevice();
  auto& prop = GetDeviceProperty(device);

  return prop.major >= 7;
-#endif
 }

 const char* cublasGetErrorString(cublasStatus_t error) {
@ -238,23 +233,18 @@ const char* cublasGetErrorString(cublasStatus_t error) {
    return "CUBLAS_STATUS_INVALID_VALUE";
  case CUBLAS_STATUS_ARCH_MISMATCH:
    return "CUBLAS_STATUS_ARCH_MISMATCH";
+  case CUBLAS_STATUS_INTERNAL_ERROR:
+    return "CUBLAS_STATUS_INTERNAL_ERROR";
 #ifndef __HIP_PLATFORM_HCC__
  case CUBLAS_STATUS_MAPPING_ERROR:
    return "CUBLAS_STATUS_MAPPING_ERROR";
  case CUBLAS_STATUS_EXECUTION_FAILED:
    return "CUBLAS_STATUS_EXECUTION_FAILED";
-#endif
-  case CUBLAS_STATUS_INTERNAL_ERROR:
-    return "CUBLAS_STATUS_INTERNAL_ERROR";
-#if CUDA_VERSION >= 6000
  case CUBLAS_STATUS_NOT_SUPPORTED:
    return "CUBLAS_STATUS_NOT_SUPPORTED";
-#if CUDA_VERSION >= 6050
  case CUBLAS_STATUS_LICENSE_ERROR:
    return "CUBLAS_STATUS_LICENSE_ERROR";
-#endif  // CUDA_VERSION >= 6050
-#endif  // CUDA_VERSION >= 6000
-#ifdef __HIP_PLATFORM_HCC__
+#else
  case rocblas_status_invalid_size:
    return "rocblas_status_invalid_size";
  case rocblas_status_perf_degraded:
--- a/caffe2/core/common_gpu.h
+++ b/caffe2/core/common_gpu.h
@ -5,18 +5,14 @@
 #include <cuda.h>
 #include <cuda_runtime.h>

-// Disable strict aliasing errors for CUDA 9.
-// The cuda_fp16.h header in CUDA 9 RC triggers this diagnostic.
-// It is included by cusparse.h as well, so guarding the
-// inclusion of that header here is not enough.
-#if CUDA_VERSION >= 9000
+#ifndef __HIP_PLATFORM_HCC__
 #ifdef __GNUC__
 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
 #pragma GCC diagnostic push
 #endif
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #endif // __GNUC__
-#endif // CUDA_VERSION >= 9000
+#endif // __HIP_PLATFORM_HCC__

 #include <cublas_v2.h>
 #include <curand.h>
@ -42,9 +38,7 @@
 // CAFFE_HAS_CUDA_FP16 manually.

 #ifndef CAFFE_HAS_CUDA_FP16
-#if CUDA_VERSION >= 7050 || defined(__HIP_PLATFORM_HCC__)
 #define CAFFE_HAS_CUDA_FP16
-#endif // CUDA_VERSION >= 7050
 #endif // CAFFE_HAS_CUDA_FP16

 #ifdef CAFFE_HAS_CUDA_FP16
@ -59,13 +53,13 @@ constexpr int kFp16CUDADevicePropMajor = 3;
 #endif

 // Re-enable strict aliasing diagnostic if it was disabled.
-#if CUDA_VERSION >= 9000
+#ifndef __HIP_PLATFORM_HCC__
 #ifdef __GNUC__
 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
 #pragma GCC diagnostic pop
 #endif
 #endif // __GNUC__
-#endif // CUDA_VERSION >= 9000
+#endif // __HIP_PLATFORM_HCC__

 /**
 * The maximum number of peers that each gpu can have when doing p2p setup.
@ -78,12 +72,12 @@ constexpr int kFp16CUDADevicePropMajor = 3;

 namespace caffe2 {

-#if CUDA_VERSION >= 9000
+#ifndef __HIP_PLATFORM_HCC__
 /**
 * Empty class to identify TensorCore-based math
 */
 class TensorCoreEngine {};
-#endif
+#endif // __HIP_PLATFORM_HCC__

 #if CUDA_VERSION >= 10000
 #define CAFFE2_CUDA_PTRATTR_MEMTYPE type
--- a/caffe2/operators/batch_matmul_op.cu
+++ b/caffe2/operators/batch_matmul_op.cu
@ -11,7 +11,8 @@ bool BatchMatMulOp<CUDAContext, DefaultEngine>::RunOnDevice() {

 REGISTER_CUDA_OPERATOR(BatchMatMul, BatchMatMulOp<CUDAContext>);

-#if CUDA_VERSION >= 9000
+
+#ifndef __HIP_PLATFORM_HCC__

 template <>
 bool BatchMatMulOp<CUDAContext, TensorCoreEngine>::RunOnDevice() {
--- a/caffe2/operators/fully_connected_op_gpu.cc
+++ b/caffe2/operators/fully_connected_op_gpu.cc
@ -138,7 +138,8 @@ bool FullyConnectedGradientOp<
  return RunFullyConnectedGradientOpOnCUDADevice(float16_compute_, this);
 }

-#if CUDA_VERSION >= 9000
+
+#ifndef __HIP_PLATFORM_HCC__

 // Require these to be defined otherwise TensorCore FC ops will end
 // up calling the default FC implementation which doesn't have
@ -190,7 +191,8 @@ REGISTER_CUDA_OPERATOR(
        DefaultEngine,
        false /* don't transpose weight */>);

-#if CUDA_VERSION >= 9000
+#ifndef __HIP_PLATFORM_HCC__
+
 REGISTER_CUDA_OPERATOR_WITH_ENGINE(
    FC,
    TENSORCORE,
@ -214,6 +216,7 @@ REGISTER_CUDA_OPERATOR_WITH_ENGINE(
        CUDAContext,
        TensorCoreEngine,
        false /* don't transpose weight */>);
+
 #endif

 } // namespace caffe2
--- a/caffe2/operators/top_k_heap_selection.cuh
+++ b/caffe2/operators/top_k_heap_selection.cuh
@ -127,11 +127,7 @@ warpHeap(K k, V v, K& keyHeapHead, K* keyHeap, V* valueHeap) {
  int index = __popcll(getLaneMaskLt() & vote);
  int total = __popcll(vote);
 #else
-#if CUDA_VERSION >= 9000
  unsigned int vote = __ballot_sync(__activemask(), wantInsert);
-#else
-  unsigned int vote = __ballot(wantInsert);
-#endif

  if (!vote) {
    // Everything the warp has is smaller than our heap
--- a/caffe2/operators/top_k_radix_selection.cuh
+++ b/caffe2/operators/top_k_radix_selection.cuh
@ -171,11 +171,7 @@ __device__ void countRadixUsingMask(CountType counts[RadixSize],
 #if defined(__HIP_PLATFORM_HCC__)
      counts[j] += __popcll(__ballot(vote));
 #else
-#if CUDA_VERSION >= 9000
      counts[j] += __popc(__ballot_sync(__activemask(), vote));
-#else
-      counts[j] += __popc(__ballot(vote));
-#endif
 #endif  // __HIP_PLATFORM_HCC__
    }
  }
--- a/caffe2/utils/GpuScanUtils.cuh
+++ b/caffe2/utils/GpuScanUtils.cuh
@ -68,12 +68,7 @@ __device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFuncti
  T index = __popcll(getLaneMaskLe() & vote);
  T carry = __popcll(vote);
 #else
-#if CUDA_VERSION >= 9000
  T vote = __ballot_sync(__activemask(), in);
-#else
-  T vote = __ballot(in);
-#endif  // CUDA_VERSION
-
  T index = __popc(getLaneMaskLe() & vote);
  T carry = __popc(vote);
 #endif  // __HIP_PLATFORM_HCC__
--- a/caffe2/utils/math_gpu.cu
+++ b/caffe2/utils/math_gpu.cu
@ -865,24 +865,6 @@ CAFFE2_CUDA_EXPORT void GemmBatched<at::Half, CUDAContext>(
  const cublasOperation_t cu_trans_B =
      (trans_B == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  if (math_type == TensorProto_DataType_FLOAT) {
-#if CUDA_VERSION < 9010
-    // loop over matrices in the batch
-    for (int i = 0; i < batch_size; ++i) {
-      Gemm<at::Half, CUDAContext>(
-          trans_A,
-          trans_B,
-          M,
-          N,
-          K,
-          alpha,
-          A[i],
-          B[i],
-          beta,
-          C[i],
-          context,
-          math_type);
-    }
-#else
    thrust::device_vector<const void*> A_device(A, A + batch_size);
    thrust::device_vector<const void*> B_device(B, B + batch_size);
    thrust::device_vector<void*> C_device(C, C + batch_size);
@ -909,7 +891,6 @@ CAFFE2_CUDA_EXPORT void GemmBatched<at::Half, CUDAContext>(
        batch_size,
        CUDA_R_32F,
        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-#endif
  } else if (math_type == TensorProto_DataType_FLOAT16) {
    // Convert alpha, beta from float -> __half
    const __half alpha_fp16 = at::Half(alpha);
@ -989,16 +970,6 @@ CAFFE2_CUDA_EXPORT void GemmStridedBatched<at::Half, CUDAContext>(
  const cublasOperation_t cu_trans_B =
      (trans_B == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  if (math_type == TensorProto_DataType_FLOAT) {
-#if CUDA_VERSION < 9010 && !defined(__HIP_PLATFORM_HCC__)
-    // loop over matrices in the batch
-    for (int i = 0; i < batch_size; ++i) {
-      Gemm<at::Half, CUDAContext>(
-          trans_A, trans_B, M, N, K, alpha, A, B, beta, C, context, math_type);
-      A += A_stride;
-      B += B_stride;
-      C += C_stride;
-    }
-#else
    CUBLAS_ENFORCE(cublasSetPointerMode(
        context->cublas_handle(), CUBLAS_POINTER_MODE_HOST));
 #ifdef __HIP_PLATFORM_HCC__
@ -1060,7 +1031,6 @@ CAFFE2_CUDA_EXPORT void GemmStridedBatched<at::Half, CUDAContext>(
        CUDA_R_32F,
        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 #endif // __HIP_PLATFORM_HCC__
-#endif
  } else if (math_type == TensorProto_DataType_FLOAT16) {
    // Convert alpha, beta from float -> __half
    const __half alpha_fp16 = at::Half(alpha);
@ -1223,7 +1193,7 @@ CAFFE2_CUDA_EXPORT void Gemv<at::Half, CUDAContext>(
  }
 }

-#if CUDA_VERSION >= 9000
+#ifndef __HIP_PLATFORM_HCC__

 // No change, but required. Defer to default CUDA engine
 template <>
@ -1473,7 +1443,7 @@ CAFFE2_CUDA_EXPORT void Gemv<at::Half, CUDAContext, TensorCoreEngine>(
      trans_A, M, N, alpha, A, x, beta, y, context, math_type);
 }

-#endif // CUDA_VERSION >= 9000
+#endif

 template <>
 CAFFE2_CUDA_EXPORT void GemmEx<float, CUDAContext>(
@ -1692,7 +1662,7 @@ CAFFE2_CUDA_EXPORT void Dot<at::Half, CUDAContext>(
    CUDAContext* context) {
 #if defined __HIP_PLATFORM_HCC__ && HIP_VERSION < 210
  CAFFE_THROW("HIP currently does not support FP16 completely yet.");
-#elif defined __HIP_PLATFORM_HCC__ && HIP_VERSION >= 210  
+#elif defined __HIP_PLATFORM_HCC__ && HIP_VERSION >= 210
  CUBLAS_ENFORCE(cublasSetPointerMode(
      context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE));
  CUBLAS_ENFORCE(rocblas_hdot(