diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh index 6dc249f8937..58dc91b9c9e 100755 --- a/.circleci/docker/build.sh +++ b/.circleci/docker/build.sh @@ -76,14 +76,6 @@ case "$image" in DB=yes VISION=yes ;; - pytorch-linux-xenial-cuda9-cudnn7-py3) - CUDA_VERSION=9.0 - CUDNN_VERSION=7 - ANACONDA_PYTHON_VERSION=3.6 - PROTOBUF=yes - DB=yes - VISION=yes - ;; pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4) CUDA_VERSION=9.2 CUDNN_VERSION=7 diff --git a/.circleci/docker/common/install_conda.sh b/.circleci/docker/common/install_conda.sh index fa90919279e..80c85af5419 100755 --- a/.circleci/docker/common/install_conda.sh +++ b/.circleci/docker/common/install_conda.sh @@ -65,11 +65,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then # DO NOT install cmake here as it would install a version newer than 3.5, but # we want to pin to version 3.5. conda_install numpy pyyaml mkl mkl-include setuptools cffi typing future six - if [[ "$CUDA_VERSION" == 9.0* ]]; then - conda_install magma-cuda90 -c pytorch - elif [[ "$CUDA_VERSION" == 9.1* ]]; then - conda_install magma-cuda91 -c pytorch - elif [[ "$CUDA_VERSION" == 9.2* ]]; then + if [[ "$CUDA_VERSION" == 9.2* ]]; then conda_install magma-cuda92 -c pytorch elif [[ "$CUDA_VERSION" == 10.0* ]]; then conda_install magma-cuda100 -c pytorch diff --git a/aten/src/ATen/native/cuda/LinearAlgebra.cu b/aten/src/ATen/native/cuda/LinearAlgebra.cu index 11355531f7b..24deb22b460 100644 --- a/aten/src/ATen/native/cuda/LinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu @@ -72,26 +72,6 @@ Tensor prepare_matrix_for_cublas(Tensor& tensor, bool& transpose_tensor) { return tensor_; } -// Check https://github.com/pytorch/pytorch/issues/22078 -// for information about the bug. We don't know the exact conditions that trigger it, -// but using Sgemm or Hgemm on Maxwell or Pascal seems to be a -// necessary condition. -static void checkCuda90Bug(int i_m, int i_n, int i_k) -{ -#if CUDA_VERSION < 9200 && CUDA_VERSION >= 9000 - static std::once_flag alreadyWarned; - const int LIMIT = 1 << 21; - if (i_m > LIMIT || i_n > LIMIT || i_k > LIMIT) { - cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); - if (prop->major == 5 || prop->major == 6) { - std::call_once(alreadyWarned, []() { - TORCH_WARN("Matrix multiplication for dimensions larger than 2^21 has known bugs on your combination of CUDA version and device type. Please consider upgrading to CUDA 9.2 or later."); - }); - } - } -#endif -} - Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { TORCH_CHECK( (mat1.dim() == 2) && (mat2.dim() == 2) && @@ -143,9 +123,6 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma at::ScalarType scalar_type = self.scalar_type(); AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "addmm_cuda", [&] { - if (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::Float) { - checkCuda90Bug(static_cast(m), static_cast(n), static_cast(k)); - } scalar_t alpha_val = alpha.to(); scalar_t beta_val = beta.to(); scalar_t* mat1_ptr = mat1_.data_ptr(); diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 87e08b7cf5e..3ae595ebb7b 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -578,9 +578,6 @@ namespace { } cudnnRNNAlgo_t get_algo(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors, const Tensor input){ -#if CUDNN_VERSION < 7200 || CUDA_VERSION < 9010 - return CUDNN_RNN_ALGO_STANDARD; -#else cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); const int64_t bsize = tensors.mini_batch; //excluding Turing from using persistent rnn. @@ -599,17 +596,12 @@ namespace { } } return CUDNN_RNN_ALGO_STANDARD; -#endif } cudnnDataType_t promote_rnn_math_type(cudnnDataType_t dtype) { -#if CUDNN_VERSION != 7103 -// CUDNN 7.1.3 enforces RNN descriptor type to be identical to input/weight. This check throws an error for type -// promotion. The check has since been removed. if (dtype == CUDNN_DATA_HALF) { return CUDNN_DATA_FLOAT; } -#endif return dtype; } diff --git a/caffe2/core/common_gpu.cc b/caffe2/core/common_gpu.cc index 2999faf5ebc..b0f4cec7223 100644 --- a/caffe2/core/common_gpu.cc +++ b/caffe2/core/common_gpu.cc @@ -215,15 +215,10 @@ bool GetCudaPeerAccessPattern(vector >* pattern) { } bool TensorCoreAvailable() { - // requires CUDA 9.0 and above -#if CUDA_VERSION < 9000 - return false; -#else int device = CaffeCudaGetDevice(); auto& prop = GetDeviceProperty(device); return prop.major >= 7; -#endif } const char* cublasGetErrorString(cublasStatus_t error) { @@ -238,23 +233,18 @@ const char* cublasGetErrorString(cublasStatus_t error) { return "CUBLAS_STATUS_INVALID_VALUE"; case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; #ifndef __HIP_PLATFORM_HCC__ case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; -#endif - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; -#if CUDA_VERSION >= 6000 case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; -#if CUDA_VERSION >= 6050 case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR"; -#endif // CUDA_VERSION >= 6050 -#endif // CUDA_VERSION >= 6000 -#ifdef __HIP_PLATFORM_HCC__ +#else case rocblas_status_invalid_size: return "rocblas_status_invalid_size"; case rocblas_status_perf_degraded: diff --git a/caffe2/core/common_gpu.h b/caffe2/core/common_gpu.h index 4b9b09dde4a..5a070f96081 100644 --- a/caffe2/core/common_gpu.h +++ b/caffe2/core/common_gpu.h @@ -5,18 +5,14 @@ #include #include -// Disable strict aliasing errors for CUDA 9. -// The cuda_fp16.h header in CUDA 9 RC triggers this diagnostic. -// It is included by cusparse.h as well, so guarding the -// inclusion of that header here is not enough. -#if CUDA_VERSION >= 9000 +#ifndef __HIP_PLATFORM_HCC__ #ifdef __GNUC__ #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) #pragma GCC diagnostic push #endif #pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif // __GNUC__ -#endif // CUDA_VERSION >= 9000 +#endif // __HIP_PLATFORM_HCC__ #include #include @@ -42,9 +38,7 @@ // CAFFE_HAS_CUDA_FP16 manually. #ifndef CAFFE_HAS_CUDA_FP16 -#if CUDA_VERSION >= 7050 || defined(__HIP_PLATFORM_HCC__) #define CAFFE_HAS_CUDA_FP16 -#endif // CUDA_VERSION >= 7050 #endif // CAFFE_HAS_CUDA_FP16 #ifdef CAFFE_HAS_CUDA_FP16 @@ -59,13 +53,13 @@ constexpr int kFp16CUDADevicePropMajor = 3; #endif // Re-enable strict aliasing diagnostic if it was disabled. -#if CUDA_VERSION >= 9000 +#ifndef __HIP_PLATFORM_HCC__ #ifdef __GNUC__ #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) #pragma GCC diagnostic pop #endif #endif // __GNUC__ -#endif // CUDA_VERSION >= 9000 +#endif // __HIP_PLATFORM_HCC__ /** * The maximum number of peers that each gpu can have when doing p2p setup. @@ -78,12 +72,12 @@ constexpr int kFp16CUDADevicePropMajor = 3; namespace caffe2 { -#if CUDA_VERSION >= 9000 +#ifndef __HIP_PLATFORM_HCC__ /** * Empty class to identify TensorCore-based math */ class TensorCoreEngine {}; -#endif +#endif // __HIP_PLATFORM_HCC__ #if CUDA_VERSION >= 10000 #define CAFFE2_CUDA_PTRATTR_MEMTYPE type diff --git a/caffe2/operators/batch_matmul_op.cu b/caffe2/operators/batch_matmul_op.cu index b0ce9a31a16..7bcaac97a15 100644 --- a/caffe2/operators/batch_matmul_op.cu +++ b/caffe2/operators/batch_matmul_op.cu @@ -11,7 +11,8 @@ bool BatchMatMulOp::RunOnDevice() { REGISTER_CUDA_OPERATOR(BatchMatMul, BatchMatMulOp); -#if CUDA_VERSION >= 9000 + +#ifndef __HIP_PLATFORM_HCC__ template <> bool BatchMatMulOp::RunOnDevice() { diff --git a/caffe2/operators/fully_connected_op_gpu.cc b/caffe2/operators/fully_connected_op_gpu.cc index 4762692ee28..096e302737a 100644 --- a/caffe2/operators/fully_connected_op_gpu.cc +++ b/caffe2/operators/fully_connected_op_gpu.cc @@ -138,7 +138,8 @@ bool FullyConnectedGradientOp< return RunFullyConnectedGradientOpOnCUDADevice(float16_compute_, this); } -#if CUDA_VERSION >= 9000 + +#ifndef __HIP_PLATFORM_HCC__ // Require these to be defined otherwise TensorCore FC ops will end // up calling the default FC implementation which doesn't have @@ -190,7 +191,8 @@ REGISTER_CUDA_OPERATOR( DefaultEngine, false /* don't transpose weight */>); -#if CUDA_VERSION >= 9000 +#ifndef __HIP_PLATFORM_HCC__ + REGISTER_CUDA_OPERATOR_WITH_ENGINE( FC, TENSORCORE, @@ -214,6 +216,7 @@ REGISTER_CUDA_OPERATOR_WITH_ENGINE( CUDAContext, TensorCoreEngine, false /* don't transpose weight */>); + #endif } // namespace caffe2 diff --git a/caffe2/operators/top_k_heap_selection.cuh b/caffe2/operators/top_k_heap_selection.cuh index 921266c0c9e..e6bb1226d58 100644 --- a/caffe2/operators/top_k_heap_selection.cuh +++ b/caffe2/operators/top_k_heap_selection.cuh @@ -127,11 +127,7 @@ warpHeap(K k, V v, K& keyHeapHead, K* keyHeap, V* valueHeap) { int index = __popcll(getLaneMaskLt() & vote); int total = __popcll(vote); #else -#if CUDA_VERSION >= 9000 unsigned int vote = __ballot_sync(__activemask(), wantInsert); -#else - unsigned int vote = __ballot(wantInsert); -#endif if (!vote) { // Everything the warp has is smaller than our heap diff --git a/caffe2/operators/top_k_radix_selection.cuh b/caffe2/operators/top_k_radix_selection.cuh index adc9ff141c9..948d8577bee 100644 --- a/caffe2/operators/top_k_radix_selection.cuh +++ b/caffe2/operators/top_k_radix_selection.cuh @@ -171,11 +171,7 @@ __device__ void countRadixUsingMask(CountType counts[RadixSize], #if defined(__HIP_PLATFORM_HCC__) counts[j] += __popcll(__ballot(vote)); #else -#if CUDA_VERSION >= 9000 counts[j] += __popc(__ballot_sync(__activemask(), vote)); -#else - counts[j] += __popc(__ballot(vote)); -#endif #endif // __HIP_PLATFORM_HCC__ } } diff --git a/caffe2/utils/GpuScanUtils.cuh b/caffe2/utils/GpuScanUtils.cuh index 24ae38c0765..af257765197 100644 --- a/caffe2/utils/GpuScanUtils.cuh +++ b/caffe2/utils/GpuScanUtils.cuh @@ -68,12 +68,7 @@ __device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFuncti T index = __popcll(getLaneMaskLe() & vote); T carry = __popcll(vote); #else -#if CUDA_VERSION >= 9000 T vote = __ballot_sync(__activemask(), in); -#else - T vote = __ballot(in); -#endif // CUDA_VERSION - T index = __popc(getLaneMaskLe() & vote); T carry = __popc(vote); #endif // __HIP_PLATFORM_HCC__ diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu index 9d148046a19..bf67054349c 100644 --- a/caffe2/utils/math_gpu.cu +++ b/caffe2/utils/math_gpu.cu @@ -865,24 +865,6 @@ CAFFE2_CUDA_EXPORT void GemmBatched( const cublasOperation_t cu_trans_B = (trans_B == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; if (math_type == TensorProto_DataType_FLOAT) { -#if CUDA_VERSION < 9010 - // loop over matrices in the batch - for (int i = 0; i < batch_size; ++i) { - Gemm( - trans_A, - trans_B, - M, - N, - K, - alpha, - A[i], - B[i], - beta, - C[i], - context, - math_type); - } -#else thrust::device_vector A_device(A, A + batch_size); thrust::device_vector B_device(B, B + batch_size); thrust::device_vector C_device(C, C + batch_size); @@ -909,7 +891,6 @@ CAFFE2_CUDA_EXPORT void GemmBatched( batch_size, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); -#endif } else if (math_type == TensorProto_DataType_FLOAT16) { // Convert alpha, beta from float -> __half const __half alpha_fp16 = at::Half(alpha); @@ -989,16 +970,6 @@ CAFFE2_CUDA_EXPORT void GemmStridedBatched( const cublasOperation_t cu_trans_B = (trans_B == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; if (math_type == TensorProto_DataType_FLOAT) { -#if CUDA_VERSION < 9010 && !defined(__HIP_PLATFORM_HCC__) - // loop over matrices in the batch - for (int i = 0; i < batch_size; ++i) { - Gemm( - trans_A, trans_B, M, N, K, alpha, A, B, beta, C, context, math_type); - A += A_stride; - B += B_stride; - C += C_stride; - } -#else CUBLAS_ENFORCE(cublasSetPointerMode( context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); #ifdef __HIP_PLATFORM_HCC__ @@ -1060,7 +1031,6 @@ CAFFE2_CUDA_EXPORT void GemmStridedBatched( CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); #endif // __HIP_PLATFORM_HCC__ -#endif } else if (math_type == TensorProto_DataType_FLOAT16) { // Convert alpha, beta from float -> __half const __half alpha_fp16 = at::Half(alpha); @@ -1223,7 +1193,7 @@ CAFFE2_CUDA_EXPORT void Gemv( } } -#if CUDA_VERSION >= 9000 +#ifndef __HIP_PLATFORM_HCC__ // No change, but required. Defer to default CUDA engine template <> @@ -1473,7 +1443,7 @@ CAFFE2_CUDA_EXPORT void Gemv( trans_A, M, N, alpha, A, x, beta, y, context, math_type); } -#endif // CUDA_VERSION >= 9000 +#endif template <> CAFFE2_CUDA_EXPORT void GemmEx( @@ -1692,7 +1662,7 @@ CAFFE2_CUDA_EXPORT void Dot( CUDAContext* context) { #if defined __HIP_PLATFORM_HCC__ && HIP_VERSION < 210 CAFFE_THROW("HIP currently does not support FP16 completely yet."); -#elif defined __HIP_PLATFORM_HCC__ && HIP_VERSION >= 210 +#elif defined __HIP_PLATFORM_HCC__ && HIP_VERSION >= 210 CUBLAS_ENFORCE(cublasSetPointerMode( context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); CUBLAS_ENFORCE(rocblas_hdot(